In [1]:
# Q1. What is data encoding? How is it useful in data science?

In [2]:
# Data Encoding is the process of converting categorical data into a numerical format that can be used by machine learning algorithms. It is essential because most algorithms require numerical input.

# Types of Data Encoding:
#     Label Encoding: Converts categories into unique integers.
#     One-Hot Encoding: Creates binary columns for each category.
#     Binary Encoding: Converts categories to binary code and creates columns for each bit.
#     Frequency Encoding: Encodes categories based on their frequency in the dataset.
    
# Usefulness in Data Science:
#     Algorithm Compatibility: Makes categorical data usable for algorithms that require numerical input.
#     Model Performance: Improves model accuracy by avoiding incorrect assumptions about category relationships.
#     Feature Engineering: Helps in creating features that capture relationships between categorical data and target variables.
#     Data Integration: Ensures consistency when integrating datasets from different sources.
#     Handling Non-Numeric Data: Transforms non-numeric data into a format suitable for analysis and predictions.

In [3]:
# Q2. What is nominal encoding? Provide an example of how you would use it in a real-world scenario.

In [4]:
# Nominal Encoding transforms categorical data into numerical format for use in machine learning models. Using scikit-learn,
# you can apply one-hot encoding to convert categorical variables into a suitable format.

In [10]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [11]:
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['species'] = data.target 

In [13]:
df['species'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

In [14]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [17]:
encoder = OneHotEncoder(sparse=False, drop='first')

In [18]:
encoded_species = encoder.fit_transform(df[['species']])



In [21]:
encoded_df = pd.DataFrame(encoded_species, columns=encoder.get_feature_names_out(['species']))

In [22]:
final_df = pd.concat([df.drop('species', axis=1), encoded_df], axis=1)

In [25]:
final_df = pd.concat([df.drop('species', axis=1), encoded_df], axis=1)

In [26]:
final_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,0.0,0.0
1,4.9,3.0,1.4,0.2,0.0,0.0
2,4.7,3.2,1.3,0.2,0.0,0.0
3,4.6,3.1,1.5,0.2,0.0,0.0
4,5.0,3.6,1.4,0.2,0.0,0.0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.0,1.0
146,6.3,2.5,5.0,1.9,0.0,1.0
147,6.5,3.0,5.2,2.0,0.0,1.0
148,6.2,3.4,5.4,2.3,0.0,1.0


In [27]:
# Q3. In what situations is nominal encoding preferred over one-hot encoding? Provide a practical example.

In [28]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [29]:
data = {
    'user_id': ['user1', 'user2', 'user3', 'user4', 'user5'],
    'churn': [0, 1, 0, 1, 0] 
}

In [30]:
df = pd.DataFrame(data)

In [32]:
label_encoder = LabelEncoder()

In [33]:
df['user_id_encoded'] = label_encoder.fit_transform(df['user_id'])

In [34]:
df

Unnamed: 0,user_id,churn,user_id_encoded
0,user1,0,0
1,user2,1,1
2,user3,0,2
3,user4,1,3
4,user5,0,4


In [35]:
# Q4. Suppose you have a dataset containing categorical data with 5 unique values. Which encoding
# technique would you use to transform this data into a format suitable for machine learning algorithms?
# Explain why you made this choice.

In [44]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [45]:
data = {'category': ['A', 'B', 'C', 'D', 'E']}
df = pd.DataFrame(data)

In [48]:
encoded_data = encoder.fit_transform(df[['category']])



In [49]:
ncoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['category']))

In [51]:
encoded_df

Unnamed: 0,species_versicolor,species_virginica
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
145,0.0,1.0
146,0.0,1.0
147,0.0,1.0
148,0.0,1.0


In [52]:
from sklearn.preprocessing import LabelEncoder

In [54]:
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

In [55]:
df

Unnamed: 0,category,category_encoded
0,A,0
1,B,1
2,C,2
3,D,3
4,E,4


In [56]:
# Use One-Hot Encoding when categories do not have an inherent order and you are working with algorithms that require 
# numerical inputs without assuming an ordinal relationship.

# Use Label Encoding when dealing with tree-based models or when there is an ordinal relationship among categories.

In [57]:
# Q5. In a machine learning project, you have a dataset with 1000 rows and 5 columns. Two of the columns
# are categorical, and the remaining three columns are numerical. If you were to use nominal encoding to
# transform the categorical data, how many new columns would be created? Show your calculations.

In [58]:
# Dataset Details: 1000 rows and 5 columns (2 categorical, 3 numerical).

# Nominal Encoding (Label Encoding): Converts each categorical value into a unique integer but does not create additional 
# columns.

# New Columns Created: 0 (The number of columns remains unchanged).

# Label encoding replaces each categorical column with a single numerical column, preserving the original number of columns 
# in the dataset.

In [59]:
# Q6. You are working with a dataset containing information about different types of animals, including their
# species, habitat, and diet. Which encoding technique would you use to transform the categorical data into
# a format suitable for machine learning algorithms? Justify your answer.

In [60]:
# For transforming categorical data in a dataset about animals (including features like species, habitat, and diet), 
# one-hot encoding is generally the preferred technique. This method converts each category into a new binary column,
# avoiding any assumptions about the order of categorie

In [61]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [62]:
data = {'species': ['lion', 'tiger', 'bear'],
        'habitat': ['savanna', 'forest', 'forest'],
        'diet': ['carnivore', 'carnivore', 'omnivore']}
df = pd.DataFrame(data)

In [63]:
encoder = OneHotEncoder(sparse=False, drop='first')

In [64]:
encoded_data = encoder.fit_transform(df[['species', 'habitat', 'diet']])



In [65]:
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['species', 'habitat', 'diet']))

In [66]:
encoded_df

Unnamed: 0,species_lion,species_tiger,habitat_savanna,diet_omnivore
0,1.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0


In [67]:
# Q7.You are working on a project that involves predicting customer churn for a telecommunications
# company. You have a dataset with 5 features, including the customer's gender, age, contract type,
# monthly charges, and tenure. Which encoding technique(s) would you use to transform the categorical
# data into numerical data? Provide a step-by-step explanation of how you would implement the encoding.

In [68]:
from sklearn.preprocessing import OneHotEncoder