### Data Encoding-

#### (1)- Nominal/ One Hot Encoding-

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
# OneHotEncoder- input to it should be an array of values taken on by the categorical feature. 
# The categories are encoded using a one-hot(aka 'one-of-k' or 'dummy') encoding scheme. 
# This creates a binary column for each category and returns a sparse matrix or dense array.

In [3]:
# create a simple data frame-
df= pd.DataFrame({"color": ["red","blue","green","green","red","blue"]})
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [4]:
# create an instance or object of OneHotEncoder-
encoder= OneHotEncoder()

#perform fit and transform-
encoder.fit_transform(df[["color"]])

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [6]:
encoded= encoder.fit_transform(df[["color"]]).toarray()
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [11]:
# so, it has encoded based on alphabetical order.

In [9]:
encoded_df= pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [10]:
pd.concat([df, encoded_df], axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [13]:
# for new data entries-
encoder.transform([["blue"]]).toarray()



array([[1., 0., 0.]])

In [14]:
df1= sns.load_dataset("tips")
df1

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [19]:
encoder1= OneHotEncoder()       # creating an object of OneHotEncoder class and initializing it
encoded1= encoder1.fit_transform(df1[["sex", "smoker"]]).toarray()

In [20]:
encoded1_df= pd.DataFrame(encoded1, columns= encoder1.get_feature_names_out())
encoded1_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes
0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0
3,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,1.0,0.0
240,1.0,0.0,0.0,1.0
241,0.0,1.0,0.0,1.0
242,0.0,1.0,1.0,0.0


#### (2) a) Label encoding-

In [1]:
# Each category of the categorical variable is assigned an unique numerical label

In [2]:
#create a sample dataframe-
import pandas as pd
df2= pd.DataFrame({"color": ["red","blue","green","green","red","blue"]})
df2

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [3]:
from sklearn.preprocessing import LabelEncoder

# create an instance of LabelEncoder-
lbl_encoder= LabelEncoder()

# perform fit and transform-
lbl_encoder.fit_transform(df2[["color"]])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2, 0])

In [4]:
# so it performs labelling alphabetically, so blue=0, green=1, red=2

In [5]:
# for new data entries-
lbl_encoder.transform([["blue"]])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

In [6]:
lbl_encoder.transform([["red"]])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

#### (2) b) Ordinal Encoding-

In [7]:
# Each category of the categorical variable is assigned an unique numerical value based on its position or rank

In [10]:
#create a sample dataframe with ordinal variable-
import pandas as pd
df3= pd.DataFrame({"size": ["small","medium","large","medium","small","large"]})
df3

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [15]:
from sklearn.preprocessing import OrdinalEncoder

#create an instance of OrdinalEncoder-
or_encoder= OrdinalEncoder(categories=[["small","medium","large"]])

# perform fit and transform
or_encoder.fit_transform(df3[["size"]])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [19]:
or_encoded= or_encoder.fit_transform(df3[["size"]])
or_encoded_df= pd.DataFrame(or_encoded, columns=df3.columns)
pd.concat([df3, or_encoded_df], axis=1)

Unnamed: 0,size,size.1
0,small,0.0
1,medium,1.0
2,large,2.0
3,medium,1.0
4,small,0.0
5,large,2.0


In [20]:
# for new data entries-
or_encoder.transform([["medium"]])



array([[1.]])

#### (3) Target Guided Ordinal Encoding-

In [3]:
# Each category of the categorical variable is replaced with a numerical value equal to the mean/median of the corresponding target variable value

In [7]:
# create a sample data frame with categorical variable and a target variable-

import pandas as pd
df4= pd.DataFrame({"city": ["New York", "London", "Paris", "Tokyo", "New York", "Paris"],
                   "price": [200,150,300,250,180,320]})
df4

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [8]:
df4.groupby("city")["price"].mean()

city
London      150.0
New York    190.0
Paris       310.0
Tokyo       250.0
Name: price, dtype: float64

In [10]:
mean_price= df4.groupby("city")["price"].mean().to_dict()
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [11]:
df4["city_encoded"]= df4["city"].map(mean_price)
df4

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [12]:
# so we give the city_encoded column to our ML model for training

In [13]:
import seaborn as sns
df5= sns.load_dataset("tips")
df5

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [14]:
df5.groupby("time")["total_bill"].mean()

time
Lunch     17.168676
Dinner    20.797159
Name: total_bill, dtype: float64

In [16]:
mean= df5.groupby("time")["total_bill"].mean().to_dict()
mean

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [17]:
df5["time_encoded"]= df5["time"].map(mean)
df5

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159
