# Encoding

### Mean Encoding (For Nominal)

In [2]:
# importing libraries 
import pandas as pd 
  
# creating dataset 
data={'SubjectName':['s1','s2','s3','s1','s4','s3','s2','s1','s2','s4','s1'], 
      'Target':[1,0,1,1,1,0,0,1,1,1,0]} 
  
df = pd.DataFrame(data) 
  
print(df) 

   SubjectName  Target
0           s1       1
1           s2       0
2           s3       1
3           s1       1
4           s4       1
5           s3       0
6           s2       0
7           s1       1
8           s2       1
9           s4       1
10          s1       0


In [3]:
df.groupby(['SubjectName'])['Target'].count() 

SubjectName
s1    4
s2    3
s3    2
s4    2
Name: Target, dtype: int64

In [4]:
df.groupby(['SubjectName'])['Target'].mean()

SubjectName
s1    0.750000
s2    0.333333
s3    0.500000
s4    1.000000
Name: Target, dtype: float64

In [5]:
Mean_encoded_subject = df.groupby(['SubjectName'])['Target'].mean().to_dict() 
  
df['SubjectName'] =  df['SubjectName'].map(Mean_encoded_subject) 
  
print(df) 

    SubjectName  Target
0      0.750000       1
1      0.333333       0
2      0.500000       1
3      0.750000       1
4      1.000000       1
5      0.500000       0
6      0.333333       0
7      0.750000       1
8      0.333333       1
9      1.000000       1
10     0.750000       0


### Label Encoding (For Ordinal)

In [10]:
import numpy as np 
import pandas as pd 
  
# Import dataset 
df = pd.read_csv('Iris.csv') 


In [11]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
df['Species'].unique()


array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [14]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df['Species']= label_encoder.fit_transform(df['Species']) 
  
df['Species'].unique()

array([0, 1, 2])

In [15]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0


### One Hot Encoding 

In [56]:
import pandas as pd
import numpy as np



df = pd.read_csv('Iris.csv', usecols=['Species'])
df.head()

Unnamed: 0,Species
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


In [57]:
for col in df:
    print(df[col].unique())

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


In [59]:
df.shape

(150, 1)

In [60]:
# let's find the top 10 most frequent categories for the variable X2

data.Species.value_counts().sort_values(ascending=False).head(20)

0    50
1    50
2    50
Name: Species, dtype: int64

In [62]:
# let's make a list with the most frequent categories of the variable

top_10_labels = [y for y in df.Species.value_counts().sort_values(ascending=False).head(10).index]
top_10_labels

['Iris-versicolor', 'Iris-virginica', 'Iris-setosa']

In [63]:
# get whole set of dummy variables, for all the categorical variables

def one_hot_encoding_top_x(df, variable, top_x_labels):
    # function to create the dummy variables for the most frequent labels
    # we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(df[variable]==label, 1, 0)

In [65]:
# read the data again
df = pd.read_csv('Iris.csv')

# encode X2 into the 10 most frequent categories
one_hot_encoding_top_x(df, 'Species', top_10_labels)
df.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_Iris-versicolor,Species_Iris-virginica,Species_Iris-setosa
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0,0,1
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0,0,1
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0,0,1
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0,0,1
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0,0,1


In [68]:
df= df.drop(['Species'], axis=1)

In [69]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species_Iris-versicolor,Species_Iris-virginica,Species_Iris-setosa
0,1,5.1,3.5,1.4,0.2,0,0,1
1,2,4.9,3.0,1.4,0.2,0,0,1
2,3,4.7,3.2,1.3,0.2,0,0,1
3,4,4.6,3.1,1.5,0.2,0,0,1
4,5,5.0,3.6,1.4,0.2,0,0,1
