In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read only the columns this section works with, then preview the first rows.
house_cols = ["BsmtQual", "FireplaceQu", "GarageType", "SalePrice"]
df = pd.read_csv("House_price.csv", usecols=house_cols)
df.head()

Unnamed: 0,BsmtQual,FireplaceQu,GarageType,SalePrice
0,Gd,,Attchd,208500
1,Gd,TA,Attchd,181500
2,Gd,TA,Attchd,223500
3,TA,Gd,Detchd,140000
4,Gd,TA,Attchd,250000


In [3]:
# Count how often each BsmtQual label occurs (missing values are not counted).
df["BsmtQual"].value_counts(dropna=True)

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

## Nominal Encoding

### One hot encoding

In [4]:
# One-hot encode BsmtQual. drop_first=True drops the first category, which
# then acts as the all-zeros baseline.
# dtype=int pins the 0/1 indicators: recent pandas versions return True/False
# columns by default.
# NOTE: rows with a missing BsmtQual also encode as all zeros and therefore
# look like the baseline category; pass dummy_na=True if they must stay
# distinguishable.
pd.get_dummies(df["BsmtQual"], drop_first=True, dtype=int)

Unnamed: 0,Fa,Gd,TA
0,0,1,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,1,0
...,...,...,...
1455,0,1,0
1456,0,1,0
1457,0,0,1
1458,0,0,1


#### One-hot encoding when a feature has many categories
- Only encode the most frequent categories (here, the top 10)

In [4]:
# Load the Mercedes data, keeping only the columns X0 through X6.
mercedes_cols = [f"X{i}" for i in range(7)]
df = pd.read_csv("mercedes.csv", usecols=mercedes_cols)

In [5]:
# Preview the first five rows of the loaded columns.
df.head(5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [6]:
# Count how often each X0 label occurs (missing values are not counted).
df["X0"].value_counts(dropna=True)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
m      34
ai     34
e      32
ba     27
at     25
a      21
ax     19
aq     18
am     18
i      18
u      17
aw     16
l      16
ad     14
au     11
k      11
b      11
r      10
as     10
bc      6
ao      4
c       3
aa      2
q       2
ac      1
g       1
ab      1
Name: X0, dtype: int64

In [17]:
# Number of most frequent X0 labels that get their own indicator column.
TOP_N = 10

# value_counts() already returns the counts in descending order, so the first
# TOP_N index entries are the most frequent labels; no extra sort is needed.
# Built as a plain list in one step so lst_10 never changes type.
lst_10 = df["X0"].value_counts().head(TOP_N).index.tolist()
lst_10

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [18]:
# Add one 0/1 indicator column per selected label; each new column is named
# after the label it flags.
for category in lst_10:
    df[category] = np.where(df["X0"].eq(category), 1, 0)

In [22]:
# Include the source column so the indicators can be compared against it.
# Guarded so that re-running this cell does not append "X0" a second time,
# which would select the X0 column twice in df[lst_10].
if "X0" not in lst_10:
    lst_10.append("X0")

In [23]:
# Show every column named in lst_10, for all rows.
df.loc[:, lst_10]

Unnamed: 0,z,ak,y,ay,t,x,o,f,n,w,X0
0,0,0,0,0,0,0,0,0,0,0,k
1,0,0,0,0,0,0,0,0,0,0,k
2,0,0,0,0,0,0,0,0,0,0,az
3,0,0,0,0,0,0,0,0,0,0,az
4,0,0,0,0,0,0,0,0,0,0,az
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,ak
4205,0,0,0,0,0,0,0,0,0,0,j
4206,0,1,0,0,0,0,0,0,0,0,ak
4207,0,0,0,0,0,0,0,0,0,0,al


### Mean Encoding - useful for high-cardinality features such as pincodes
Advantages
- Captures information within the label

Disadvantages
- Prone to overfitting

In [62]:
# Load the Titanic training data restricted to the three columns used here.
titanic_cols = ["Survived", "Embarked", "Cabin"]
df = pd.read_csv("titanic_train.csv", usecols=titanic_cols)
df.head()

Unnamed: 0,Survived,Cabin,Embarked
0,0,,S
1,1,C85,C
2,1,,S
3,1,C123,S
4,0,,S


In [63]:
# Treat a missing cabin as its own "Missing" category.
# The result is assigned back rather than using fillna(inplace=True) on the
# df["Cabin"] selection: that form is chained assignment, which pandas warns
# about and which no longer updates df once Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [64]:
# Keep only the first character of each cabin value (after casting to str).
cabin_text = df["Cabin"].astype(str)
df["Cabin"] = cabin_text.str.get(0)
df.head()

Unnamed: 0,Survived,Cabin,Embarked
0,0,M,S
1,1,C,C
2,1,M,S
3,1,C,S
4,0,M,S


In [65]:
# Distinct Cabin labels, in order of first appearance.
pd.unique(df["Cabin"])

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [68]:
# Mean of Survived within each Cabin label, stored as a {label: mean} dict.
survival_by_cabin = df.groupby("Cabin")["Survived"].mean()
mean_ordinal = survival_by_cabin.to_dict()

In [69]:
# Look up each row's Cabin label in mean_ordinal and store the result in a
# new column, leaving the original Cabin column in place.
cabin_mean = df["Cabin"].map(mean_ordinal)
df["Mean_Ordinal"] = cabin_mean
df.head()

Unnamed: 0,Survived,Cabin,Embarked,Mean_Ordinal
0,0,M,S,0.299854
1,1,C,C,0.59322
2,1,M,S,0.299854
3,1,C,S,0.59322
4,0,M,S,0.299854


## Ordinal Encoding

### Label Encoding

In [None]:
# Reload the raw Titanic columns so this section starts from unmodified data.
titanic_cols = ["Survived", "Embarked", "Cabin"]
df = pd.read_csv("titanic_train.csv", usecols=titanic_cols)
df.head()

In [11]:
from sklearn.preprocessing import LabelEncoder

In [17]:
label = LabelEncoder()

# Encode Cabin only while it still holds raw (non-numeric) labels, so that
# re-running this cell does not re-encode the integer codes.
# The guard checks the dtype of the whole column instead of the type of the
# value in row 1, which would silently skip encoding if that row were missing.
if not pd.api.types.is_numeric_dtype(df["Cabin"]):
    df["Cabin"] = label.fit_transform(df["Cabin"])

In [18]:
# Preview the first five rows after encoding.
df.head(5)

Unnamed: 0,Survived,Cabin,Embarked
0,0,147,S
1,1,81,C
2,1,147,S
3,1,55,S
4,0,147,S


### Target-Guided Ordinal Encoding
- Order the labels by the mean of the target within each label, then number them in that order

In [41]:
# Reload the raw Titanic columns for the target-guided encoding.
titanic_cols = ["Survived", "Embarked", "Cabin"]
df = pd.read_csv("titanic_train.csv", usecols=titanic_cols)
df.head()

Unnamed: 0,Survived,Cabin,Embarked
0,0,,S
1,1,C85,C
2,1,,S
3,1,C123,S
4,0,,S


In [43]:
# Treat a missing cabin as its own "Missing" category.
# The result is assigned back rather than using fillna(inplace=True) on the
# df["Cabin"] selection: that form is chained assignment, which pandas warns
# about and which no longer updates df once Copy-on-Write is enabled.
df["Cabin"] = df["Cabin"].fillna("Missing")

In [47]:
 # Keep only the first character of each cabin value (after casting to str).
 cabin_text = df["Cabin"].astype(str)
 df["Cabin"] = cabin_text.str.get(0)

In [55]:
# Cabin labels ordered from the lowest to the highest mean of Survived.
mean_survival = df.groupby("Cabin")["Survived"].mean()
target_label = mean_survival.sort_values(ascending=True).index

In [57]:
# Number the labels 0, 1, 2, ... in the order they appear in target_label.
target_label2 = dict(zip(target_label, range(len(target_label))))
target_label2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [58]:
# Look up each row's Cabin label in target_label2 and store the resulting
# integer in a new column, leaving the original Cabin column in place.
cabin_rank = df["Cabin"].map(target_label2)
df["Cabin_label"] = cabin_rank
df.head()

Unnamed: 0,Survived,Cabin,Embarked,Cabin_label
0,0,M,S,1
1,1,C,C,4
2,1,M,S,1
3,1,C,S,4
4,0,M,S,1
