In [1]:
import pandas as pd
import numpy as np
import sklearn

### 1. One-hot encoding
#### Assumptions : 1. There are finite set of features . 2. Where no ordinal relationship exist B/W them
Note : One-hot encoding has a disadvantage of performance issues as the No. of features increases. 

In [2]:
d1 = dict(Smoking= ['Current','Ex','never','Current','Ex','never','Current','Ex','never','Current','Ex','never','Current'],
          Target = [1,1,0,1,0,0,0,0,1,1,1,0,1])
df = pd.DataFrame(d1)
df

Unnamed: 0,Smoking,Target
0,Current,1
1,Ex,1
2,never,0
3,Current,1
4,Ex,0
5,never,0
6,Current,0
7,Ex,0
8,never,1
9,Current,1


In [3]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
scaler = lb.fit_transform(df['Smoking'])
scaler

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [4]:
df1 = pd.DataFrame(scaler)
all = pd.concat([df,df1], axis = 1)
all[['Smoking','Target']] = all[['Target','Smoking']]

In [5]:
all

Unnamed: 0,Smoking,Target,0,1,2
0,1,Current,1,0,0
1,1,Ex,0,1,0
2,0,never,0,0,1
3,1,Current,1,0,0
4,0,Ex,0,1,0
5,0,never,0,0,1
6,0,Current,1,0,0
7,0,Ex,0,1,0
8,1,never,0,0,1
9,1,Current,1,0,0


#### Second method for same above results ( but not effiecient )

In [6]:
### Alternate calculations for same above results
l1 = df.Smoking.value_counts(ascending=False).index # Note value-counts have an arg 'normalize = True/False' to present values_counts
print(l1)                                           # in normalize order
l1 = list(l1)
for category in l1:
    df['smoking'] = np.where(df['Smoking'] == category , 1,0)
df

Index(['Current', 'never', 'Ex'], dtype='object')


Unnamed: 0,Smoking,Target,smoking
0,Current,1,0
1,Ex,1,1
2,never,0,0
3,Current,1,0
4,Ex,0,1
5,never,0,0
6,Current,0,0
7,Ex,0,1
8,never,1,0
9,Current,1,0


### 2. Ordinal-encoding ( most popular )
#### Assumptions : 1.The integers values have a numbered-ordered relationship like Current>Ex>never etc.
Note : Ordinal-encoding has disadvantage , it may result in unexpected results if the ordering of numbers is not related in any order

In [7]:
df.head(3)

Unnamed: 0,Smoking,Target,smoking
0,Current,1,0
1,Ex,1,1
2,never,0,0


In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Smoking'] = le.fit_transform(df['Smoking'])

In [9]:
df.head()

Unnamed: 0,Smoking,Target,smoking
0,0,1,0
1,1,1,1
2,2,0,0
3,0,1,0
4,1,0,1


In [18]:
### Alternate method
d1 = dict(Smoking= ['Current','Ex','never','Current','Ex','never','Current','Ex','never','Current','Ex','never','Current'],
          Target = [1,1,0,1,0,0,0,0,1,1,1,0,1])
df = pd.DataFrame(d1)
df['Smoking_2'] = df['Smoking']
df.head(3)

Unnamed: 0,Smoking,Target,Smoking_2
0,Current,1,Current
1,Ex,1,Ex
2,never,0,never


In [24]:
varlist = ['Smoking','Smoking_2']
#def encode(x):
#    print(x)
#    return df[x].map()

for var in varlist:
    print(var)
    df[var] = df[var].map({'Current':1, 'Ex':2, 'never':3})

Smoking
Smoking_2


In [25]:
df.head()

Unnamed: 0,Smoking,Target,Smoking_2
0,1,1,1
1,2,1,2
2,3,0,3
3,1,1,1
4,2,0,2


### 3. Count or frequency Encoding
##### In this type of encofing , the count of existance of each category in a variable is calculated and then encoded according to it's value-counts in that variable . Below Example is a good picture of it. 
#### Assumptions : There is no category of variable having similar frequency.
Note : Will not be able to handle if frequencies are sam for two or more categories.

In [26]:
d1 = dict(Smoking= ['Current','Ex','never','Current','Ex','never','Current','Ex','never','Current','Ex','never','Current'],
          Target = [1,1,0,1,0,0,0,0,1,1,1,0,1])
df = pd.DataFrame(d1)
df.head(3)

Unnamed: 0,Smoking,Target
0,Current,1
1,Ex,1
2,never,0


In [28]:
d1 = df['Smoking'].value_counts().to_dict()
d1

{'Current': 5, 'never': 4, 'Ex': 4}

In [30]:
### Count/Frequency Encoding of Data
df['Smoking_freq'] = df['Smoking'].map(d1)
df.head()

Unnamed: 0,Smoking,Target,Smoking_freq
0,Current,1,5
1,Ex,1,4
2,never,0,4
3,Current,1,5
4,Ex,0,4


### Mean encoding
#### In this type of encoding categories are assigned mean value as per the Target value ( like below pivot_table ) . For each category mean is calculated as per Target and the same value is assigned.

#### Assumptions : 
1. High Cardinality categorical features.
2. Does,nt effect the volume helps model learn faster.
#### Disadvantages :
1. Model may overfit 
2. Hard to validate results.
3. Information can be lost , if categories are divided into very few categories because of thier density.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_clipboard()
df.head()

Unnamed: 0,Smoking,Target
0,Current,1
1,Ex,1
2,never,0
3,Current,1
4,Ex,0


In [5]:
dic_mean = df.groupby('Smoking').Target.mean().to_dict()
dic_mean

{'Current': 0.8, 'Ex': 0.5, 'never': 0.25}

In [6]:
pd.pivot_table(data=df , index='Smoking', aggfunc=np.mean)

Unnamed: 0_level_0,Target
Smoking,Unnamed: 1_level_1
Current,0.8
Ex,0.5
never,0.25


In [7]:
dic_mean = df.groupby(['Smoking'])['Target'].mean().to_dict()

In [9]:
print(dic_mean)
df['Smoking_mean_labels'] = df['Smoking'].map(dic_mean)
df

{'Current': 0.8, 'Ex': 0.5, 'never': 0.25}


Unnamed: 0,Smoking,Target,Smoking_mean_labels
0,Current,1,0.8
1,Ex,1,0.5
2,never,0,0.25
3,Current,1,0.8
4,Ex,0,0.5
5,never,0,0.25
6,Current,0,0.8
7,Ex,0,0.5
8,never,1,0.25
9,Current,1,0.8
