# Preprocessing steps
1. Imputating missing values (`sklearn.impute`)

In [1]:
import pandas as pd
import numpy as np

a = pd.DataFrame({
    'A': [1, 2, 3, None, None, 23,56,67,76],
    'B': [54,None,6, None, 1,2,2,5,5],
    'C': ['A','B', np.nan, np.nan, 'B', 'B', 'A', 'A', 'B'],
    'D':['Purchase','Not Purchase','Purchase','Not Purchase','Purchase',
         'Not Purchase','Purchase','Not Purchase','Purchase']
})
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,A,Purchase
1,2.0,,B,Not Purchase
2,3.0,6.0,,Purchase
3,,,,Not Purchase
4,,1.0,B,Purchase
5,23.0,2.0,B,Not Purchase
6,56.0,2.0,A,Purchase
7,67.0,5.0,A,Not Purchase
8,76.0,5.0,B,Purchase


Rule: never impute the missing values in target variable, drop the rows with missing target variable values

In [2]:
from sklearn.impute import SimpleImputer
# object creation
si = SimpleImputer()
# fit the imputer
a[['A','B']] = si.fit_transform(a[['A','B']])
si2 = SimpleImputer(strategy='most_frequent')
a['C'] = si2.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,A,Purchase
1,2.0,10.714286,B,Not Purchase
2,3.0,6.0,B,Purchase
3,32.571429,10.714286,B,Not Purchase
4,32.571429,1.0,B,Purchase
5,23.0,2.0,B,Not Purchase
6,56.0,2.0,A,Purchase
7,67.0,5.0,A,Not Purchase
8,76.0,5.0,B,Purchase


2. Encoding categorical variables (`sklearn.preprocessing`)
    - Ordinal encoding (`OrdinalEncoder`)
    - Label encoding (`LabelEncoder`) for target variable only
    - One-hot encoding (`OneHotEncoder`)

In [3]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
a['C'] = oe.fit_transform(a[['C']])
a

Unnamed: 0,A,B,C,D
0,1.0,54.0,0.0,Purchase
1,2.0,10.714286,1.0,Not Purchase
2,3.0,6.0,1.0,Purchase
3,32.571429,10.714286,1.0,Not Purchase
4,32.571429,1.0,1.0,Purchase
5,23.0,2.0,1.0,Not Purchase
6,56.0,2.0,0.0,Purchase
7,67.0,5.0,0.0,Not Purchase
8,76.0,5.0,1.0,Purchase


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
a['D'] =le.fit_transform(a[['D']])
a

  y = column_or_1d(y, warn=True)


Unnamed: 0,A,B,C,D
0,1.0,54.0,0.0,1
1,2.0,10.714286,1.0,0
2,3.0,6.0,1.0,1
3,32.571429,10.714286,1.0,0
4,32.571429,1.0,1.0,1
5,23.0,2.0,1.0,0
6,56.0,2.0,0.0,1
7,67.0,5.0,0.0,0
8,76.0,5.0,1.0,1


In [5]:
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    'City' : ['Delhi', 'Mumbai', 'Hyderabad', 'Mumbai', 'Delhi'],
    'pop' : [12, 13, 14, 15, 16]
})
he = OneHotEncoder(drop='first')
enc_city = he.fit_transform(df[['City']])
df = pd.concat([df, pd.DataFrame(enc_city.toarray())], axis=1)
df.drop('City', axis=1, inplace=True)
df

Unnamed: 0,pop,0,1
0,12,0.0,0.0
1,13,0.0,1.0
2,14,1.0,0.0
3,15,0.0,1.0
4,16,0.0,0.0


In [6]:
he.inverse_transform([[1,0],[0,0]])

array([['Hyderabad'],
       ['Delhi']], dtype=object)

In [7]:
from sklearn.preprocessing import StandardScaler
df = pd.DataFrame({
    'salary': [123000,100000,500000,120300],
    'age': [44,34,45,30],
})
sc = StandardScaler()
df[['salary','age']] = sc.fit_transform(df)
df

Unnamed: 0,salary,age
0,-0.525295,0.895953
1,-0.662862,-0.662226
2,1.729601,1.051771
3,-0.541444,-1.285497
