# Handling Null Values

In [98]:
import numpy as np
import pandas as pd

In [99]:
dataset = [['Ankur',25,65],['Abhinav',23],['Suraj',22,70],['Ashwini',22],['Vaishnavi',23,55]]
dataset = pd.DataFrame(dataset,columns=['Name','Age','Weight'])

In [100]:
dataset

Unnamed: 0,Name,Age,Weight
0,Ankur,25,65.0
1,Abhinav,23,
2,Suraj,22,70.0
3,Ashwini,22,
4,Vaishnavi,23,55.0


In [101]:
dataset.isnull().sum()

Name      0
Age       0
Weight    2
dtype: int64

In [102]:
dataset.dropna()

Unnamed: 0,Name,Age,Weight
0,Ankur,25,65.0
2,Suraj,22,70.0
4,Vaishnavi,23,55.0


In [103]:
dataset

Unnamed: 0,Name,Age,Weight
0,Ankur,25,65.0
1,Abhinav,23,
2,Suraj,22,70.0
3,Ashwini,22,
4,Vaishnavi,23,55.0


In [104]:
from sklearn.impute import SimpleImputer

In [105]:
imputer = SimpleImputer(missing_values =np.nan,strategy = 'mean')
dataset['Weight'] = imputer.fit_transform(dataset[['Weight']])

In [106]:
dataset

Unnamed: 0,Name,Age,Weight
0,Ankur,25,65.0
1,Abhinav,23,63.333333
2,Suraj,22,70.0
3,Ashwini,22,63.333333
4,Vaishnavi,23,55.0


In [107]:
dataset.fillna(dataset.mean())

  dataset.fillna(dataset.mean())


Unnamed: 0,Name,Age,Weight
0,Ankur,25,65.0
1,Abhinav,23,63.333333
2,Suraj,22,70.0
3,Ashwini,22,63.333333
4,Vaishnavi,23,55.0


# Standardization

In [108]:
from sklearn.preprocessing import StandardScaler 

In [109]:
std = StandardScaler()
data = std.fit_transform(dataset[['Age','Weight']])
data

array([[ 1.82574186,  0.34503278],
       [ 0.        ,  0.        ],
       [-0.91287093,  1.38013112],
       [-0.91287093,  0.        ],
       [ 0.        , -1.7251639 ]])

# Handling Categorical Variables

### Ordinal Categiorical Varaibles

In [145]:
# Creating a dummy dataframe
data = pd.DataFrame(data = 
                     [['green','M',10.1,'class1'],
                      ['blue','L',20.1,'class2'],
                      ['white','M',30.1,'class1'],
                     ['orange','S',15.1,'class3'],
                     ['pink','XL',19.1,'class4']])
data.columns = ['Color','size','price','classlabel']

In [146]:
data

Unnamed: 0,Color,size,price,classlabel
0,green,M,10.1,class1
1,blue,L,20.1,class2
2,white,M,30.1,class1
3,orange,S,15.1,class3
4,pink,XL,19.1,class4


#### Using Map function()

In [147]:
Size = {'S':0,'M':1,'L':2,'XL':3}
data['size'] = data['size'].map(Size)

In [148]:
data

Unnamed: 0,Color,size,price,classlabel
0,green,1,10.1,class1
1,blue,2,20.1,class2
2,white,1,30.1,class1
3,orange,0,15.1,class3
4,pink,3,19.1,class4


In [149]:
data['classlabel'] = data['classlabel'].astype('category')

In [150]:
data['classlabel'] = data['classlabel'].cat.codes
data

Unnamed: 0,Color,size,price,classlabel
0,green,1,10.1,0
1,blue,2,20.1,1
2,white,1,30.1,0
3,orange,0,15.1,2
4,pink,3,19.1,3


#### Using Label Encoder

In [126]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
data['classlabel'] = label.fit_transform(data['classlabel'].values)

In [127]:
data

Unnamed: 0,Color,size,price,classlabel
0,green,1,10.1,0
1,blue,2,20.1,1
2,white,1,30.1,0
3,orange,0,15.1,2
4,pink,3,19.1,3


# Nominal Categorical Variables

In [133]:
## Using One-hot encoding
data= pd.get_dummies(data[['Color','size','price']],drop_first = True)

In [134]:
data

Unnamed: 0,size,price,Color_green,Color_orange,Color_pink,Color_white
0,1,10.1,1,0,0,0
1,2,20.1,0,0,0,0
2,1,30.1,0,0,0,1
3,0,15.1,0,1,0,0
4,3,19.1,0,0,1,0


In [157]:
from sklearn.preprocessing import OneHotEncoder

In [158]:
enc = OneHotEncoder(handle_unknown='ignore')
data_cf = pd.DataFrame(enc.fit_transform(data[['Color']]).toarray())

In [160]:
data = data.join(data_cf)
data

Unnamed: 0,Color,size,price,classlabel,0,1,2,3,4
0,green,1,10.1,0,0.0,1.0,0.0,0.0,0.0
1,blue,2,20.1,1,1.0,0.0,0.0,0.0,0.0
2,white,1,30.1,0,0.0,0.0,0.0,0.0,1.0
3,orange,0,15.1,2,0.0,0.0,1.0,0.0,0.0
4,pink,3,19.1,3,0.0,0.0,0.0,1.0,0.0
