## Introduction to Data Preprocessing

In [1]:
import pandas as pd
from io import StringIO 
# StringIO is used only for illustration: it wraps csv_data in a file-like object so that read_csv can read the string as if it were a CSV file on disk.


In [3]:
# Inline CSV sample: the C field is empty in the second data row and the
# D field is empty in the third, which gives us missing values to work with.
csv_data = "\n".join([
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    "5.0,6.0,,8.0",
    "9.0,10.0,11.0,",
    "",  # empty last item keeps the trailing newline
])

In [4]:
# Parse the in-memory CSV text through a file-like StringIO wrapper.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

In [5]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,


In [6]:
# Count the missing entries in each column (isna is the pandas alias of isnull).
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [7]:
# Keep only the rows that have a value in column 'D'; missing values in
# other columns do not cause a row to be dropped. df itself is not modified.
df[df['D'].notna()]

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0


### Using Imputer 

In [8]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. Import it under the
# old name so that Imputer(strategy='median') keeps working, and fall back to
# the legacy class only on releases that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

In [9]:
# Configure the imputer to replace each missing entry with the median of its column.
imr = Imputer(strategy='median')

In [14]:
# Learn the fill values from df's underlying array; the object returned by
# fit() is kept as imr and used for the transform step.
imr = imr.fit(df.values)

In [15]:
# Apply the fitted imputer; the result is a NumPy array with the gaps filled in
# (column labels are not carried over).
imputed_data = imr.transform(df.values)

In [16]:
imputed_data

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.],
       [ 9., 10., 11.,  6.]])

In [17]:
# .values
# Returns a NumPy representation of the DataFrame.
# Only the values in the DataFrame are returned; the axis labels are removed.


### Handling Categorical Data

In [18]:
# Categorical data comes in two kinds:
# ordinal - the categories have a natural order, e.g. sizes M < L < XL
# nominal - the categories have no order, e.g. the color of a shirt; we can't say Red > Blue

In [19]:
# Small example frame mixing a nominal feature (color), an ordinal feature
# (size), a numeric feature (price) and a class label.
df_cat = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['blue', 'L', 20.1, 'class2'],
        ['white', 'M', 30.1, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [20]:
df_cat

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,blue,L,20.1,class2
2,white,M,30.1,class1


#### Mapping Ordinal Features

In [21]:
# Sizes listed from smallest to largest; each one is assigned its 1-based rank.
size_mapping = {label: rank for rank, label in enumerate(['M', 'L'], start=1)}

In [22]:
# Replace each size label with its integer rank from size_mapping.
# NOTE(review): this overwrites the column in place, so re-running the cell would
# presumably turn the already-encoded 1/2 values (which are not keys of
# size_mapping) into NaN — re-create df_cat before running it again.
df_cat['size'] = df_cat['size'].map(size_mapping)

In [23]:
df_cat

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,blue,2,20.1,class2
2,white,1,30.1,class1


#### Using LabelEncoder

In [29]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the class-label strings, then swap the column for the
# integer codes it assigns.
class_le = LabelEncoder().fit(df_cat['classlabel'].values)
df_cat['classlabel'] = class_le.transform(df_cat['classlabel'].values)

In [30]:
df_cat

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,blue,2,20.1,1
2,white,1,30.1,0


#### Handling nominal categorical variables 

In [31]:
# If we reused the integer-mapping strategy from the ordinal feature 'size', we would mislead our model
# into believing that the colors are related by some order. For example, if we use
# blue = 0 and green = 1, the model will treat that as an ordering such as green > blue, which doesn't make any sense.
# The model might still produce results from such an encoding, but those results won't be optimal for us.

In [32]:
# Using get_dummies() for One-Hot Encoding: the string-valued 'color' column
# becomes one indicator column per color, while the numeric 'size' and 'price'
# columns are passed through.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return True/False booleans.
pd.get_dummies(df_cat[['color','size','price']], dtype=int)

Unnamed: 0,size,price,color_blue,color_green,color_white
0,1,10.1,0,1,0
1,2,20.1,1,0,0
2,1,30.1,0,0,1


In [33]:
# drop_first=True drops the first indicator column (color_blue), which is
# redundant: a row with 0 in every remaining color column must be blue.
# dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
# return True/False booleans.
pd.get_dummies(df_cat[['color','size','price']], drop_first=True, dtype=int)

Unnamed: 0,size,price,color_green,color_white
0,1,10.1,1,0
1,2,20.1,0,0
2,1,30.1,0,1
