# Data Pre Processing Using Pandas and scikit Learn 
This is a basic Jupter Notebook that deals with data preprocessing tools used in Python including but not limited to Imputers, LabelEncoders, OneHotEncoder, etc 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
path="Data.csv"
dataset=pd.read_csv(path)

In [43]:
X=dataset.iloc[:,:-1].values  #The input set
Y=dataset.iloc[:,3].values #The output that we need to predict
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


Now we see that this data contains both cateogrical data and Null data. We need to remove all these anomalies in the data in order to train it. There are 2 methods to do this- Either using Pandas or by using sklearn. First of we try to run it using sklearn and then through Pandas

In [44]:
#we can use this one for Pandas
X2, Y2, dataset2=X, Y, dataset
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
#imputer = imputer.fit(X[:, 1:3])
#X[:,1:3]=imputer.transform(X[:,1:3])
X[:,1:3]=imputer.fit_transform(X[:,1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [45]:
labelencoder = LabelEncoder()
Y=labelencoder.fit_transform(Y)
print(Y)

[0 1 0 0 1 1 0 1 0 1]


In [47]:
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()
print(X)

[[  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    4.40000000e+01   7.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    2.70000000e+01   4.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    3.00000000e+01   5.40000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    3.80000000e+01   6.10000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    4.00000000e+01   6.37777778e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    3.50000000e+01   5.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    3.87777778e+01   5.20000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    4.80000000e+01   7.90000000e+04]
 [  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    5.00000000e+01   8.30000000e+04]
 [  0.00000000e+00   1.00000000e+00  

In [52]:
print(pd.DataFrame(X))

     0    1    2    3          4             5
0  0.0  1.0  0.0  0.0  44.000000  72000.000000
1  1.0  0.0  0.0  1.0  27.000000  48000.000000
2  1.0  0.0  1.0  0.0  30.000000  54000.000000
3  1.0  0.0  0.0  1.0  38.000000  61000.000000
4  1.0  0.0  1.0  0.0  40.000000  63777.777778
5  0.0  1.0  0.0  0.0  35.000000  58000.000000
6  1.0  0.0  0.0  1.0  38.777778  52000.000000
7  0.0  1.0  0.0  0.0  48.000000  79000.000000
8  1.0  0.0  1.0  0.0  50.000000  83000.000000
9  0.0  1.0  0.0  0.0  37.000000  67000.000000


Now that we have scaled all the vectors using sklearn we can use Pandas to do the same. For that we have stored the variables in (2) variable set- X2, Y2, etc. The fundamental difference is that you can operate directly on the dataframe instead of using values. 

In [60]:
dataset['Age']=dataset['Age'].fillna(dataset['Age'].mean())
dataset['Salary']=dataset['Salary'].fillna(dataset['Salary'].mean())
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


In [64]:
dataset['Purchased']=dataset['Purchased'].replace({'True':'Yes','False':'No'})
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes


#### Feature Scaling 
We will now use sklearn to scale the features in order to normalize the dataset 

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(X,Y,test_size=0.2,random_state=0)
print(X_train)
print("--------------------------------------------")
print(Y_train)

[[  1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    4.00000000e+01   6.37777778e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    3.70000000e+01   6.70000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    2.70000000e+01   4.80000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    3.87777778e+01   5.20000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    4.80000000e+01   7.90000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    3.80000000e+01   6.10000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    4.40000000e+01   7.20000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    3.50000000e+01   5.80000000e+04]]
--------------------------------------------
[1 1 1 0 1 0 0 1]


In [81]:
from sklearn.preprocessing import StandardScaler 
#The module for feature scaling 
sc_X= StandardScaler()
X_train=sc_X.fit_transform(X_train)
sc_y = StandardScaler()
X_test = sc_y.fit_transform(X_test)
print(X_train)
#We do not need to scale an already binary value for practical appliation but we can choose to do so if we want to 
#We are using the StandardScaler here but we can also use the MinMaxScaler available in the module

[[ 1.         -1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [-1.          1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [ 1.         -1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [ 1.         -1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [-1.          1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [ 1.         -1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [-1.          1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [-1.          1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
