In [1]:
#check if you are in right directory
import os
print(os.listdir())

['.ipynb_checkpoints', 'CleanupData.csv', 'dataPreprocessing.ipynb']


In [2]:
#import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#plotting is visible
%matplotlib inline

In [3]:
#import dataset using pandas
data = pd.read_csv('CleanupData.csv')

In [4]:
data


Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes
5,Delhi,25.0,5800.0,Yes
6,Mumbai,,5200.0,No
7,Delhi,38.0,7900.0,Yes
8,Banglore,40.0,8300.0,No
9,Delhi,27.0,6700.0,Yes


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
State               10 non-null object
Age                 9 non-null float64
Pocket Money        9 non-null float64
Course Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [6]:
#get your independent and dependent matrix out of dataset
X = data.iloc[: , :-1 ].values

In [7]:
#matrix of independent variable
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [8]:
#matrix of dependent variable
y = data.iloc[: , 3].values

In [9]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [10]:
# how to handle missing values

In [11]:
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [12]:
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [13]:
#handle categorical data in our data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

In [14]:
#create dummy matrix for categorical dataset
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
np.set_printoptions(suppress=True)
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

In [16]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [17]:
#handle y matrix for categorical data
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [18]:
#prepare test and training data set
# split our data into 2 parts, one will be training and other will be test

from sklearn.cross_validation import train_test_split
#NOTE - there might be a deprication warning here
#instead of above import, use this
#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
X_train

array([[   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ]])

In [21]:
y_test

array([1, 0])

In [22]:
#Feature scaling
#Standard deviation or Normalization

from sklearn.preprocessing import StandardScaler
scale_X = StandardScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)

In [23]:
X_train

array([[ 1.73205081, -0.77459667, -0.77459667,  1.43392268,  1.51222312],
       [-0.57735027, -0.77459667,  1.29099445, -1.98273337, -1.55112275],
       [ 1.73205081, -0.77459667, -0.77459667, -0.05157995, -0.17018588],
       [-0.57735027, -0.77459667,  1.29099445, -0.23314138, -1.20102608],
       [-0.57735027,  1.29099445, -0.77459667, -0.49723074,  0.11183644],
       [-0.57735027, -0.77459667,  1.29099445, -0.34868048, -0.41330857],
       [-0.57735027,  1.29099445, -0.77459667,  1.13682215,  1.16212645],
       [-0.57735027,  1.29099445, -0.77459667,  0.5426211 ,  0.54945728]])