# Data Preprocessing Tools

## Importing the libraries

In [203]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [204]:
dataset = pd.read_csv('Data.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [205]:
#Independant data
X = dataset.iloc[:,:-1].values #values of all columns except last(at -1 which is index of last column)
X #X is a numpy array

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [206]:
#Dependant Variable
Y = dataset.iloc[:,-1].values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [207]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [208]:
#Replace nan/missing values by Average: Using SciKit Learn

In [209]:
from sklearn.impute import SimpleImputer #From Library "sklearn" import module "impute" Import class "SimpleImputer"
imputer = SimpleImputer(missing_values=np.nan,strategy="mean") #Instance of class created. missing values chosen as np.nan and strategy chosen as mean
imputer.fit(X[:,1:3]) #fit method of class will find out the average from matrix given to it(X). fit method expects a 2-D array with numerical values only
X[:,1:3] = imputer.transform(X[:,1:3]) #this method will do the replacement. By default it creates a new instance so we asssign it back

In [210]:
X #nan values are replaced by mean

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [211]:
imputer.statistics_ #after fit and before transform; shows the mean calculated for both age and salary

array([3.87777778e+01, 6.37777778e+04])

In [212]:
#Testing fit concept
imputer.fit([[1, 2, 3], [2, 3, 4]]) #Takes in 2-D array
imputer.statistics_

array([1.5, 2.5, 3.5])

## Encoding categorical data - One hot encoding

In [213]:
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

In [214]:
#Why do we have to do this?
#It will be difficult for the ML model to develop correlation between categorical data & dependant variable
#That's why we need to convert categorical data into numerical columns

#Why can't we simply do encode countries to 1,2,3,4,5..
#We don't want ML model to interpret any numerical order associated with countries and develop a wrong correlation(as 1,2,3 don't really mean anything) which will negatively impact the model

In [215]:
#Solution: One hot encoding

### Encoding the Independent Variable

In [216]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [219]:
#Param 1: transformers : [()] : kind of transformation, kind of encoding, on which column(index/es of that column/s). 
#Param 2: What to do with remaining columns. 'passthrough'=keep remaining columns
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')

#We haven't yet connected this to our data
#X = ct.fit_transform(X)

#We need to convert the output of fit_transform into numpy array for future purpose when we train the model

X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Dependent Variable

In [220]:
#Way 1
np.array(pd.Series(Y).map({'Yes':1,'No':0}))

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [221]:
#Way 2
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(Y)

In [222]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [223]:
from sklearn.model_selection import train_test_split

#Variables Returned
#Matrix of features of train set, Matrix of features of test set, Dependant values of train set, Dependant values of test set 

#Parameters
#Matrix of independant variables, dependant variable, split size(80-20 is ideal) (0.2 will give 20% to test set),
#random_state=1 will give a fix determinstic order to random selection of train-test. Just fixing the seat so that it doesn't change on re-runs

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [224]:
#One hot encoders, age , salary of 8 observations
print(X_train)

[[0.0 1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 1.0 0.0 0.0 35.0 58000.0]]


In [225]:
#One hot encoders, age , salary of 2 observations
print(X_test)

[[0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


In [226]:
#8 purchase decisions corresponding to X_train
print(Y_train)

[0 1 0 0 1 1 0 1]


In [227]:
#2 purchase decisions to Y_train
print(Y_test)

[0 1]


## Feature Scaling

In [228]:
#WHY?

#We have a lot of features/predictor variables & we need to bring all their values in a same range
#Eg Some might be in range 100-300 and some might be in 1-10 so we want to give them equal weightage

In [234]:
#TYPES?
#1. Standardisation: Produces values between -3 and +3 : [x - mean(x)]/[S.D(x)]
#2. Normalisaion: Produces values between 0 and 1 : [x - min(x)]/[max(x) - min(x)]

In [229]:
from sklearn.preprocessing import StandardScaler

In [230]:
sc = StandardScaler()

In [231]:
#Using only non-dummy(discarding one hot encoding variables) vars: age & salary
X_train[:,3:] = sc.fit_transform(X_train[:,3:]) 
X_test[:,3:] = sc.transform(X_test[:,3:]) 

In [232]:
X_train

array([[0.0, 1.0, 0.0, -0.5773502691896258, 1.2909944487358056,
        -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, 1.7320508075688774, -0.7745966692414834,
        -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 1.0, -0.5773502691896258, -0.7745966692414834,
        0.566708506533324, 0.633562432710455],
       [0.0, 1.0, 0.0, -0.5773502691896258, 1.2909944487358056,
        -0.30453019390224867, -0.30786617274297867],
       [0.0, 1.0, 0.0, -0.5773502691896258, 1.2909944487358056,
        -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 1.0, -0.5773502691896258, -0.7745966692414834,
        1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.7320508075688774, -0.7745966692414834,
        1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 1.0, -0.5773502691896258, -0.7745966692414834,
        -0.7401495441200351, -0.5646194287757332]], dtype=object)

In [233]:
X_test

array([[0.0, 1.0, 0.0, 1.7320508075688774, -0.7745966692414834,
        -1.4661817944830124, -0.9069571034860727],
       [1.0, 0.0, 1.0, -0.5773502691896258, -0.7745966692414834,
        -0.44973664397484414, 0.2056403393225306]], dtype=object)