# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
# Load the CSV file into a pandas DataFrame.
dataset = pd.read_csv('Data.csv')
# iloc selects by integer position: [rows, columns].
# Features: every row, every column except the last one.
X = dataset.iloc[:, :-1].to_numpy()
# Target: every row, last column only.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Inspect the raw feature matrix; missing entries show up as nan.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the raw target vector; the labels are still 'Yes'/'No' strings here.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Fill every missing numeric entry with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns 1 and 2 are the numeric features; column 0 holds text and is skipped.
numeric_block = (slice(None), slice(1, 3))
# NOTE(review): the column means are learned from all rows, i.e. before the
# train/test split performed later in this notebook - confirm this is intended.
X[numeric_block] = imputer.fit_transform(X[numeric_block])

print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

**OneHotEncoding:** encodes a categorical variable as numbers by creating one column per category; each row has a 1 in the column of its own category and 0 in all the others.
This is used instead of simply mapping the categories to values such as 0, 1, 2, mainly because those values would impose an artificial order (ordinality) on the categories.

e.g.
France, Germany, Spain (the encoder orders the categories alphabetically) will be encoded into:
1, 0, 0 for France,
0, 1, 0 for Germany,
0, 0, 1 for Spain

In [None]:
# ColumnTransformer applies a transformer to the selected columns only
from sklearn.compose import ColumnTransformer
# OneHotEncoder turns a categorical column into one 0/1 column per category
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
# sparse_threshold=0 forces a dense result: with the default threshold (0.3)
# a column with many categories can make fit_transform return a SciPy sparse
# matrix, which np.array() would wrap in a 0-d object array instead of
# converting it to a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect X after encoding: three one-hot columns followed by the two numeric columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
# LabelEncoder maps each distinct target label to an integer code
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Learn the label set and encode y in a single pass.
encoded_labels = le.fit_transform(y)
y = np.array(encoded_labels)

In [None]:
# Inspect the encoded target: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

***Split the dataset into the training set and the test set before feature scaling; otherwise the scaler is fitted on test rows too, and information from the test set leaks into training.***

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set and keep 80% for training.
# random_state is fixed so that the same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Inspect the training features (8 of the 10 rows).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Inspect the held-out test features (the remaining 2 rows).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Inspect the training labels.
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Inspect the held-out test labels.
print(y_test)

[0 1]


## Feature Scaling

1.  **Standardisation:** X_new = (X - mean)/Std #works well in almost all cases
2.  **Normalisation**: X_new = (X - X_min)/(X_max - X_min) #works best for normally distributed data

***Do not apply feature scaling on the dummy variables***

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Columns 0-2 are the one-hot dummy variables; only columns 3 onward are scaled.
scaled_cols = slice(3, None)

# fit() computes the statistics needed for scaling from the training rows only
sc.fit(X_train[:, scaled_cols])
# transform() applies the scaling formula using those fitted statistics
X_train[:, scaled_cols] = sc.transform(X_train[:, scaled_cols])

# The test rows are scaled with the statistics fitted on the training set.
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

In [None]:
# Inspect the training features after scaling; the one-hot columns are unchanged.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [None]:
# Inspect the test features after scaling; the one-hot columns are unchanged.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
