# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [15]:
# Read the raw CSV into a DataFrame.
dataset = pd.read_csv('Data.csv')

# iloc selects by integer position: every column except the last forms the
# feature matrix, and the last column is the dependent variable.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [3]:
# Inspect the feature matrix as loaded (every column except the last),
# before any imputation or encoding has been applied.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# Inspect the dependent variable vector taken from the last column of the dataset.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [20]:
# Two common ways to handle missing values:
#   1. Remove the affected rows. Acceptable for a large dataset with few gaps.
#   2. When too much data is missing to simply drop it, replace each missing
#      field with the mean of its column (the approach used here).

# Count the missing entries per column.
print(dataset.isnull().sum())

# sklearn - data science library with many tools (data preprocessing, ...)
from sklearn.impute import SimpleImputer

# The 'mean' strategy only makes sense for numerical columns, so the
# categorical column 0 is left out. The slice is 1:3 rather than 1:2 because
# the upper bound is exclusive: 1:2 would leave out the salary column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [21]:
# Check the feature matrix after imputation: the nan entries in columns 1 and 2
# should now hold the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding splits each class of a column into its own separate column.
# Only column 0 is encoded; remainder='passthrough' keeps the columns that are
# not being one-hot encoded instead of dropping them.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

# Fit the transformer and convert its output to a NumPy array.
X = np.array(ct.fit_transform(X))

In [23]:
# Check the feature matrix after one-hot encoding column 0; the columns that
# were not encoded are passed through unchanged.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels present in y, then replace each label with its
# integer code. The fitted encoder is kept in `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Check the label-encoded dependent variable vector.
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [24]:
# EXAMPLE WITH TITANIC
# Side illustration of the same encoding steps on a second dataset.
# Every result is stored under a `titanic_`-prefixed name so that running this
# cell does not overwrite the `dataset`, `X`, `y`, `ct` and `le` objects that
# the train/test split and feature-scaling cells further down operate on.

# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Load the dataset
titanic_dataset = pd.read_csv('titanic.csv')

# Identify the categorical data
categorical_features = ['Sex', 'Embarked', 'Pclass']

# Keep the target out of the features: with remainder='passthrough' every
# column handed to the transformer ends up in the output, so passing the full
# frame would copy 'Survived' into the feature matrix (target leakage).
titanic_features = titanic_dataset.drop(columns=['Survived'])

# Implement an instance of the ColumnTransformer class
titanic_ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
)

# Apply the fit_transform method on the instance of ColumnTransformer
# Convert the output into a NumPy array
titanic_X = np.array(titanic_ct.fit_transform(titanic_features))

# Use LabelEncoder to encode binary categorical data
titanic_le = LabelEncoder()
titanic_y = titanic_le.fit_transform(titanic_dataset['Survived'])

# Print the updated matrix of features and the dependent variable vector
print(titanic_X)
print(titanic_y)


## Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Fixing random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [26]:
# Training portion of the feature matrix returned by train_test_split.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [27]:
# Held-out test portion of the feature matrix (test_size = 0.2).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [28]:
# Training portion of the dependent variable, split in the same call as X_train.
print(y_train)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']


In [29]:
# Test portion of the dependent variable, split in the same call as X_test.
print(y_test)

['No' 'Yes']


## Feature Scaling
Always apply feature scaling after splitting the dataset, to avoid information leakage.<br>
The test set stands in for unseen data, so it must not influence how the scaler is fitted. If we scaled before splitting, the mean and std dev<br>
would be calculated from all rows, including the test-set values, which they are not supposed to include.<br><br>
We apply feature scaling so that features with larger values do not carry more weight than the others.<br>
This way each of the columns contributes on the same scale. <br><br>
Not needed for all ML models. Ex- Consider Multiple linear regression<br>
y = b0 + b1x1 + b2x2 + ... + bnxn<br>
The coefficient will compensate by taking small values when the feature has large values.<br><br>
Feature Scaling is Always applied to columns<br>
Two main types - <br>
1. Normalization<br>
X’ = (X-Xmin) / (Xmax-Xmin)<br>
[0,1]<br>
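Generally works well, and does not assume that the data is normally distributed<br>
Does not work well in every case (for example, a single outlier sets Xmin or Xmax and squeezes all the other values together)<br><br>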
2. Standardization<br>
X’ = (X - mean) / (Std dev)<br>
Most values fall roughly within [-3, +3]; outliers can fall outside this range<br>
Standardization is recommended for features following normal distributions. That's because it keeps the normal distribution but converts it to have zero mean and unit variance<br>
Works well in practically all cases<br>

In [31]:
from sklearn.preprocessing import StandardScaler

# The one-hot encoded columns (the first three) are deliberately left unscaled:
# scaling them would destroy their meaning and may actually make things worse.
# fit computes the mean and std dev of each scaled column using the training
# rows only.
sc = StandardScaler()
sc.fit(X_train[:, 3:])

# transform applies (value - mean) / std dev so the columns share one scale.
X_train[:, 3:] = sc.transform(X_train[:, 3:])

# The test set is transformed with the same scaler, i.e. with the mean and
# std dev learned from the training set, never with statistics of its own.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [32]:
# Training features after scaling: columns 3 onward are standardized, the
# first three columns are left as they were.
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [33]:
# Test features after being transformed by the scaler fitted on the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830127 -0.9069571034860731]
 [1.0 0.0 0.0 -0.44973664397484425 0.20564033932253029]]
