# Data Preprocessing ✨

*a property of Bagus Cipta Pratama*

## importing packages

In [211]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## importing the dataset 

In [213]:
# Load the raw data; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

In [214]:
# Show the whole DataFrame; with only 10 rows it is small enough to view in full.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [215]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [216]:
# Summary statistics for the numeric columns (Age and Salary).
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


## separating the features (x) and the target (y)

In [218]:
# Feature matrix: every row, all columns except the last one.
x = dataset.iloc[:, :-1].values

In [219]:
# Feature matrix (Country, Age, Salary); the NaN entries are still present here.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [220]:
# Target vector: every row of the last column only.
y = dataset.iloc[:, -1].values

In [221]:
# Target vector: the Purchased column as 'Yes'/'No' strings.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## taking care of missing data 

In [223]:
from sklearn.impute import SimpleImputer

# Replace NaN in columns 1 and 2 of x (Age, Salary) with the mean of each column.
# NOTE(review): the imputer is fitted on every row of x, before the train/test
# split further down, so the column means include rows that later land in the
# test set. Consider fitting on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

In [224]:
# After imputation the former NaN entries in Age and Salary hold the column means.
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## encoding categorical data 

### encoding independent variable 

In [227]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of x (Country) and pass the remaining columns through
# unchanged; the encoded columns come first in the result.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

In [228]:
# The first three columns are now the one-hot encoded Country; Age and Salary follow.
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### encoding dependent variable 

In [230]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels in y to integer codes.
le = LabelEncoder()
y = le.fit(y).transform(y)

In [231]:
# Encoded target: 'No' -> 0, 'Yes' -> 1.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting into training and test set 

In [233]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [234]:
# 8 of the 10 rows end up in the training set (test_size=0.2).
x_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [235]:
# The remaining 2 rows form the test set.
x_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [236]:
# Training labels, in the same row order as x_train.
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [237]:
# Test labels, in the same row order as x_test.
y_test

array([0, 1])

## feature scaling

In [239]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both the training and the test rows.
# NOTE(review): all five columns are scaled, including the three one-hot Country
# columns; restrict this to the Age/Salary columns if the dummies should stay 0/1.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [240]:
# Every column, including the one-hot Country columns, is now standardized.
x_train

array([[-0.77459667, -0.57735027,  1.29099445, -0.19159184, -1.07812594],
       [-0.77459667,  1.73205081, -0.77459667, -0.01411729, -0.07013168],
       [ 1.29099445, -0.57735027, -0.77459667,  0.56670851,  0.63356243],
       [-0.77459667, -0.57735027,  1.29099445, -0.30453019, -0.30786617],
       [-0.77459667, -0.57735027,  1.29099445, -1.90180114, -1.42046362],
       [ 1.29099445, -0.57735027, -0.77459667,  1.14753431,  1.23265336],
       [-0.77459667,  1.73205081, -0.77459667,  1.43794721,  1.57499104],
       [ 1.29099445, -0.57735027, -0.77459667, -0.74014954, -0.56461943]])

In [241]:
# Test rows, scaled with the scaler that was fitted on the training set.
x_test

array([[-0.77459667,  1.73205081, -0.77459667, -1.46618179, -0.9069571 ],
       [ 1.29099445, -0.57735027, -0.77459667, -0.44973664,  0.20564034]])

---

## some personal notes 📌 

### N1 : why do we have to do feature scaling ?
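One reason is to prevent features with large numeric ranges from dominating features with small ranges. Here Salary is in the tens of thousands while Age is in the tens, so without scaling, Salary would outweigh Age in any model that relies on distances or gradient magnitudes. Putting the features on a comparable scale helps such models perform well.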

### N2 : what is random_state and why do we have to set it to 1 ?
`random_state` seeds the random shuffling applied when the data is split. Setting it to a fixed value (e.g. 1) ensures the split is identical every time the code is run, which makes experimental results reproducible. The value 1 itself is arbitrary; any fixed integer works.

### N3 : which should be done first? splitting dataset or feature scaling ? and why ?
Splitting should be done first, to avoid data leakage. If we perform feature scaling before splitting the dataset, the scaler computes the mean and standard deviation over the entire dataset, so information from the test data leaks into the training data.

### N4 : why doesn't x_test use the fit method in the feature scaling above ?
Because we don't want to recalculate the mean and standard deviation from the test data. The test set must be transformed with the statistics learned from the training set; refitting on the test data would put the training and test data on different scales.