# Data Preprocessing Tools

## Importing the libraries

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset
Por convención, se usa una letra mayúscula para nombrar una matriz, como `X`, que contiene las variables independientes (n filas y m columnas), y una letra minúscula para un vector, como `y`, que en este caso contiene la variable dependiente.

In [17]:
# Read the CSV, then split it column-wise: every column except the last one
# becomes the feature matrix X, and the last column becomes the target vector y.
dataset = pd.read_csv('Data.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
# Print the raw frame next to the extracted arrays so the split can be checked by eye.
print(f"{dataset} \n\n X:\n{X} \n\n y: {y}")

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes 

 X:
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] 

 y: ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data (mean strategy)

In [18]:
from sklearn.impute import SimpleImputer

# Replace every NaN in columns 1 and 2 of X with the mean of its column.
# fit_transform learns the column means and fills the gaps in a single call.
# Note: the means are computed over all rows of X, i.e. before any train/test split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

print(pd.DataFrame(X))

         0          1             2
0   France       44.0       72000.0
1    Spain       27.0       48000.0
2  Germany       30.0       54000.0
3    Spain       38.0       61000.0
4  Germany       40.0  63777.777778
5   France       35.0       58000.0
6    Spain  38.777778       52000.0
7   France       48.0       79000.0
8  Germany       50.0       83000.0
9   France       37.0       67000.0


## Encoding categorical data

### Encoding the Independent Variable

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of X and pass every other column through unchanged.
# In the result the encoded (dummy) columns come first, followed by the
# passthrough columns.
#
# sparse_threshold=0 forces ColumnTransformer to always return a dense array.
# OneHotEncoder produces a sparse matrix, and with the default threshold the
# combined result stays sparse whenever its density is low (many categories);
# np.array() on a sparse matrix yields a 0-d object array instead of a 2-D
# matrix, which would break the column slicing done on X afterwards.
ct = ColumnTransformer(
    transformers=[('categorical', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

print(pd.DataFrame(X))

     0    1    2          3             4
0  1.0  0.0  0.0       44.0       72000.0
1  0.0  0.0  1.0       27.0       48000.0
2  0.0  1.0  0.0       30.0       54000.0
3  0.0  0.0  1.0       38.0       61000.0
4  0.0  1.0  0.0       40.0  63777.777778
5  1.0  0.0  0.0       35.0       58000.0
6  0.0  0.0  1.0  38.777778       52000.0
7  1.0  0.0  0.0       48.0       79000.0
8  0.0  1.0  0.0       50.0       83000.0
9  1.0  0.0  0.0       37.0       67000.0


### Encoding the Dependent Variable

In [20]:
from sklearn.preprocessing import LabelEncoder

# Turn the target labels into integer codes and replace y with the encoded
# array. The encoder is kept in `le` after fitting.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
# Bare expression: the notebook displays the encoded vector as the cell output.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Show all four pieces so the row assignment can be inspected.
print(f"X_train:\n{pd.DataFrame(X_train)} \n\n X_test:\n{pd.DataFrame(X_test)} \n\n y_train:\n{y_train} \n\n y_test:\n{y_test}")

X_train:
     0    1    2          3             4
0  0.0  0.0  1.0  38.777778       52000.0
1  0.0  1.0  0.0       40.0  63777.777778
2  1.0  0.0  0.0       44.0       72000.0
3  0.0  0.0  1.0       38.0       61000.0
4  0.0  0.0  1.0       27.0       48000.0
5  1.0  0.0  0.0       48.0       79000.0
6  0.0  1.0  0.0       50.0       83000.0
7  1.0  0.0  0.0       35.0       58000.0 

 X_test:
     0    1    2     3        4
0  0.0  1.0  0.0  30.0  54000.0
1  1.0  0.0  0.0  37.0  67000.0 

 y_train:
[0 1 0 0 1 1 0 1] 

 y_test:
[0 1]


## Feature Scaling
Al conjunto de entrenamiento `X_train` se le aplica `fit_transform`, mientras que al conjunto de test `X_test` solo se le aplica `transform`, de modo que se escale con la misma media y desviación estándar calculadas sobre el conjunto de entrenamiento.

En este ejemplo no se escalan las variables dummy (columnas 0 a 2); solo se escalan las columnas numéricas (de la 3 en adelante).

In [22]:
from sklearn.preprocessing import StandardScaler

# Standardize columns 3 onward; columns 0-2 are left untouched.
# The scaler is fitted on the training rows only, and that same fitted scaler
# is then applied to both the training and the test rows.
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

print(f"X_train:\n{pd.DataFrame(X_train)} \n\n X_test:\n{pd.DataFrame(X_test)}")

X_train:
     0    1    2         3         4
0  0.0  0.0  1.0 -0.191592 -1.078126
1  0.0  1.0  0.0 -0.014117 -0.070132
2  1.0  0.0  0.0  0.566709  0.633562
3  0.0  0.0  1.0  -0.30453 -0.307866
4  0.0  0.0  1.0 -1.901801 -1.420464
5  1.0  0.0  0.0  1.147534  1.232653
6  0.0  1.0  0.0  1.437947  1.574991
7  1.0  0.0  0.0  -0.74015 -0.564619 

 X_test:
     0    1    2         3         4
0  0.0  1.0  0.0 -1.466182 -0.906957
1  1.0  0.0  0.0 -0.449737   0.20564
