# Data Preprocessing

### Importing the libraries

In [202]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [203]:
dataset = pd.read_csv("Data.csv")
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


_Split dataset into independent variables (X) and dependant variable (y)_

In [204]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

In [205]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [206]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handle Missing Data

In [207]:
from sklearn.preprocessing import Imputer

In [208]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis = 0)

In [209]:
imputer = imputer.fit(X[:, 1:3])

In [210]:
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [211]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Encoding Categorical Data

In [212]:
from sklearn.preprocessing import LabelEncoder

In [213]:
labelencoder_X = LabelEncoder()

In [214]:
#Encoding data from the colomn "Countries" which has an index "0"
X[:,0] = labelencoder_X.fit_transform(X[:,0])

In [215]:
#Now the names of the countries are represented by the numbers
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

_However, we have a problem, because now machine learning algorithms will think that our countries is ordered (we have three categories: 0, 1, 2) that will lead to wrong predictions_


### One-Hot Encoding

In [216]:
from sklearn.preprocessing import OneHotEncoder

In [217]:
onehotencoder = OneHotEncoder(categorical_features=[0])

In [218]:
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [219]:
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [258]:
labelencoder_y = LabelEncoder()

#Encoding data from "y" column (encoding our dependant variable)
y = labelencoder_X.fit_transform(y)

In [259]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

### Splitting the Dataset into the Training Set and Test Set

In [260]:
from sklearn.model_selection import train_test_split

In [261]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [262]:
from sklearn.preprocessing import StandardScaler

In [263]:
sc_X = StandardScaler().fit(X_train)

In [271]:
X_train = sc_X.fit_transform(X_train)

In [272]:
X_test = sc_X.transform(X_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1. 0. 1. 0. 0. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.