# Import Neccessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import Dataset

In [2]:
df = pd.read_csv('data.csv')

# Working with Missing Data

In [3]:
df.isna().sum()

Country      0
Age          2
Salary       2
Purchased    0
dtype: int64

If the count is too min, we can omit that column, means drop that. Remember, we are creating another refined dataframe with dropped columns, the original dataframe remains unchanged.

In [4]:
dropped_df = df.dropna()
dropped_df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes
10,France,44.0,72000.0,No
11,Spain,27.0,48000.0,Yes


# Handling of Categorical Data

We can take care of categorical features by converting them to integers.

There are 2 common ways to do so.
1. Label Encoding
2. One Hot Encoding

In [18]:
from sklearn.preprocessing import LabelEncoder

print("Before Encoding: ", df.head())
Le = LabelEncoder()
Le.fit(df['Country'])
df.Country = Le.transform(df.Country)
print("After Encoding: ", df.head())

Before Encoding:     Country   Age   Salary Purchased
0        0  44.0  72000.0        No
1        2  27.0  48000.0       Yes
2        1  30.0  54000.0        No
3        2  38.0  61000.0        No
4        1  40.0      NaN       Yes
After Encoding:     Country   Age   Salary Purchased
0        0  44.0  72000.0        No
1        2  27.0  48000.0       Yes
2        1  30.0  54000.0        No
3        2  38.0  61000.0        No
4        1  40.0      NaN       Yes


In [19]:
Le.fit(df['Purchased'])
df.Purchased = Le.transform(df.Purchased)
print("After Encoding: ", df.head())

After Encoding:     Country   Age   Salary  Purchased
0        0  44.0  72000.0          0
1        2  27.0  48000.0          1
2        1  30.0  54000.0          0
3        2  38.0  61000.0          0
4        1  40.0      NaN          1


## One Hot Encoding

In [20]:
df1 = pd.get_dummies(df, columns=['Country'])
print(df1.head())

    Age   Salary  Purchased  Country_0  Country_1  Country_2
0  44.0  72000.0          0       True      False      False
1  27.0  48000.0          1      False      False       True
2  30.0  54000.0          0      False       True      False
3  38.0  61000.0          0      False      False       True
4  40.0      NaN          1      False       True      False


In [21]:
from sklearn.preprocessing import OneHotEncoder

oh = OneHotEncoder()
s1 = pd.DataFrame(oh.fit_transform(df.iloc[:, [0, 2]]))
pd.concat([df1, s1], axis=1)

Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2,0
0,44.0,72000.0,0,True,False,False,"(0, 0)\t1.0\n (0, 9)\t1.0"
1,27.0,48000.0,1,False,False,True,"(0, 2)\t1.0\n (0, 3)\t1.0"
2,30.0,54000.0,0,False,True,False,"(0, 1)\t1.0\n (0, 5)\t1.0"
3,38.0,61000.0,0,False,False,True,"(0, 2)\t1.0\n (0, 7)\t1.0"
4,40.0,,1,False,True,False,"(0, 1)\t1.0\n (0, 12)\t1.0"
5,35.0,58000.0,1,True,False,False,"(0, 0)\t1.0\n (0, 6)\t1.0"
6,,52000.0,0,False,False,True,"(0, 2)\t1.0\n (0, 4)\t1.0"
7,48.0,79000.0,1,True,False,False,"(0, 0)\t1.0\n (0, 10)\t1.0"
8,50.0,83000.0,0,False,True,False,"(0, 1)\t1.0\n (0, 11)\t1.0"
9,37.0,67000.0,1,True,False,False,"(0, 0)\t1.0\n (0, 8)\t1.0"


# Scaling and Normalization of Dataset

1. Standard Scaling - Variance before StandardScaler

In [23]:
from sklearn.preprocessing import StandardScaler
# ddof - Delta Degrees of Freedom
print('Before: ', df.var(ddof=0))

ss = StandardScaler()
df.iloc[:, 1:-1] = ss.fit_transform(df.iloc[:, 1:-1])
print(df.head())

print('After: ', df.var(ddof=0))

Before:  Country      6.900000e-01
Age          5.261728e+01
Salary       1.337284e+08
Purchased    2.500000e-01
dtype: float64
   Country       Age    Salary  Purchased
0        0  0.719931  0.711013          0
1        2 -1.623675 -1.364376          1
2        1 -1.210098 -0.845529          0
3        2 -0.107224 -0.240207          0
4        1  0.168495       NaN          1
After:  Country      0.69
Age          1.00
Salary       1.00
Purchased    0.25
dtype: float64


## Normalization

Normalization is the process of scaling individual samples to have unit norm.

In [25]:
from sklearn.preprocessing import Normalizer

norm = Normalizer()
norm_data = dropped_df.copy()
norm_data.iloc[:, 1:-1] = norm.fit_transform(norm_data.iloc[:, 1:-1])
print(norm_data.head())

   Country       Age  Salary Purchased
0   France  0.000611     1.0        No
1    Spain  0.000562     1.0       Yes
2  Germany  0.000556     1.0        No
3    Spain  0.000623     1.0        No
5   France  0.000603     1.0       Yes


## Min-Max Scaler

In [26]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler_data = scaler.fit_transform(df)
scaler_df = pd.DataFrame(scaler_data, columns=df.columns)
scaler_df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,0.0,0.73913,0.685714,0.0
1,1.0,0.0,0.0,1.0
2,0.5,0.130435,0.171429,0.0
3,1.0,0.478261,0.371429,0.0
4,0.5,0.565217,,1.0


# Splitting Dataset into Training and Testing

In [31]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 0:3]
print(X)
y = df.iloc[:, -1]
print(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    Country       Age    Salary
0         0  0.719931  0.711013
1         2 -1.623675 -1.364376
2         1 -1.210098 -0.845529
3         2 -0.107224 -0.240207
4         1  0.168495       NaN
5         0 -0.520801 -0.499631
6         2       NaN -1.018478
7         0  1.271368  1.316334
8         1  1.547087  1.662233
9         0 -0.245083  0.278640
10        0  0.719931  0.711013
11        2 -1.623675 -1.364376
12        1 -1.210098 -0.845529
13        2 -0.107224 -0.240207
14        1  0.168495       NaN
15        0 -0.520801 -0.499631
16        2       NaN -1.018478
17        0  1.271368  1.316334
18        1  1.547087  1.662233
19        0 -0.245083  0.278640
0     0
1     1
2     0
3     0
4     1
5     1
6     0
7     1
8     0
9     1
10    0
11    1
12    0
13    0
14    1
15    1
16    0
17    1
18    0
19    1
Name: Purchased, dtype: int64
