Importing the required libraries

In [94]:
import numpy as np
import pandas as pd

Importing the DataSet

In [95]:
dataset = pd.read_csv(r"Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Making features and Prediction matrices from dataset

In [96]:
x = dataset.iloc[:,0:3].values
y = dataset.iloc[:,-1:].values

In [97]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [98]:
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

Taking care of missing data (Filling missing data by taking mean of all observations)

In [99]:
from sklearn.impute import SimpleImputer

In [100]:
si = SimpleImputer(missing_values= np.nan, strategy='mean')
si.fit_transform(x[:,1:3])
x[:,1:3] = si.transform(x[:,1:3])
x[:,1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.77777777777778, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

Encoding the DataSet to help the ML model understand better

Countries will be converted into Binary vectors 1,0,0 : 0,1,0 : 0,0,1 with OneHotEncode

In [101]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ColumnTransformer takes 2 arguments, fit_transform array return nahi krta

In [102]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
x


array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

Now LabelEncoding for dependent variable data as it is much prefered

In [103]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
y = le.fit_transform(y)         #Yes is 1 No is 0
y

Splitting the DataSet into training and test set (set 20% to testing)

In [105]:
from sklearn.model_selection import train_test_split

In [106]:
x_train, x_test, y_test, y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [107]:
x_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [108]:
y_test

array([0, 1, 0, 0, 1, 1, 0, 1])

Now last thing is Feature Scaling

Either do Standardization or Normalization (standardization is preferred)

In [109]:
from sklearn.preprocessing import StandardScaler

Binary vectors par scaling is nonsense

In [110]:
sc = StandardScaler()
x_train[:,3:5] = sc.fit_transform(x_train[:,3:5])
x_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [111]:
x_test[:,3:5] = sc.fit_transform(x_test[:,3:5])
x_test


array([[0.0, 1.0, 0.0, -1.0, -1.0],
       [1.0, 0.0, 0.0, 1.0, 1.0]], dtype=object)

In [118]:
y_test = y_test.reshape(-1,1)
y_test[:,:] = sc.fit_transform(y_test[:,:])
y_test = y_test.flatten()
y_test

array([-1,  1])