In [20]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [12]:
# Load the raw dataset and split it column-wise: every column except the last
# is a feature, and the last column is the label to predict.
df = pd.read_csv("Data.csv")
x, y = (
    df.iloc[:, :-1].values,  # independent variables (feature matrix)
    df.iloc[:, -1].values,   # dependent variable (label vector)
)
print(x, '\n', y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] 
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [13]:
# Fill missing values in the numeric columns (indices 1 and 2) with each
# column's median (strategy='median' -- not the mode or the mean).
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# learned medians; consider fitting on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])  # replace NaN with the column median
print(x, '\n', y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 61000.0]
 ['France' 35.0 58000.0]
 ['Spain' 38.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]] 
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Encoding categorical data

In [14]:
# Independent variables: one-hot encode the categorical column at index 0 and
# pass the remaining columns through unchanged. The encoded indicator columns
# come first in the result, followed by the passthrough columns.
# sparse_threshold=0 forces a dense result: with the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is
# mostly zeros (e.g. a column with many categories), and np.array() on a
# sparse matrix does not produce a usable 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 61000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.0 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the dependent variable

In [15]:
# Convert the string class labels in y to integer codes; in the output shown
# for this dataset 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into training and test sets

In [19]:
# Hold out 25% of the rows as a test set and keep 75% for training; the fixed
# random_state makes the shuffle reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)
print(x_train, '\n\n', x_test, '\n\n', y_train, '\n\n', y_test)

[[1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 40.0 61000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 38.0 52000.0]] 

 [[0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 35.0 58000.0]] 

 [0 1 0 1 1 0 0] 

 [0 1 1]


## Feature scaling: standardizing features to zero mean and unit variance

In [21]:
# Standardize the features. The scaler's statistics are learned from the
# training rows only and then reused on the test rows, so the test set does
# not influence the scaling parameters.
# NOTE(review): every column is scaled, including the one-hot indicator
# columns -- confirm this is intended, or restrict scaling to the numeric
# columns.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

print(x_train, '\n\n', x_test)

[[ 1.15470054 -0.63245553 -0.63245553  0.8968186   0.92684402]
 [ 1.15470054 -0.63245553 -0.63245553  1.6577556   1.70986742]
 [-0.8660254   1.58113883 -0.63245553 -1.76646088 -1.08664471]
 [ 1.15470054 -0.63245553 -0.63245553 -0.43482114  0.36754159]
 [-0.8660254   1.58113883 -0.63245553  0.13588161 -0.30362132]
 [-0.8660254  -0.63245553  1.58113883 -0.24458689 -0.30362132]
 [-0.8660254  -0.63245553  1.58113883 -0.24458689 -1.31036568]] 

 [[-0.8660254   1.58113883 -0.63245553  2.03822409  2.15730936]
 [-0.8660254  -0.63245553  1.58113883 -2.33716363 -1.75780762]
 [ 1.15470054 -0.63245553 -0.63245553 -0.81528964 -0.63920277]]
