In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv( "Data.csv" )
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
X = df.iloc[:, :-1].values # converting the DataFrame to a numpy array
y = df.iloc[:, -1].values
type( X )

numpy.ndarray

# Handling the missing values

In [21]:
from sklearn.impute import SimpleImputer
# Handling the missing values in the "Salary" and "Age" columns.
imputer = SimpleImputer( missing_values = np.nan, strategy = "median" )
imputer.fit( X[:, 1:] )
X[:, 1:] = imputer.transform( X[:, 1:] )

In [22]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Encoding Categorical data

In [23]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer( transformers = [( "encoder", OneHotEncoder( ), [0] )],
                        remainder = "passthrough" )
X = np.array( ct.fit_transform( X ) )

In [24]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 61000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.0, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [25]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder( )
y = le.fit_transform( y )

In [26]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Train Test Split

In [27]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 1 )

In [28]:
len( x_train )

8

In [29]:
len( x_test )

2

In [30]:
x_train

array([[0.0, 0.0, 1.0, 38.0, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [31]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

# Feature Scaling

In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler( )
x_train[:, 3:] = scaler.fit_transform( x_train[:, 3:] )
x_test[:, 3:] = scaler.fit_transform( x_test[:, 3:] )

In [33]:
x_train

array([[0.0, 0.0, 1.0, -0.28942984211696865, -1.0430254692900243],
       [0.0, 1.0, 0.0, 0.0, -0.2767210428728636],
       [1.0, 0.0, 0.0, 0.5788596842339373, 0.659873256081444],
       [0.0, 0.0, 1.0, -0.28942984211696865, -0.2767210428728636],
       [0.0, 0.0, 1.0, -1.8812939737602963, -1.383605214364318],
       [1.0, 0.0, 0.0, 1.1577193684678746, 1.2558878099614579],
       [0.0, 1.0, 0.0, 1.4471492105848434, 1.5964675550357514],
       [1.0, 0.0, 0.0, -0.7235746052924217, -0.5321558516785838]],
      dtype=object)

In [34]:
x_test

array([[0.0, 1.0, 0.0, -1.0, -1.0],
       [1.0, 0.0, 0.0, 1.0, 1.0]], dtype=object)