In [282]:
import pandas as pd
import numpy as np

In [283]:
df = pd.read_csv('Dataset/data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [284]:
df.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [285]:
df.nunique()

Country      3
Age          9
Salary       9
Purchased    2
dtype: int64

In [286]:
data = df.values
data

array([['France', 44.0, 72000.0, 'No'],
       ['Spain', 27.0, 48000.0, 'Yes'],
       ['Germany', 30.0, 54000.0, 'No'],
       ['Spain', 38.0, 61000.0, 'No'],
       ['Germany', 40.0, nan, 'Yes'],
       ['France', 35.0, 58000.0, 'Yes'],
       ['Spain', nan, 52000.0, 'No'],
       ['France', 48.0, 79000.0, 'Yes'],
       ['Germany', 50.0, 83000.0, 'No'],
       ['France', 37.0, 67000.0, 'Yes']], dtype=object)

In [287]:
X = data[:, :-1]
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [288]:
y = data[:, -1]
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

Splitting into test and train dataset

In [289]:
from sklearn.model_selection import train_test_split

In [290]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [291]:
X_train

array([['Spain', nan, 52000.0],
       ['Germany', 40.0, nan],
       ['France', 44.0, 72000.0],
       ['Spain', 38.0, 61000.0],
       ['Spain', 27.0, 48000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 35.0, 58000.0]], dtype=object)

In [292]:
X_test

array([['Germany', 30.0, 54000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [293]:
y_train

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes'], dtype=object)

In [294]:
y_test

array(['No', 'Yes'], dtype=object)

In [295]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8, 3)
(2, 3)
(8,)
(2,)


Handling missing Data

In [296]:
from sklearn.impute import SimpleImputer

In [297]:
#imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
#strategy = mean, median, most_frequent, constant(fill_value=xxx)

In [298]:
imputer.fit(X_train[:,1:3])

In [299]:
X_train[:,1:3] = imputer.transform(X_train[:,1:3])
X_train

array([['Spain', 0, 52000.0],
       ['Germany', 40.0, 0],
       ['France', 44.0, 72000.0],
       ['Spain', 38.0, 61000.0],
       ['Spain', 27.0, 48000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 35.0, 58000.0]], dtype=object)

In [300]:
X_test[:,1:3] = imputer.transform(X_test[:,1:3])
X_test

array([['Germany', 30.0, 54000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Categorical Variables

In [301]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [302]:
ct = ColumnTransformer(transformers=[('encode', OneHotEncoder(),[0])], remainder='passthrough')
X_train

array([['Spain', 0, 52000.0],
       ['Germany', 40.0, 0],
       ['France', 44.0, 72000.0],
       ['Spain', 38.0, 61000.0],
       ['Spain', 27.0, 48000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 35.0, 58000.0]], dtype=object)

In [303]:
X_train = ct.fit_transform(X_train)
X_train

array([[0.0, 0.0, 1.0, 0, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [304]:
X_test = ct.transform(X_test)
X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [305]:
from sklearn.preprocessing import OrdinalEncoder

In [306]:
oe = OrdinalEncoder()
X1 = oe.fit_transform(X)
X1

array([[ 0.,  6.,  6.],
       [ 2.,  0.,  0.],
       [ 1.,  1.,  2.],
       [ 2.,  4.,  4.],
       [ 1.,  5., nan],
       [ 0.,  2.,  3.],
       [ 2., nan,  1.],
       [ 0.,  7.,  7.],
       [ 1.,  8.,  8.],
       [ 0.,  3.,  5.]])

In [307]:
from sklearn.preprocessing import LabelEncoder

In [308]:
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [310]:
ct1 = ColumnTransformer(transformers=[('imputer1', SimpleImputer(strategy='mean'),[1,2])], remainder='passthrough')

In [311]:
ct1