In [1]:
# Loading the Libraries

# import libraries
import pandas as pd
import numpy as np
# Imputer is used to fill in (impute) missing values
from sklearn.preprocessing import Imputer

#

import matplotlib.pyplot as plt

# Collect, Load and Understand the Data

In [2]:
# Read the CSV file into a DataFrame; the path is relative to the notebook's working directory.
DATA_FILE = 'Data.csv'
dataset = pd.read_csv(DATA_FILE)

In [3]:
# Preview the first five rows (five is head's default row count).
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:

# Independent variables X: every column except the last, as a NumPy array.
feature_columns = dataset.iloc[:, :-1]
X = feature_columns.values
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [5]:

# Dependent variable Y: the last column only, as a NumPy array.
target_column = dataset.iloc[:, -1]
Y = target_column.values
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Clean, Impute, Perform Statistical Analysis

In [6]:
# Look at the first five rows again before cleaning the data.
dataset.iloc[:5]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [7]:
# Count the missing (null) entries in each column.
missing_mask = dataset.isnull()
missing_mask.sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# Print a summary of the DataFrame: index range, per-column non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 392.0+ bytes


In [9]:
# Show the first ten rows so the rows containing missing values are visible.
dataset.head(n=10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
# Mean of the Age column (pandas skips NaN entries by default).
dataset.loc[:, 'Age'].mean()

38.77777777777778

In [11]:
# Mean of the Salary column (pandas skips NaN entries by default).
dataset.loc[:, 'Salary'].mean()

63777.77777777778

In [12]:

# Build a mean-strategy imputer and fit it on columns 1 and 2 of X (Age and Salary)
# so that it learns the mean of each of those columns.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its replacement on newer installs.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0).fit(X[:, 1:3])

In [13]:
# Overwrite the Age and Salary columns with their imputed version (NaN -> column mean).
imputed_numeric = imputer.transform(X[:, 1:3])
X[:, 1:3] = imputed_numeric

In [14]:
# Display X to confirm that the NaN entries were replaced by the column means.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Label Encoding

In [15]:
# LabelEncoder maps the categorical values to integer codes;
# OneHotEncoder is imported here for the dummy-variable step that follows.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace column 0 of X (Country) with its integer codes.
label_encoder_X = LabelEncoder()
country_codes = label_encoder_X.fit_transform(X[:, 0])
X[:, 0] = country_codes


In [16]:
# Validate the Label Encoding
X[:,0:2]

array([[0L, 44.0],
       [2L, 27.0],
       [1L, 30.0],
       [2L, 38.0],
       [1L, 40.0],
       [0L, 35.0],
       [2L, 38.77777777777778],
       [0L, 48.0],
       [1L, 50.0],
       [0L, 37.0]], dtype=object)

# OneHotEncoder

In [17]:
# One-hot encode column 0 (the integer country codes), then convert the sparse
# result into a dense NumPy array.
# NOTE(review): the `categorical_features` argument was deprecated in scikit-learn 0.20
# and removed in 0.22; newer installs need `ColumnTransformer` instead.
one_hot_encoder = OneHotEncoder(categorical_features=[0])
encoded_sparse = one_hot_encoder.fit_transform(X)
X = encoded_sparse.toarray()

In [18]:
# Validate the OneHot Encoder
X[:,0:4]

array([[  1.        ,   0.        ,   0.        ,  44.        ],
       [  0.        ,   0.        ,   1.        ,  27.        ],
       [  0.        ,   1.        ,   0.        ,  30.        ],
       [  0.        ,   0.        ,   1.        ,  38.        ],
       [  0.        ,   1.        ,   0.        ,  40.        ],
       [  1.        ,   0.        ,   0.        ,  35.        ],
       [  0.        ,   0.        ,   1.        ,  38.77777778],
       [  1.        ,   0.        ,   0.        ,  48.        ],
       [  0.        ,   1.        ,   0.        ,  50.        ],
       [  1.        ,   0.        ,   0.        ,  37.        ]])

In [19]:
# Encode the dependent variable ('No'/'Yes') as integers using its own encoder.
# Fit label_encoder_Y rather than label_encoder_X: refitting label_encoder_X on Y
# would overwrite the country classes it learned earlier and leave label_encoder_Y
# unfitted, so neither encoder could be used for inverse_transform afterwards.
label_encoder_Y = LabelEncoder()
Y = label_encoder_Y.fit_transform(Y)

In [20]:
# Validate Y: display the encoded dependent variable (integer class labels).
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# Split the Dataset

In [21]:
# Split the data into a training set and a test set (test_size=0.2 keeps 20% for testing;
# random_state fixes the shuffle so the split is reproducible).
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42
)



In [22]:
# Number of rows in the training features and training labels (they must match).
tuple(map(len, (X_train, Y_train)))

(8, 8)

In [23]:
# Number of rows in the test features and test labels (they must match).
tuple(map(len, (X_test, Y_test)))

(2, 2)

In [24]:
# Validate the split by displaying the training features.
# suppress=True makes NumPy print floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
X_train
# Column layout: [0:3] are the one-hot Country columns, followed by Age and Salary

array([[     1.        ,      0.        ,      0.        ,     35.        ,
         58000.        ],
       [     1.        ,      0.        ,      0.        ,     44.        ,
         72000.        ],
       [     1.        ,      0.        ,      0.        ,     48.        ,
         79000.        ],
       [     0.        ,      1.        ,      0.        ,     30.        ,
         54000.        ],
       [     1.        ,      0.        ,      0.        ,     37.        ,
         67000.        ],
       [     0.        ,      1.        ,      0.        ,     40.        ,
         63777.77777778],
       [     0.        ,      0.        ,      1.        ,     38.        ,
         61000.        ],
       [     0.        ,      0.        ,      1.        ,     38.77777778,
         52000.        ]])

# Feature Scaling

In [25]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the training rows only,
# then apply that same scaling to both the training rows and the test rows.
stan_scal_X = StandardScaler()
stan_scal_X.fit(X_train)
X_train = stan_scal_X.transform(X_train)
X_test = stan_scal_X.transform(X_test)

In [26]:
# Display the scaled test features; suppress=True keeps the floats in fixed-point notation.
np.set_printoptions(suppress=True)
X_test

array([[-1.        ,  1.73205081, -0.57735027,  2.18271808,  2.30089209],
       [-1.        , -0.57735027,  1.73205081, -2.3186283 , -1.79680973]])