In [114]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import (
    Imputer, LabelEncoder, OneHotEncoder, StandardScaler
)
from sklearn.cross_validation import train_test_split

In [115]:
# Load the dataset into a pandas DataFrame.
dataset = pd.read_csv('datasets/data.csv')
# Feature set X (the independent variables): every column except the last.
X = dataset.iloc[:, :-1].values
# Predicted label y (the dependent variable): the last column. Selecting
# it with -1 keeps it in step with the X slice above, so y can never be
# a column that is also part of X if the CSV gains or loses a column.
y = dataset.iloc[:, -1].values

In [116]:
# If there is missing data in the feature set we fill each gap with the
# mean of the column it is missing from. The other values in that row
# can then still be used and the filled-in value does not skew the
# column's mean.
#
# Since our feature set consists of categorical and numeric data we
# can only use the imputer on the numeric columns; there is no mean of
# a categorical feature.

# Missing values are empty cells, expressed as NaN in pandas.
# The strategy is mean and axis is 0 for taking the mean of the column.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
# The lower bound is inclusive, the upper bound is exclusive:
# [1:3] selects columns 1 and 2.
# fit() learns the column means and stores them on `imputer` itself, so
# its return value does not need to be bound to another name.
imputer.fit(X[:, 1:3])
# Write the transformed data back into X; missing entries are now filled.
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [117]:
# Categorical data has to become numerical before a model can use it.
# Step one: replace every category in column 0 with an integer code.
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(X[:, 0])
X[:, 0] = category_codes

# Integer codes alone imply an ordering (2 > 1 > 0), so the model could
# conclude that one country ranks above the others. One Hot Encoding
# removes that ordering by spreading column 0 over separate indicator
# columns that only hold 1 and 0.
one_hot_encoder = OneHotEncoder(categorical_features=[0])
X_encoded = one_hot_encoder.fit_transform(X)
# The encoder's result is converted to a regular array with toarray().
X = X_encoded.toarray()

# The predicted label only has 2 values, so a plain label encoding
# (0 or 1) is enough: with two classes there is no misleading ordering
# to worry about.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [118]:
# To measure how well a model predicts the label from the feature set,
# the dataset is divided into training and testing data. The model is
# fitted on the training part only and its accuracy is then measured on
# the testing part. A 75%/25% ratio is common; here we use an 80/20
# split (test_size=0.2). random_state=0 fixes the shuffle so the split
# is the same on every run.
train_test_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_parts

In [119]:
# Every independent variable should carry equal weight, but the numeric
# columns are not on the same scale, so columns with a larger numeric
# range would dominate columns with a smaller one.
# StandardScaler puts all feature columns on a common scale:
#     standardised value = (value - column mean) / column standard deviation
# The scaler is fitted on the training data only; the statistics learned
# there are then applied unchanged to both the training and testing data.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train = standard_scaler.transform(X_train)
X_test = standard_scaler.transform(X_test)