Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Importing the dataset

In [2]:
# Load the raw table. The last column is taken as the dependent variable and
# every column before it as an independent variable; both are selected
# relative to the end of the frame so the two slices always stay in agreement.
dataset = pd.read_csv('Data.csv')
independent_variables = dataset.iloc[:, :-1].values
dependent_variables = dataset.iloc[:, -1].values

Taking care of the missing data

In [3]:
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `SimpleImputer` is its replacement; it always imputes column-wise, which is
# what the former `axis = 0` argument requested, and it takes the missing-value
# marker as `np.nan` rather than the string 'NaN'.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

# Learn the column means of columns 1 and 2, then fill their missing entries in place.
imputer = imputer.fit(independent_variables[:, 1:3])
independent_variables[:, 1:3] = imputer.transform(independent_variables[:, 1:3])

Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace the values of column 0 with integer codes.
encoder_country = LabelEncoder()
independent_variables[:, 0] = encoder_country.fit_transform(independent_variables[:, 0])

# `OneHotEncoder(categorical_features = ...)` was removed in scikit-learn 0.22,
# so the column selection is delegated to a ColumnTransformer: column 0 is
# expanded into dummy columns that are placed first, and the remaining columns
# pass through unchanged after them.
# `sparse_threshold = 0` forces a dense result (this replaces the former
# `.toarray()` call), and the final cast restores a float array.
onehotencoder = ColumnTransformer(
    transformers = [('country', OneHotEncoder(), [0])],
    remainder = 'passthrough',
    sparse_threshold = 0
)
independent_variables = np.asarray(onehotencoder.fit_transform(independent_variables), dtype = float)

# The dependent variable does not need a OneHotEncoder: the machine learning
# model will know it is a category, and there is no order between the two classes.
encoder_purchased = LabelEncoder()
dependent_variables = encoder_purchased.fit_transform(dependent_variables)


Setting the print format and displaying the independent variables

In [5]:
# Print floats in fixed notation with two decimals and without scientific
# notation, then show the independent variables as the cell output.
np.set_printoptions(suppress = True, floatmode = 'fixed', precision = 2)
independent_variables

array([[    1.00,     0.00,     0.00,    44.00, 72000.00],
       [    0.00,     0.00,     1.00,    27.00, 48000.00],
       [    0.00,     1.00,     0.00,    30.00, 54000.00],
       [    0.00,     0.00,     1.00,    38.00, 61000.00],
       [    0.00,     1.00,     0.00,    40.00, 63777.78],
       [    1.00,     0.00,     0.00,    35.00, 58000.00],
       [    0.00,     0.00,     1.00,    38.78, 52000.00],
       [    1.00,     0.00,     0.00,    48.00, 79000.00],
       [    0.00,     1.00,     0.00,    50.00, 83000.00],
       [    1.00,     0.00,     0.00,    37.00, 67000.00]])

Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split identical on every run.
(independent_train, independent_test,
 dependent_train, dependent_test) = train_test_split(
    independent_variables,
    dependent_variables,
    test_size = 0.2,
    random_state = 0
)

Feature Scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# Scaling the dummy variables is not always needed, since some interpretability
# is lost, but it is done here for the sake of the tutorial.
scaler_independent = StandardScaler()

# The scaler is fitted on the training set only; the same fitted scaler is then
# applied to both the training set and the test set.
scaler_independent.fit(independent_train)
independent_train = scaler_independent.transform(independent_train)
independent_test = scaler_independent.transform(independent_test)