In [17]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [18]:
# Importing the dataset

# A dataset is made of two parts: the features and the dependent variable vector.
# Features (also called independent variables) are the columns used to predict the dependent variable.
dataset = pd.read_csv('Data.csv')

# X = matrix of features (every column except the last one)
# y = dependent variable vector (the last column)
#
# iloc locates rows and columns by integer position: iloc[rows, columns].
# A bare ":" (nothing before or after it) means the entire range, so it selects all the rows;
# ":-1" selects every column but the last, and "-1" selects only the last column.
# .values turns each selection into a NumPy array.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [19]:
# Inspect the raw feature matrix; the nan entries are the missing values
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [20]:
# Inspect the raw dependent variable vector (still the original 'Yes'/'No' strings)
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [21]:
# Taking care of the missing data

# Two common options:
# - with a big dataset and only a few missing values, the incomplete rows can simply be removed
# - otherwise, replace each missing value with the average (mean) of its column (done here)

from sklearn.impute import SimpleImputer

# SimpleImputer completes the missing values:
# - missing_values=np.nan states which entries count as missing (every nan)
# - strategy='mean' states what replaces them (the mean of the column)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The general rule is to select only the numerical columns, so use X[:, 1:3]:
# all the rows, columns 1 and 2, excluding the string column 0.
# fit_transform first computes the mean of each selected column (fit) and then
# replaces the missing values with that mean (transform).
# NOTE(review): the means are computed on all rows, before the train/test split made
# later in this notebook, so the rows that end up in the test set contribute to them.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [22]:
# Check the result: the nan entries in the two numerical columns are replaced by the column means
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [23]:
# Encoding categorical data

# Encode the strings into numbers
# One hot encoding turns a string column into several columns: one column per distinct
# item in the original column, each row holding a binary vector that marks its item.

# Encoding the independent variable
# One hot encode the Country column (column 0).
# We have 3 countries, so we will have 3 columns (5 different countries would give 5 columns).

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ColumnTransformer arguments:
# transformers: a list of (name, transformer, columns) tuples
#   1: a name for the transformation ('encoder')
#   2: the kind of encoding to apply (one hot encoding)
#   3: the indexes of the columns we want to encode ([0])
# remainder='passthrough': keep the columns that are not transformed instead of dropping them
# sparse_threshold=0: always return a dense array. With the default (0.3), a column with many
#   distinct items makes the result a SciPy sparse matrix, and np.array() would then wrap it
#   in a 0-dimensional object array instead of producing the expected 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Connect the column transformer object to the data by using the fit_transform() method
# The train function will expect the features X as a NumPy array, so we transform the result to an np array
X = np.array(ct.fit_transform(X))


In [24]:
# Check the result: the string column is replaced by three one-hot columns, followed by the numerical columns
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [25]:
# Encode the dependent variable

# LabelEncoder maps each distinct label to an integer; here 'No' becomes 0 and 'Yes' becomes 1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# fit_transform learns the distinct labels and returns y encoded as integers
y = le.fit_transform(y)

In [26]:
# Check the result: the labels are now integers ('No' -> 0, 'Yes' -> 1)
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [27]:
# Splitting the dataset into Training set and Test set

# Two separate sets are made:
# - a training set, used to train the machine learning model on existing observations
# - a test set, used to evaluate the performance of the model on new observations;
#   these new observations stand in for the future data you are going to get
# The recommended size of the split is 80% train / 20% test, hence test_size=0.2.
# random_state=1 fixes the seed of the shuffling so the split is the same on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [28]:
# Training features: 8 of the 10 rows (80%)
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [29]:
# Test features: the remaining 2 rows (20%)
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [30]:
# Training labels, in the same row order as X_train
print(y_train)

[0 1 0 0 1 1 0 1]


In [31]:
# Test labels, in the same row order as X_test
print(y_test)

[0 1]


In [32]:
# Feature Scaling

# Feature scaling consists of putting all the features on the same scale,
# to prevent one feature from dominating the others, which would then be neglected by the model.

# Feature scaling has to be applied after splitting the dataset into training set and test set,
# because the test set is supposed to be a brand new set.
# Scaling relies on the mean and the standard deviation of each feature; computed before
# the split, they would include the values of the test set.

# Feature scaling techniques: standardisation and normalisation

# Standardisation
# X_stand = (X - mean(X)) / standard_deviation(X)
# Standardisation gives each feature a mean of 0 and a standard deviation of 1; most values
# end up roughly between -3 and 3, but that range is not guaranteed.
# Standardisation works fine all the time

# Normalisation
# X_norm = (X - min(X)) / (max(X) - min(X))
# Normalisation will result in values between 0 and 1
# Normalisation is recommended when you have normal distribution in most of the features
# NOTE(review): other references recommend standardisation for normally distributed features
# and normalisation otherwise -- confirm before relying on this rule.

# Use standardisation

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Feature scaling is not applied to the dummy variables (columns 0-2) of the matrix of features;
# only the numerical columns, from index 3 onwards, are scaled.

# fit: for each selected feature of X_train, compute its mean and its standard deviation
sc.fit(X_train[:, 3:])

# transform: apply the standardisation formula to every value of those features
X_train[:, 3:] = sc.transform(X_train[:, 3:])

# The test set is like new data, so only the transform method is applied to it.
# Its features must be scaled by the same scaler that was fitted on the training set,
# so that predictions are congruent with the way the model was trained.
X_test[:, 3:] = sc.transform(X_test[:, 3:])
