In [None]:
# 1 - import libraries
# 2 - import data
# 3 - assign features and dependent variable
# 4 - deal with nulls
# 5 - encode categorical data and labels if needed
# 6 - split into train and test datasets

In [1]:
# Data Preprocessing
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [3]:
# Import data
# NOTE: this is an absolute, machine-specific location; point DATA_PATH at your own copy of Data.csv.
# A raw string keeps every backslash of the Windows path literal, so no part of it is read as an escape sequence.
DATA_PATH = r"C:\Users\Sasha\Desktop\Py\Git\UdemyExerciseML\Machine Learning A-Z (Codes and Datasets)\Part 1 - Data Preprocessing\Section 2 -------------------- Part 1 - Data Preprocessing --------------------\Python\Data.csv"
df = pd.read_csv(DATA_PATH)  # load the CSV into a DataFrame

In [4]:
# Check data: preview the first rows of the loaded frame (head() shows 5 rows by default)
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [13]:
df.info() # check for nulls: lists each column's non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [10]:
# Build X (the feature matrix) and y (the dependent variable) as NumPy arrays.
# iloc takes the row selector first and the column selector second.
X = df.iloc[:, :-1].to_numpy()  # every row, every column except the last one
y = df.iloc[:, -1].to_numpy()  # every row, last column only

In [11]:
print(X) # features: every column except the last one

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [12]:
print(y) # dependent variable: the last column

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [14]:
# When dealing with nulls: if they affect only a small percentage of the rows, remove those rows; if there are quite a few, you might want to replace the missing values with the column average (mean) instead

In [15]:
# SimpleImputer fills every missing entry with a per-column statistic, here the column mean
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")  # what counts as missing, and what replaces it
# Columns 1:3 are the numeric ones (Age, Salary). fit learns each column's mean and
# transform returns a new matrix with the gaps filled, so the result is assigned back into X.
# NOTE: the means are learned from all rows, before the train/test split done further down.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [16]:
print(X) # check the features after imputation: the nan entries are now column means

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [17]:
# Encode the categorical (text) data using one-hot encoding. There is no ordinal relationship between the values in the country column, so each country has to be encoded as a separate feature rather than as France = 1, Spain = 2 and Germany = 3; numbering them that way would make it look as if the countries had a numerical order

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 (the country); remainder="passthrough" keeps the
# other, non-encoded columns in the output instead of dropping them.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
# fit_transform returns the encoded matrix; force it into a NumPy array and store it back in X (features)
X = np.array(ct.fit_transform(X))

In [19]:
print(X) # features after one-hot encoding: the country column is now three 0/1 columns

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [20]:
# Encode y (the dependent variable) with label encoding so it becomes 0s and 1s
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y)  # learn the distinct labels present in y
y = le.transform(y)  # binary outcome: 0 represents 'No' and 1 represents 'Yes'

In [21]:
print(y) # dependent variable after label encoding

[0 1 0 0 1 1 0 1 0 1]


In [22]:
# Feature scaling: before or after the train/test split? After. If we scale before splitting the data, averages computed from the test-set portion of the data leak into the training set, which will compromise our model

In [23]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # hold out 20% of the rows as the test set
    random_state=1,  # fixed random seed, so every run produces the same split
)

In [33]:
print(X_train) # training features produced by the split

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [25]:
print(X_test) # test features produced by the split

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [26]:
print(y_train) # training labels produced by the split

[0 1 0 0 1 1 0 1]


In [27]:
print(y_test) # test labels produced by the split

[0 1]


In [28]:
# Feature scaling ensures that features are used proportionally and that none of them dominates the others; it is not required for all models
# standardisation will get most values to lie roughly between -3 and 3, normalisation will get values in the range 0 to 1
# normalisation is recommended when we have a normal distribution in all of our features, standardisation works well all of the time (safer to use)

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Columns 0-2 hold the one-hot dummy data (country name) and must not be scaled,
# so only columns 3 onward are passed to the scaler.
sc.fit(X_train[:, 3:])  # fit only computes the scaling statistics, from the training set
X_train[:, 3:] = sc.transform(X_train[:, 3:])  # transform is the step that actually updates the data
# The scaler is never fitted on the test matrix: for the model to work correctly on
# new data, the test set is transformed with the scaler learned from the training set.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [35]:
print(X_train) # training features after scaling columns 3 onward

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [36]:
print(X_test) # test features after scaling with the scaler fitted on the training set

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
