# Data Preprocessing Tools

#### DISCLAIMER:
Before using this template, make sure your data is structured in the following manner:
1. Your LABEL, i.e., dependent variable is the LAST column
2. Your relevant features are all other columns
3. You have handled all RELEVANT MISSING DATA properly by filling in values (Missing data that was considered ok to be deleted, will be deleted here.)
4. In the "Taking Care of Missing Data" section, the columns that will have their empty cells "fit" with a fill value are fixed! Change them, case by case!
5. In the "Encoding Categorical Data - Independent variable", the column to change is fixed: Column 0! Change it, case by case!

## Importing the libraries

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [14]:
dataset = pd.read_csv("Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
X = dataset.iloc[:,:-1].values # Get features (all except last column)
y = dataset.iloc[:,-1].values  # Get dependent variable (only the last column)

In [16]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [17]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [18]:
# Replace the missing data with the average of all other values
# REMEMBER: pd.NA cannot be used in scikit-learn. Use np.NaN
# REMEMBER: Depending on the use case, it may make sense to use the MEDIAN of a GROUP,
#           rather than the whole dataset or rather than the average!
# --------------------------------------------------------------------------------------

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy = "mean") # ONLY THE NUMERICAL COLUMNS (Age and Salary)

imputer.fit(X[:,1:3])                  # Fit imputer on X
X[:,1:3] = imputer.transform(X[:,1:3]) # Impute all missing values on X

In [20]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [None]:
# The labelEncoder is used for categories with no more than two different values
# such as "yes or no", "male or female" ...0 or 1...

# The one hot Encoder is used for categories with more than two different values
# such as countries, cities

### Encoding the Independent Variable

In [21]:
# ONE HOT ENCODING
# REMEMBER: pd.get_dummies() to one hot encode a variable! Used in ML project in Pandas training!
#    feat1 = features.select_dtypes("float").apply(lambda x: stats.zscore(x)) # Normalize all numerical variables
#    dummies = pd.get_dummies(features.[COL.NAME] ])                          # Get dummies for string variables
#    features = pd.concat([feat1, dummies], axis = 1)                         # Concatenate
# ----------------------------------------------------------------------------------------------------------------

# WARNING
# In independent variables, NEVER use LABELENCONDING. That is reserved for dependent variables.
# Otherwise, you lead the ML algorithm into believing an order exists in the data!

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode column 0 (string) with a OneHotEncoder. All other columns are "passed through" (alternative would be dropping them!)
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')

# Fit ct and transform the data on X (does not return a numpy array!)
# However, ML algorithms expect numpy arrays. Use np.array()
X = np.array(ct.fit_transform(X))

In [23]:
# FEATURES DATAFRAME
print(X) # First few (3 in this case), NEW columns are the string column encoded!

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [24]:
# LABEL DATAFRAME
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y) # Encode labels

In [25]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [26]:
# In PySpark
# train,test = df.randomSplit([0.7,0.3])
from sklearn.model_selection import train_test_split

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [50]:
# Double-Check the splits
print(X_train.size)
print(X_test.size)
print(y_train.size)
print(y_test.size)

40
10
8
2


## Feature Scaling

In [34]:
# Feature Scaling, i.e., normalizing the variables, comes AFTER the split!
# Because the test set is meant to be a "brand new" set, you are not supposed to "treat" it before.
# If you do it before, the test set will be normalized with mean and std values of the ENTIRE SET!

In [35]:
# Take into account you may not need this in several ML models!
# For example, in Multiple Linear Regression, all coeficients 'xn' are multiplied by weights 'bn'

In [None]:
# STANDARDISATION (Values between -3 and +3)
# (x-mean(x)) / stddev(x)

# NORMALISATION (Values between 0 and 1)
# (x-min(x)) / (max(x)-min(x))

# Normalisation is recommended WHEN you have a normal distribution in most of your features
# Standardisation works at all times and always improves the model training process!
# Be pragmatic, use standardisation!

In [37]:
from sklearn.preprocessing import StandardScaler

In [38]:
# ONLY IF YOU HAVE NO DUMMY VARIABLES! YOU DO NOT APPLY STANDARDISATION TO THOSE!
# sc = StandardScaler().fit(X_train).transform(X_train)

# In this example, we skip the first 3 columns (one hot encoded)
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])

In [39]:
# If you apply the fit() method on test data, you are changing the scaler from generic model to fit only that data!
# You will only use the fitted scaler to transform new data, i.e., standardise

X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [42]:
X_train

array([[0.0, 0.0, 1.0, 0.003747263739951552, -0.14740926029993145],
       [0.0, 0.0, 1.0, -1.480169177280821, -1.242449479670851],
       [1.0, 0.0, 0.0, 1.352762210122472, 1.3688002742136494],
       [0.0, 0.0, 1.0, 0.10867064845859215, -0.9055140275567218],
       [0.0, 1.0, 0.0, 1.622565199398976, 1.7057357263277784],
       [1.0, 0.0, 0.0, -0.4009572201748046, -0.40011084938552827],
       [0.0, 1.0, 0.0, -1.0754646933660648, -0.7370463014996573],
       [1.0, 0.0, 0.0, -0.1311542308983005, 0.3579939178712621]],
      dtype=object)

In [43]:
X_test

array([[1.0, 0.0, 0.0, 0.8131562315694638, 0.7791632330139234],
       [0.0, 1.0, 0.0, 0.27355025301645564, 0.08657369255710287]],
      dtype=object)