# Preprocessing Data for Machine Learning

***

In [None]:
# Data frames
import pandas as pd

# Machine Learning
import sklearn as sk

# Nearest neighbors
import sklearn.neighbors as ne

# Preprocessing
import sklearn.preprocessing as pre

# Decomposition
import sklearn.decomposition as dec

# Model selection
import sklearn.model_selection as mod

# Pipelines
import sklearn.pipeline as pipe

# Statistical test
import scipy.stats as ss

# Plots
import matplotlib.pyplot as plt

# Statistical plots
import seaborn as sns

: 

In [None]:
# Load the penguins data set from the CSV file 'penguins.csv' (relative path)
df = pd.read_csv('penguins.csv')

# Show the data frame (a bare expression on the last line of a cell is displayed)
df

: 

In [None]:
# Remove every row that contains at least one missing value (NA/NaN)
df = df.dropna(axis=0, how='any')

# Display the cleaned data frame
df

: 

In [None]:
# Independent variables: the four numeric measurement columns
X = df.loc[:, [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]]

# Display the feature frame
X

: 

In [None]:
# First row - .loc selects by index label, so this is the row labelled 0
X.loc[0]

: 

In [None]:
# Last row - .loc selects by index label, not by position.
# NOTE(review): assumes the row labelled 342 survived dropna() and really is
# the final row of X - confirm against the data, or use X.iloc[-1] instead.
X.loc[342]

: 

In [None]:
# Euclidean distance, step 1: subtract the two rows feature by feature
X.loc[0].sub(X.loc[342])

: 

In [None]:
# Euclidean distance, step 2: square each per-feature difference
X.loc[0].sub(X.loc[342]).pow(2)

: 

In [None]:
# Euclidean distance, step 3: add the squared differences together
X.loc[0].sub(X.loc[342]).pow(2).sum()

: 

In [None]:
# Euclidean distance, final step: square root of the summed squares
X.loc[0].sub(X.loc[342]).pow(2).sum() ** 0.5

: 

In [None]:
# Add body mass in kilograms (grams divided by 1000) as a new column
df['body_mass_kg'] = df['body_mass_g'].div(1000.0)

# Display the updated data frame
df

: 

In [None]:
# Independent variables, this time with body mass expressed in kilograms
X = df.loc[:, [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_kg',
]]

# Display the feature frame
X

: 

In [None]:
# Euclidean distance (kg data), step 1: per-feature differences
X.loc[0].sub(X.loc[342])

: 

In [None]:
# Euclidean distance (kg data), step 2: squared per-feature differences
X.loc[0].sub(X.loc[342]).pow(2)

: 

In [None]:
# Euclidean distance (kg data), final step: root of the summed squares
X.loc[0].sub(X.loc[342]).pow(2).sum() ** 0.5

: 

# Tests for Normality

https://statistics.laerd.com/spss-tutorials/testing-for-normality-using-spss-statistics.php

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html

***

In [None]:
# Histogram of the body_mass_g column across all rows of df
df['body_mass_g'].hist()

: 

In [None]:
# Shapiro-Wilk test on body mass for all penguins.
# Per the SciPy documentation linked above, the null hypothesis is that the
# sample was drawn from a normal distribution; the result holds the test
# statistic and the p-value.
ss.shapiro(df['body_mass_g'])

: 

In [None]:
# Keep only the rows whose species is Gentoo
df_gentoos = df[df['species'].eq('Gentoo')]

# Histogram of body mass for the Gentoos alone
df_gentoos['body_mass_g'].hist()

: 

In [None]:
# Shapiro-Wilk normality test, this time on Gentoo body mass only
ss.shapiro(df_gentoos['body_mass_g'])

: 

# Scaling Data

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

https://stackoverflow.com/questions/55073423/should-i-normalize-or-standardize-my-dataset-for-knn

https://stackoverflow.com/questions/55601928/apply-multiple-standardscalers-to-individual-groups

***

In [None]:
# Read the penguins CSV afresh
df = pd.read_csv('penguins.csv')

# Discard any row with a missing value
df = df.dropna()

# Display the data frame
df

: 

In [None]:
# Independent variables: the four numeric measurement columns
X = df.loc[:, [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]]

# Display the feature frame
X

: 

In [None]:
# Create a standard scaler (per the scikit-learn docs linked above, it
# standardises each feature by removing the mean and scaling to unit variance)
scaler = pre.StandardScaler()

# Show
scaler

: 

In [None]:
# Fit the scaler to the data, i.e. learn each column's mean and variance
scaler.fit(X)

: 

In [None]:
# Show the per-column means and variances learned by fit()
scaler.mean_, scaler.var_

: 

In [None]:
# Cross-check the scaler's statistics against pandas' summary of X.
# NOTE: describe() reports the sample standard deviation (ddof=1), whereas
# StandardScaler's var_ is the population variance (ddof=0), so std**2 from
# this table will be slightly larger than var_.
X.describe()

: 

In [None]:
# Apply the fitted scaler to X, giving the standardised values as an array
X_transformed = scaler.transform(X)

# Show
X_transformed

: 

In [None]:
# Column means of the scaled data (axis=0 reduces over the rows)
X_transformed.mean(axis=0)

: 

In [None]:
# Column standard deviations of the scaled data (axis=0 reduces over the rows)
X_transformed.std(axis=0)

: 

In [None]:
# Squared per-feature differences between the first and last rows of the
# scaled array (positional indexing: 0 is the first row, -1 the last)
(X_transformed[0] - X_transformed[-1])**2

: 

In [None]:
# Original column names of the feature frame X
X.columns

: 

In [None]:
# Re-create a data frame from the scaled array, re-using the original column
# names. The row index of X is passed through as well, so that each scaled
# row keeps the same label as the matching row of X (dropna() can leave gaps
# in the labels, so the default 0..n-1 index would not line up with them).
df_X_trans = pd.DataFrame(X_transformed, columns=X.columns, index=X.index)

# Show
df_X_trans

: 

# Dimensions

***

In [None]:
# Look at the full data frame again
df

: 

In [None]:
# Pairwise scatter plots and per-variable distributions, coloured by species.
# The trailing semicolon suppresses the textual repr of the returned object.
sns.pairplot(df, hue='species');

: 

In [None]:
# Blank canvas for the illustration
fig, ax = plt.subplots(figsize=(12, 6))

# Keep only the Gentoo penguins
df_gentoo = df[df['species'] == 'Gentoo']

# The two hand-picked penguins, looked up once by index label
pt_red = df_gentoo.loc[259]
pt_green = df_gentoo.loc[327]

# All Gentoos: bill length against flipper length, as black dots
ax.plot(df_gentoo['bill_length_mm'], df_gentoo['flipper_length_mm'], 'k.')

# First picked penguin, marked with a red cross
ax.plot(pt_red['bill_length_mm'], pt_red['flipper_length_mm'], 'rx')

# Second picked penguin, marked with a green cross
ax.plot(pt_green['bill_length_mm'], pt_green['flipper_length_mm'], 'gx')

# Horizontal leg: the distance between the two points in the x direction,
# drawn at the red point's flipper length
ax.hlines(
    [pt_red['flipper_length_mm']],
    pt_red['bill_length_mm'],
    pt_green['bill_length_mm'],
    'b',
)

# Vertical leg: the distance in the y direction, drawn at the green point's
# bill length (the trailing semicolon suppresses the cell's text output)
ax.vlines(
    [pt_green['bill_length_mm']],
    pt_red['flipper_length_mm'],
    pt_green['flipper_length_mm'],
    'b',
);

: 

# Principal Component Analysis

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

***

In [None]:
# Create a new PCA instance that keeps two components
pca = dec.PCA(n_components=2)

: 

In [None]:
# The X data that PCA will be fitted on
X

: 

In [None]:
# Fit PCA to the data.
# NOTE(review): if the cells are run top to bottom, X here is the unscaled
# feature frame - confirm that is intended, since the scaled data is fitted
# separately further down.
pca.fit(X)

: 

In [None]:
# Show the fraction of the variance explained by each component
pca.explained_variance_ratio_

: 

In [None]:
# Project X onto the fitted principal components
X_pca = pca.transform(X)

# Show
X_pca

: 

In [None]:
# Create an empty plot
fig, ax = plt.subplots()

# Scatter the first PCA column (index 0) against the second (index 1) as
# black dots; the trailing semicolon suppresses the cell's text output
ax.plot(X_pca[:, 0], X_pca[:, 1], 'k.');

: 

In [None]:
# Original classifications: a new data frame holding only species and sex
df_pca = pd.DataFrame(df[['species', 'sex']])

# Show
df_pca

: 

In [None]:
# Incorporate our PCA variables as two new columns, one per component
# (column 0 and column 1 of X_pca)
df_pca['pca0'] = X_pca[:, 0]
df_pca['pca1'] = X_pca[:, 1]

# Show
df_pca

: 

In [None]:
# Pair plot of the two PCA components, coloured by species
sns.pairplot(df_pca, hue='species')

: 

In [None]:
# The scaled data, as a data frame
df_X_trans

: 

In [None]:
# Start from the original class labels, copied into a frame of their own
df_trans_pca = df[['species', 'sex']].copy()

# Two-component PCA, created and fitted on the scaled features in one go
# (fit() returns the estimator itself)
pca = dec.PCA(n_components=2).fit(df_X_trans)

# Project the scaled features onto the two components
X_trans_pca = pca.transform(df_X_trans)

# Attach each component as its own column
df_trans_pca['pca0'] = X_trans_pca[:, 0]
df_trans_pca['pca1'] = X_trans_pca[:, 1]

# Display the result
df_trans_pca

: 

In [None]:
# Look at the fraction of the variance explained by each component
pca.explained_variance_ratio_

: 

In [None]:
# Pair plot of the PCA components, coloured by species
sns.pairplot(df_trans_pca, hue='species')

: 

# kNN on the scaled data

***

## Before Scaling

In [None]:
# Load the data, dropping any row with a missing value
df = pd.read_csv('penguins.csv').dropna()

# Create a new instance of a classifier with default settings. The `ne` and
# `mod` aliases from the import cell are used rather than `sk.neighbors` /
# `sk.model_selection`, because `import sklearn as sk` alone does not
# guarantee that those submodules have been loaded.
clf = ne.KNeighborsClassifier()

# X data - the raw, unscaled measurements
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]

# y data - the species labels as an array
y = df['species'].to_numpy()

# Run cross-validation with five folds; cv is given explicitly so the fold
# count does not depend on the installed scikit-learn version's default
mod.cross_val_score(clf, X, y, cv=5)

: 

## After Scaling

In [None]:
# Load the data, dropping any row with a missing value
df = pd.read_csv('penguins.csv').dropna()

# X data - left unscaled here; scaling happens inside the pipeline below
X = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]

# y data - the species labels as an array
y = df['species'].to_numpy()

# Use a standard scaler
scaler = pre.StandardScaler()

# Chain the scaler and the classifier into one estimator. Fitting the scaler
# on all of X before cross-validating would let the held-out fold influence
# the means and variances used for training (data leakage); with a pipeline
# the scaler is re-fitted on the training part of each fold only.
# cross_val_score works on clones, so `scaler` itself is left unfitted.
clf = pipe.make_pipeline(scaler, ne.KNeighborsClassifier())

# Run cross-validation with five folds; cv is given explicitly so the fold
# count does not depend on the installed scikit-learn version's default
mod.cross_val_score(clf, X, y, cv=5)

: 

***

## End