# Data Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from scipy import stats

import utils

## Loading Dataset

In [2]:
# Load the training inputs and the matching labels through the project-local `utils` module.
# NOTE(review): both objects are used as pandas DataFrames in the cells below
# (e.g. `.describe()`, `["result"]`) — confirm against utils.loadTrainingData().
train_input, train_output = utils.loadTrainingData()

## Training Input Dataset

### Dataset Description

In [None]:
# Bare last expression: the notebook renders the training inputs as a rich table.
train_input
# train_input.shape

In [None]:
# Per-column summary statistics of the training inputs.
train_input.describe()

**capuchon_insertion**'s standard deviation is quite low ($0.024425$).

In [None]:
# Histograms of the training-input columns, drawn with a 20x10 figure size.
train_input.hist(figsize=(20,10))

In [None]:
# train_input.boxplot(column=train_input.columns.tolist()[1:], figsize=(20,10))

# One boxplot per column, each in its own figure; the first column of the
# frame is skipped.
for feature in train_input.columns[1:]:
    plt.figure()
    train_input.boxplot(column=[feature])

### Features Correlation

In [None]:
# Pairwise correlation matrix between the columns of the training inputs.
train_input.corr()

In [None]:
# plt.matshow(train_input.corr())
# plt.show()

In [None]:
# Seaborn pairplot over the training inputs (grid of pairwise relationships).
sns.pairplot(train_input)
plt.show()

### NA Values

In [None]:
# Number of missing (NaN) entries in each column of the training inputs.
train_input.isna().sum()

**capuchon_insertion** might not be a relevant feature: more than 50% of the population has a missing (NaN) value ($\frac{18627}{34515} \approx 0.5397$).  
The NaN values could also be filled with the mean value, as shown below...  
If every/most defective individuals have a NaN value for it, then we can eliminate this feature.   

In [None]:
# Replace the missing capuchon_insertion values by the column mean.
# fillna returns a new Series here, so train_input itself is left untouched.
capuchon_insertion = train_input["capuchon_insertion"]
mean = capuchon_insertion.mean()
capuchon_insertion_no_nan = capuchon_insertion.fillna(mean)
capuchon_insertion_no_nan

In [None]:
# Keep only the input rows whose label marks a defect (result == 1), then count
# the missing values per column among those rows.
#
# The rows are selected by *position*. The previous version passed index
# labels to .iloc, which only picks the right rows when the index is exactly
# 0..n-1.
# NOTE(review): assumes train_input and train_output list the individuals in
# the same row order — the PCA scatter plots rely on the same assumption.
defect_mask = (train_output["result"] == 1).to_numpy()
defect_index = np.flatnonzero(defect_mask).tolist()  # row positions of defective individuals
train_input_defect = train_input.iloc[defect_index, :].copy()
train_input_defect.isna().sum()

Among the 305 defective individuals, 110 of them do not have a **capuchon_insertion** value ($\frac{110}{305} \approx 0.3606$).  
Most of the defective individuals have a **capuchon_insertion** value, maybe it is worth keeping it...  

### PCA

#### Unscaled

In [None]:
# Create dataset for PCA: every column except the identifier and
# capuchon_insertion (drop() returns a new frame, train_input is untouched).
train_input_pca = train_input.drop(columns=["id", "capuchon_insertion"], errors="ignore")

# PCA on the raw (unscaled) features, keeping every component
pca = PCA()
pca.fit(train_input_pca)

# Scree plot: share of variance explained by each component.
# The x positions are derived from the fitted model instead of a hard-coded
# component count, so the cell keeps working if the feature set changes.
component_ids = range(len(pca.explained_variance_ratio_))
plt.xticks(component_ids)
plt.xlabel("Principal component (0-based)")
plt.ylabel("Explained variance ratio")
plt.plot(component_ids, pca.explained_variance_ratio_)

In [None]:
# Cumulative share of variance explained by the first k components of `pca`.
cumsum = np.cumsum(pca.explained_variance_ratio_)

# x positions follow the number of fitted components (no hard-coded count).
component_ids = range(len(cumsum))
plt.xticks(component_ids)
plt.xlabel("Principal component (0-based)")
plt.ylabel("Cumulative explained variance ratio")
plt.plot(component_ids, cumsum)

In [None]:
# Fit a PCA that keeps every component and project the data onto it.
# The component count is taken from the data rather than hard-coded, so the
# cell does not break when a feature is added or removed.
n_components = min(train_input_pca.shape)
pca_model = PCA(n_components=n_components).fit(train_input_pca)
X_pca = pca_model.transform(train_input_pca)

# Number of components
nb_comp = pca_model.components_.shape[0]

# Index of the most important feature on EACH component i.e. largest absolute loading
most_important = np.abs(pca_model.components_).argmax(axis=1).tolist()

# Features names (recorded by scikit-learn because the model was fitted on a DataFrame)
initial_feature_names = pca_model.feature_names_in_

# Name of the most important feature on EACH component
most_important_names = [initial_feature_names[idx] for idx in most_important]

# Map 'PC1', 'PC2', ... to the name of the dominant feature of that component
dic = {'PC{}'.format(rank + 1): name for rank, name in enumerate(most_important_names)}

dic

In [None]:
# 3D scatter of the first three principal components of the unscaled PCA,
# one point per individual, coloured by label (1 -> red, 0 -> green).
fig, ax = plt.subplots(figsize=(20, 10), subplot_kw={"projection": "3d"})

colors = {1: 'red', 0: 'green'}
point_colors = train_output["result"].map(colors)
xs, ys, zs = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter(xs, ys, zs, c=point_colors)

# Axis labels
ax.set(xlabel='Dim 1', ylabel='Dim 2', zlabel='Dim 3')

plt.show()

In [None]:
# 2D scatter of the first two principal components of the unscaled PCA.
# Label 1 is drawn opaque red, label 0 faint green (alpha 0.1).
fig, ax = plt.subplots(figsize=(20, 10))

colors = {1: 'red', 0: 'green'}
alphas = {1: 1, 0: 0.1}
labels = train_output["result"]
ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=labels.map(colors),
    alpha=labels.map(alphas),
)

# Axis labels
ax.set(xlabel='Dim 1', ylabel='Dim 2')

plt.show()

#### Scaled

In [None]:
# Create dataset for PCA: every column except the identifier and
# capuchon_insertion (drop() returns a new frame, train_input is untouched).
train_input_pca_scale = train_input.drop(columns=["id", "capuchon_insertion"], errors="ignore")

# Scale data.
# The scaler output is assigned explicitly: StandardScaler(copy=False) does not
# guarantee in-place scaling for a DataFrame, so discarding the return value
# (as was done before) could leave the data unscaled. The result is wrapped
# back into a DataFrame so that PCA still records the feature names.
standard_scaler = StandardScaler()
train_input_pca_scale = pd.DataFrame(
    standard_scaler.fit_transform(train_input_pca_scale),
    columns=train_input_pca_scale.columns,
    index=train_input_pca_scale.index,
)

# PCA on the standardised features, keeping every component
scaled_pca = PCA()
scaled_pca.fit(train_input_pca_scale)

# Scree plot; x positions follow the number of fitted components.
component_ids = range(len(scaled_pca.explained_variance_ratio_))
plt.xticks(component_ids)
plt.xlabel("Principal component (0-based)")
plt.ylabel("Explained variance ratio")
plt.plot(component_ids, scaled_pca.explained_variance_ratio_)

In [None]:
# Cumulative share of variance explained by the first k components of `scaled_pca`.
scaled_cumsum = np.cumsum(scaled_pca.explained_variance_ratio_)

# x positions follow the number of fitted components (no hard-coded count).
component_ids = range(len(scaled_cumsum))
plt.xticks(component_ids)
plt.xlabel("Principal component (0-based)")
plt.ylabel("Cumulative explained variance ratio")
plt.plot(component_ids, scaled_cumsum)

In [None]:
# Fit a PCA that keeps every component and project the data onto it.
# The component count is taken from the data rather than hard-coded, so the
# cell does not break when a feature is added or removed.
n_components = min(train_input_pca_scale.shape)
scaled_pca_model = PCA(n_components=n_components).fit(train_input_pca_scale)
X_scaled_pca = scaled_pca_model.transform(train_input_pca_scale)

# Number of components
nb_comp = scaled_pca_model.components_.shape[0]

# Index of the most important feature on EACH component i.e. largest absolute loading
most_important = np.abs(scaled_pca_model.components_).argmax(axis=1).tolist()

# Features names (only available when the model was fitted on a DataFrame)
initial_feature_names = scaled_pca_model.feature_names_in_

# Name of the most important feature on EACH component
most_important_names = [initial_feature_names[idx] for idx in most_important]

# Map 'PC1', 'PC2', ... to the name of the dominant feature of that component
scaled_dic = {'PC{}'.format(rank + 1): name for rank, name in enumerate(most_important_names)}

scaled_dic

In [None]:
# X_scaled_pca[:, :3]

# 3D scatter of the first three principal components of the scaled PCA,
# one point per individual, coloured by label (1 -> red, 0 -> green).
fig, ax = plt.subplots(figsize=(20, 10), subplot_kw={"projection": "3d"})

colors = {1:'red', 0:'green'}
point_colors = train_output["result"].map(colors)
xs, ys, zs = X_scaled_pca[:, 0], X_scaled_pca[:, 1], X_scaled_pca[:, 2]
ax.scatter(xs, ys, zs, c=point_colors)

# Axis labels
ax.set(xlabel='Dim 1', ylabel='Dim 2', zlabel='Dim 3')

plt.show()


In [None]:
# 2D scatter of the first two principal components of the scaled PCA.
# Label 1 is drawn opaque red, label 0 faint green (alpha 0.1).
fig, ax = plt.subplots(figsize=(20, 10))

colors = {1: 'red', 0: 'green'}
alphas = {1: 1, 0: 0.1}
labels = train_output["result"]
ax.scatter(
    X_scaled_pca[:, 0],
    X_scaled_pca[:, 1],
    c=labels.map(colors),
    alpha=labels.map(alphas),
)

# Axis labels
ax.set(xlabel='Dim 1', ylabel='Dim 2')

plt.show()

### Removing Outliers

In [None]:
# Feature matrix: every column except the identifier and capuchon_insertion.
feature_columns = train_input.columns[~train_input.columns.isin(["id", "capuchon_insertion"])]
train_input_ = train_input[feature_columns]
X_train, X_test, y_train, y_test = train_test_split(train_input_, train_output["result"], test_size = 0.3, random_state = 123)

# Scale data (MLP is very sensitive to scaling).
# This is done exactly once: the block used to be duplicated, which re-fitted
# `scaler` on already-standardised data and left it unusable on raw inputs.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Find outliers: a row is an outlier when at least one of its features lies
# more than `threshold` standard deviations away from that feature's mean.
# The z-scores are computed column-wise on the feature columns of train_input
# itself; the cell no longer depends on an `input_header` name that is not
# defined anywhere in this notebook.
# NOTE(review): capuchon_insertion is left out here, as it is dropped from the
# cleaned dataset anyway — confirm that the former `input_header` listing was
# not meant to flag rows on it.
threshold = 3
z_scores = np.abs(stats.zscore(train_input_.to_numpy(dtype=float), axis=0))
outlier_mask = (z_scores > threshold).any(axis=1)

# Row positions of the outliers (unique and sorted by construction)
outliers_no_duplicate = np.flatnonzero(outlier_mask)
print("{} of {} rows flagged as outliers".format(outliers_no_duplicate.size, len(train_input_)))

# Remove the outlier rows from inputs and labels alike.
# A boolean mask is inverted here; inverting the integer position array with
# `~` is a bitwise NOT (i -> -i-1) and selected rows counted from the end of
# the frame instead of removing anything.
# NOTE(review): assumes train_input and train_output share the same row order.
train_input_remove = train_input_.loc[~outlier_mask].copy()
train_output_remove = train_output.loc[~outlier_mask].copy()

# One boxplot per remaining feature. `id` is already gone at this point, so no
# leading column has to be skipped.
for column in train_input_remove.columns:
    plt.figure()
    train_input_remove.boxplot([column])

## Training Output Dataset

In [None]:
# Bare last expression: the notebook renders the training labels as a rich table.
train_output
# train_output.shape

In [None]:
# train_output_bool = train_output.copy() # Deep copy
# train_output_bool["result"] = train_output_bool["result"].astype(bool)
# train_output_bool

In [None]:
# Number of individuals per value of the "result" label.
train_output["result"].value_counts()

The result classes are imbalanced: the defect class is heavily under-represented in the population ($\frac{305}{34515} \approx 0.0088$, i.e. less than 1%).