# PCA with Python - alcohol dataset

In [None]:
# Import the necessary libraries for importing and viewing the data
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px # Shift + Enter

## Data Import: alcohol dataset (training)

In [None]:
# Load the 'training' sheet of the Excel workbook.
# header=0    -> the first row supplies the column names
# index_col=0 -> the first column supplies the row names
# NOTE: the path is an absolute path on the author's machine; adjust it locally.
data = pd.read_excel(
    "C:/Users/Eugenio_Py/Desktop/Notebooks/datasets/alcohol.xlsx",
    sheet_name='training',
    header=0,
    index_col=0,
)  # write your comments here

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
data.info()

In [None]:
# Show the first 5 rows of the training data
data.head(n=5)

In [None]:
# View the dimensions of the dataset as a (number of rows, number of columns) tuple
data.shape

In [None]:
# Display the whole DataFrame (the notebook renders the last expression of a cell)
data

In [None]:
# Check for missing values: count the null entries in each column
data.isnull().sum()

In [None]:
# Count how many samples belong to each category of 'Class'.
# These are absolute counts; pass normalize=True to get fractions instead.
data["Class"].value_counts()

In [None]:
# Summary statistics of the categorical variable 'Class'
data.Class.describe()

In [None]:
# Seaborn boxplot: distribution of one variable (here Isobutanol) within each Class.
# Change `y` to look at another variable (e.g. Acetaldehyde).
sns.set_style("whitegrid")
sns.boxplot(data=data, x='Class', y='Isobutanol');

In [None]:
# Kernel Density Estimation (KDE) plot of Isobutanol, one stacked density per Class
sns.kdeplot(
    data=data,
    x="Isobutanol",
    hue="Class",
    multiple="stack",
);

In [None]:
# Interactive 2D scatter plot (plotly): Acetaldehyde vs Isobutanol, coloured by Class
import plotly.express as px
fig = px.scatter(
    data,
    x="Acetaldehyde",
    y="Isobutanol",
    color="Class",
    hover_name="Class",
    log_x=False,
)
fig.show()

In [None]:
# Interactive 3D scatter plot (plotly) of three variables, coloured by Class
fig = px.scatter_3d(
    data,
    x="Acetaldehyde",
    y="Isobutanol",
    z="Propanol",
    color="Class",
)
fig.show()

## Data pre-processing (scaling, autoscaling)

In [None]:
# View the shape (rows, columns) again before selecting the numerical columns
data.shape

In [None]:
# Keep only the continuous variables: positional columns 1 to 12 of `data`
# (the end of the slice, 13, is excluded).
# NOTE(review): assumes column 0 holds 'Class' and columns 1-12 are all numeric — confirm against the sheet
X = data.iloc[:, 1:13]
X

In [None]:
# Plot all the raw data together: after transposing, the variables lie along
# the x-axis and every sample is drawn as one line.
X.transpose().plot(figsize=(20, 9), legend=False);

In [None]:
# Rebind X to a NumPy array holding the same positional columns (1 to 12),
# because the autoscaling step below is applied to array-like input.
# NOTE(review): `data.values` is taken on the whole frame, so if 'Class' is a
# text column the resulting array presumably has dtype object — confirm
X = data.values[:, 1:13]
X

In [None]:
# Store the names of the variables (column labels at positions 1 to 12)
# NOTE(review): the name `vars` shadows Python's built-in vars(); renaming it
# would require changing every later cell that reads it
vars = data.columns[1:13]

In [None]:
# Display the variable names stored in `vars`
vars

In [None]:
# AUTOSCALE the data column by column: every variable is mean-centred and
# divided by its standard deviation (the defaults of scale, written out here).
from sklearn.preprocessing import scale
Xaut = scale(X, axis=0, with_mean=True, with_std=True)
Xaut

In [None]:
# Plot the two datasets (non-scaled on top, autoscaled below) in one figure.
# The arrays are transposed so that each sample is drawn as one line across
# the variables.
import matplotlib
import matplotlib.pyplot as plt
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15, 15))
ax1.plot(vars, X.T)
ax2.plot(vars, Xaut.T)
ax1.set_title('Original data')
ax2.set_title('Autoscaling')

In [None]:
# Parallel coordinates plot of the raw data: one line per sample, coloured by Class
from matplotlib.pyplot import figure
figure(dpi=120, figsize=(16, 6))
fig = pd.plotting.parallel_coordinates(
    frame=data,
    class_column='Class',
    color=('red', 'blue'),
);
fig;

In [None]:
# Display the column labels at positions 1 to 12 (the variables used for autoscaling)
data.columns[1:13]

In [None]:
# Wrap the autoscaled array in a DataFrame that reuses the variable names and
# the row names of `data`, then append the Class column.
data_autosc = pd.DataFrame(
    Xaut,
    columns=data.columns[1:13],
    index=data.index,
)
data_autosc['Class'] = data.Class
data_autosc.head()

In [None]:
# Exercise: build the autoscaled frame WITHOUT copying the row names of `data`
# and see what happens to the Class column.
# The assignment of data.Class is aligned on the row labels, so any label of
# the default 0..n-1 index that does not exist in data.index ends up as NaN.
# A separate variable is used so that the correctly labelled `data_autosc`
# built earlier is not overwritten and the following plot keeps working.
data_autosc_noindex = pd.DataFrame(Xaut, columns=data.columns[1:13])
data_autosc_noindex['Class'] = data.Class
data_autosc_noindex.head()

In [None]:
# Parallel coordinates plot of the autoscaled data, coloured by Class
from matplotlib.pyplot import figure
figure(dpi=120, figsize=(16, 6))
fig = pd.plotting.parallel_coordinates(
    frame=data_autosc,
    class_column='Class',
    color=('red', 'blue'),
);
fig;

## PCA on autoscaled dataset

In [None]:
# For simplicity, rebind X to the autoscaled array Xaut.
# From here on X no longer refers to the raw training values.
X = Xaut
X

In [None]:
# Create the PCA model (scikit-learn) with 10 Principal Components (PCs) as a
# first choice; the model is only configured here, it is fitted in the next cell.
# `decomposition` is also used by the later cell that rebuilds the model.
from sklearn import decomposition
pca = decomposition.PCA(n_components=10)
pca

In [None]:
# Fit the PCA model on X and keep the transformed data (the scores) that
# fit_transform returns
Principal_components=pca.fit_transform(X)

See https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html for more information.

In [None]:
# Scree plot: proportion of variance explained by each principal component
import matplotlib
import matplotlib.pyplot as plt
PC_values = np.arange(1, pca.n_components_ + 1)  # component numbers 1..n
plt.plot(PC_values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.title('Scree Plot')
plt.show()

How many PCs would you choose?

In [None]:
# Print the variance explained by each principal component
explained_variance = pca.explained_variance_
print("Explained variance: ", explained_variance)

In [None]:
# Print the proportion of the total variance explained by each principal component
explained_ratio = pca.explained_variance_ratio_
print("Proportion of Variance Explained : ", explained_ratio)

In [None]:
# Running total of the explained variance ratio: entry k is the proportion of
# variance explained by the first k+1 components together
out_sum = pca.explained_variance_ratio_.cumsum()
print("Cumulative Prop. Variance Explained: ", out_sum)

In [None]:
# Plot the cumulative proportion of variance explained against the number of
# components (PC_values and plt come from the scree plot cell)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.plot(PC_values, cumulative_ratio, 'ro-', linewidth=2)
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Prop. Variance Explained')
plt.title('Scree Plot')
plt.show()

In [None]:
# Scree plot of the explained variance of each principal component
# (PC_values and plt come from the earlier scree plot cell).
# The values are plotted as they are: taking a running total (np.cumsum) would
# show the cumulative variance, which is not what the y-axis label describes.
plt.plot(PC_values, pca.explained_variance_, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.show()

In [None]:
# Build the final PCA model with the chosen number of components (3 here) and
# fit it on X; this replaces both `pca` and `Principal_components` from the
# exploratory 10-component model
pca = decomposition.PCA(n_components=3)
Principal_components=pca.fit_transform(X)

In [None]:
# Put the scores in a DataFrame with one column per principal component
scores = pd.DataFrame(Principal_components, columns=['PC1', 'PC2', 'PC3'])
print(scores)

In [None]:
# Scores for the graphs: label the rows like `data` and put the Class column
# in front of the PC columns
scores = pd.DataFrame(
    Principal_components,
    columns=['PC1', 'PC2', 'PC3'],
    index=data.index,  # same row names as the dataset 'data'
)
# concatenate along the column direction (axis=1)
scores = pd.concat([data.Class, scores], axis=1)
print(scores)

In [None]:
# Loadings DataFrame: pca.components_ transposed, so that rows are the
# variables and columns are the PCs. The variable names are also copied into
# an 'Attributes' column so they can be used as a plot axis.
loadings = pd.DataFrame(
    pca.components_.T,
    index=data.columns[1:13],
    columns=['PC1', 'PC2', 'PC3'],
)
loadings["Attributes"] = loadings.index
loadings

In [None]:
# Scores plot (PC1 vs PC2) coloured by Class, with the sample name on hover
# and black zero lines on both axes
import plotly.express as px
zero_line_style = dict(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
fig = px.scatter(
    scores,
    x="PC1",
    y="PC2",
    color="Class",
    hover_data={'Sample': (scores.index)},
)
fig.update_xaxes(**zero_line_style)
fig.update_yaxes(**zero_line_style)
fig.update_layout(title_text='Scores Plot', width=800, height=600)
fig.show()

In [None]:
# Interactive 3D scores plot (PC1, PC2, PC3) coloured by Class
fig = px.scatter_3d(
    scores,
    x="PC1",
    y="PC2",
    z="PC3",
    color="Class",
)
fig.show()

In [None]:
# 2D loadings plot (PC1 vs PC2) with the variable name on hover and black
# zero lines on both axes. Add text="Attributes" to px.scatter to print the
# variable names next to the points.
zero_line_style = dict(zeroline=True, zerolinewidth=1, zerolinecolor='Black')
fig = px.scatter(
    loadings,
    x="PC1",
    y="PC2",
    hover_data={'Variables': (loadings.index)},
)
fig.update_xaxes(**zero_line_style)
fig.update_yaxes(**zero_line_style)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Loadings Plot', width=800, height=600)
fig.show()

In [None]:
# Line plot of a single loading vector (PC1) across the variables
import matplotlib.pyplot as plt
fig = px.line(
    loadings,
    x="Attributes",
    y="PC1",
)
fig.show()

In [None]:
# Seaborn boxplot of one variable (Isoamyl_acetal) within each Class
sns.set_style("whitegrid")
sns.boxplot(data=data, x='Class', y='Isoamyl_acetal');

## Import test dataset for prediction

In [None]:
# Load the 'test' sheet of the same workbook: the first row supplies the
# column names (header=0), the first column the row names (index_col=0)
test = pd.read_excel(
    "C:/Users/Eugenio_Py/Desktop/Notebooks/datasets/alcohol.xlsx",
    sheet_name='test',
    header=0,
    index_col=0,
)
test.head()

In [None]:
# Pre-process the test set with the mean and standard deviation of the
# TRAINING set. Autoscaling the test samples with their own statistics would
# put them on a different scale from the data the PCA model was built on.
# The raw training values are read from `data` because X was rebound to the
# autoscaled array earlier in the notebook.
train_raw = data.values[:, 1:13].astype(float)
train_mean = train_raw.mean(axis=0)
train_std = train_raw.std(axis=0)  # population std (ddof=0), as used by sklearn's scale
train_std[train_std == 0] = 1.0  # constant columns are only centred, never divided by zero
Xtest = (test.values[:, 1:13].astype(float) - train_mean) / train_std
Xtest

In [None]:
# Project the new samples onto the PCA model that was fitted on the training
# data. transform() only applies the existing model; fit_transform() would
# re-estimate the components from the test samples and overwrite `pca`.
test_components = pca.transform(Xtest)

In [None]:
# Scores of the test samples: label the rows like `test` and put the Class
# column in front of the PC columns
test_scores = pd.DataFrame(
    test_components,
    columns=['PC1', 'PC2', 'PC3'],
    index=test.index,  # same row names as the dataset 'test'
)
# concatenate along the column direction (axis=1)
test_scores = pd.concat([test.Class, test_scores], axis=1)
print(test_scores)

In [None]:
# Add a numeric version of Class (the integer code of each category) so the
# test samples can be coloured when they are drawn on the scores plot
test_scores['Class2'] = pd.Categorical(test_scores["Class"]).codes
test_scores

In [None]:
# Scores plot of the training samples with the projected test samples drawn
# on top as an extra trace named "Test"
import plotly.express as px
import plotly as plotly
import plotly.graph_objs as go

test_trace = go.Scatter(
    x=test_scores["PC1"],
    y=test_scores["PC2"],
    mode='markers',
    marker=dict(color=test_scores.Class2, size=8),  # colour by the numeric class code
    name="Test",
)
fig = px.scatter(
    scores,
    x="PC1",
    y="PC2",
    color="Class",
    hover_data={'Sample': (scores.index)},
)
fig.add_trace(test_trace)
fig.update_layout(hovermode='x unified')
fig.show()


## Hotelling's T2 vs Q-residuals

In [None]:
# PCA scores: the three PC columns of `scores` (everything after 'Class')
T = scores[['PC1', 'PC2', 'PC3']]
T

In [None]:
# PCA loadings: the three PC columns of `loadings` (without 'Attributes')
P = loadings[['PC1', 'PC2', 'PC3']]
P

In [None]:
# Error array: the autoscaled data minus its reconstruction from the scores
# and the loadings (T times P transposed)
reconstruction = np.dot(T, P.T)
Err = Xaut - reconstruction
Err

In [None]:
# Q-residuals: for every sample (row of Err), the sum of its squared errors
squared_errors = Err ** 2
Q = np.sum(squared_errors, axis=1)
Q

In [None]:
# Hotelling's T-squared: each score column is divided by its standard
# deviation, squared, and the result is summed over the PCs of every sample
score_std = np.std(T, axis=0)
Tsq = np.sum((T / score_std) ** 2, axis=1)
Tsq

If necessary, install the **scipy** library.

In [None]:
# pip install scipy   (uncomment and run this cell only if scipy is not installed)

In [None]:
# Set the confidence level and the number of components of the PCA model
conf = 0.95
ncomp = 3

from scipy.stats import f

# Confidence limit for T-squared from the ppf of the F distribution:
#     T2_lim = A * (n - 1) / (n - A) * F(conf; A, n - A)
# with A = ncomp and n = number of training samples. The denominator degrees
# of freedom must be n - A to be consistent with the multiplying factor.
n_samples = Xaut.shape[0]
Tsq_conf = (
    f.ppf(q=conf, dfn=ncomp, dfd=n_samples - ncomp)
    * ncomp * (n_samples - 1) / (n_samples - ncomp)
)
Tsq_conf

In [None]:
# Estimate the confidence limit for the Q-residuals as the empirical quantile
# of the training Q values: a fraction `conf` of the samples lies at or below
# it. Taking the quantile directly avoids searching downwards from max(Q) in
# fixed steps of 1, which is too coarse when the residuals are small numbers.
Q_conf = np.quantile(Q, conf)

Q_conf

In [None]:
# Collect T2, the Q-residuals and the Class of every training sample in one
# DataFrame that carries the row names of `data`
hot_q_data = pd.DataFrame(
    {'T2': Tsq, 'Qres': Q, 'Class': data.Class},
    index=data.index,
)
hot_q_data


In [None]:
# Hotelling's T2 vs Q-residuals plot with the two confidence limits drawn as
# red dotted lines. Add color="Class" to px.scatter to colour the samples.
limit_line_style = dict(line_dash="dot", line_color='Red')
fig = px.scatter(
    hot_q_data,
    x="T2",
    y="Qres",
    hover_data={'Sample': (hot_q_data.index)},
)
fig.add_hline(y=Q_conf, **limit_line_style)
fig.add_vline(x=Tsq_conf, **limit_line_style)
fig.update_traces(textposition='top center')
fig.update_layout(title_text="Hotelling's T2 vs Q-residuals", width=800, height=600)
fig.show()