# Notebook_11: PCA

The objective of this notebook is to experiment with PCA, find a suitable number of components, and produce visualisations that explain the choice.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import altair as alt
import altair_saver

# Altair configuration for the whole notebook: switch the data transformer to
# 'data_server' and use the 'default' renderer for inline display.
# NOTE(review): 'data_server' presumably keeps chart data out of the notebook
# JSON by serving it from a local process -- confirm against the Altair docs.
alt.data_transformers.enable('data_server')
alt.renderers.enable('default')

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to matplotlib/seaborn figures.
sns.set_style('darkgrid')


In [2]:
# Data prep pipeline
#
# Loads the final dataset, splits it into train/test sets and builds the
# feature matrix that PCA is fitted on.

# Fixed seed: without it the shuffled split differs on every kernel restart,
# so the PCA scores, explained-variance figures and even the number of
# retained components would not be reproducible.
RANDOM_STATE = 42

# The notebook is expected to run from a sub-folder of the project, so the
# parent of the working directory is treated as the project root.
project_root = Path().resolve().parent
data_path = project_root / "Data" / "Final" / "al_data_final.csv"

data = pd.read_csv(data_path)

# "tc_act" is the target column; every remaining column is used as a feature.
X = data.drop(columns="tc_act")
y = data["tc_act"]

# Stratify on column "x" so train and test keep the same distribution of it.
# NOTE(review): this assumes "x" takes a small number of discrete values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=data["x"],
    random_state=RANDOM_STATE,
)

# Must scale data before PCA: the degree-3 polynomial terms live on very
# different scales and PCA is driven by variance.
prep_pipeline = Pipeline(
    [
        ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

# Fit on the training split only; the test split reuses the fitted statistics.
X_train_prepared = prep_pipeline.fit_transform(X_train)
X_test_prepared = prep_pipeline.transform(X_test)

In [3]:
# Directory under the project root intended for exported figures.
fig_path = project_root.joinpath('Figures')

In [4]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain at least that share of the variance (here 95%), so the number of
# components is decided by the data rather than fixed in advance.
pca = PCA(n_components = 0.95)

# Fit on the prepared training features and project them onto the components.
X_train_pca = pca.fit_transform(X_train_prepared)

In [5]:
# Wrap the PCA scores in a DataFrame with one "PC<k>" column per component.
# The names are generated from the actual number of columns because
# n_components=0.95 lets PCA choose how many components to keep; a fixed
# four-name mapping would leave any extra components with integer labels.
pca_df = pd.DataFrame(
    X_train_pca,
    columns=[f"PC{i}" for i in range(1, X_train_pca.shape[1] + 1)],
)

In [6]:
# Display the principal-component scores of the training set
# (pandas truncates the middle rows of long frames in the rendered output).
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4
0,-1.660795,-4.916908,0.142667,-0.668685
1,5.097335,4.318547,4.797293,2.335847
2,-2.065288,-5.205475,-0.285547,-0.031620
3,2.108529,4.477549,-3.645370,0.209733
4,1.736276,4.238479,-3.663988,0.838737
...,...,...,...,...
139,2.567030,2.175674,1.829584,-0.460349
140,-0.348875,-2.530451,4.538816,-1.134709
141,5.143048,1.709141,-2.153726,0.085257
142,-3.425587,-1.444043,-1.276328,-0.312171


In [7]:
# Share of the total variance explained by each retained component.
pca.explained_variance_ratio_

array([0.44780667, 0.31381839, 0.16236881, 0.03975169])

In [8]:
# Plot explained variance
exp_variance_df = pd.DataFrame({'pc': [f"PC{i}" for i in range(1, 5)], 
'exp_var': pca.explained_variance_ratio_,
'cumulative': np.cumsum(pca.explained_variance_ratio_)})

In [9]:
# Display the per-component and cumulative explained-variance table.
exp_variance_df

Unnamed: 0,pc,exp_var,cumulative
0,PC1,0.447807,0.447807
1,PC2,0.313818,0.761625
2,PC3,0.162369,0.923994
3,PC4,0.039752,0.963746


In [10]:
# Explained-variance chart: bars show each component's share, the red line
# shows the running total.
y_title = "% Explained Variance"
# The values are fractions in [0, 1]; format the ticks as percentages so they
# agree with the "%" in the axis title.
percent_axis = alt.Axis(format='%')

base = alt.Chart(exp_variance_df).encode(alt.X('pc:N', title = 'Principal Component'))

bar = base.mark_bar().encode(
    alt.Y('exp_var:Q', title = y_title, axis = percent_axis)
)

# Give the line layer the same title as the bars: the two layers share one
# y axis, and an untitled layer would append its field name ("cumulative")
# to the merged axis title.
line = base.mark_line(color = 'red').encode(
    alt.Y('cumulative:Q', title = y_title, axis = percent_axis)
)

chart = (bar + line).properties(width = 250, title = 'PCA Results')

# Save into the project's Figures directory (fig_path) instead of a path
# relative to the current working directory, creating it if necessary.
fig_path.mkdir(parents = True, exist_ok = True)

# Temporarily switch back to the 'default' data transformer for the export.
# NOTE(review): presumably the PNG export needs the data embedded in the chart
# spec rather than served by 'data_server' -- confirm.
with alt.data_transformers.enable('default'):
    # str(...) because the output format is inferred from the file name.
    chart.save(str(fig_path / "pca_exp_var.png"), scale_factor = 6.0)



In [11]:
# Render the layered explained-variance chart inline.
chart
