In this notebook, we will use PCA to create new features that improve our model's performance on the Ames dataset, and we will also use PCA to detect outliers.

In [5]:
#import essential libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Notebook-wide Matplotlib defaults: whitegrid theme, automatic tight layout,
# and bold, slightly enlarged axis labels and titles.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.autolayout': True,
    'axes.labelweight': 'bold',
    'axes.labelsize': 'large',
    'axes.titleweight': 'bold',
    'axes.titlesize': 14,
    'axes.titlepad': 10,
})

def apply_pca(X, standardize=True):
    """Fit a full PCA on ``X`` and return the model, the scores and the loadings.

    Args:
        X: pandas DataFrame of features; its column labels become the row
            labels of the loadings table. Presumably all columns are numeric,
            since they are averaged and passed to PCA.
        standardize: If true, every column is centred on its mean and divided
            by its standard deviation before the PCA is fitted.

    Returns:
        A tuple ``(pca, X_pca, loadings)`` where

        * ``pca`` is the fitted ``PCA`` estimator,
        * ``X_pca`` is a DataFrame of component scores with columns
          ``PC1, PC2, ...`` and the same index as ``X``,
        * ``loadings`` is a DataFrame with one row per original feature and
          one column per principal component.
    """
    # Standardize so that features on large scales do not dominate.
    # NOTE(review): a constant column has zero standard deviation and would
    # turn into NaN here -- confirm callers drop such columns first.
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    # Create principal components (all of them; no n_components limit).
    pca = PCA()
    scores = pca.fit_transform(X)

    # Convert to dataframe. Keeping X's index lets the scores be joined back
    # onto, or used to look up rows in, the original data.
    component_names = [f"PC{i + 1}" for i in range(scores.shape[1])]
    X_pca = pd.DataFrame(scores, columns=component_names, index=X.index)

    # Create loadings: components_ is (n_components, n_features), so transpose
    # to get features as rows and components as columns.
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=X.columns,
    )
    return pca, X_pca, loadings


def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Args:
        pca: Fitted estimator exposing ``n_components_`` and
            ``explained_variance_ratio_``.
        width: Figure width, passed to the figure as ``figwidth``.
        dpi: Figure resolution, passed to the figure as ``dpi``.

    Returns:
        The array of two Axes: bar chart of explained variance ratio on the
        left, cumulative curve on the right.
    """
    # Create figure with two side-by-side panels.
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance per component.
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))

    # Cumulative variance; a leading (0, 0) point anchors the curve at the origin.
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))

    # Honour the caller-supplied size and resolution.
    fig.set(figwidth=width, dpi=dpi)
    return axs

