In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ISLP import load_data

In [None]:
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)

In [None]:
# Read the Auto data set from the working directory.
# (An earlier version of this cell read it from '../../data/Auto.csv'.)
Auto = pd.read_csv(filepath_or_buffer='Auto.csv')
Auto

In [None]:
# Drop the records with missing horsepower (marked with '?'),
# and drop the name column since it's categorical with many levels.

# Renumber the surviving rows 0..n-1: the scaled values and PCA scores computed
# further down are plain positional arrays, so veh_names must share that
# numbering for names and rows to line up.
Auto = Auto[Auto['horsepower'] != '?'].reset_index(drop=True)
veh_names = Auto['name'] # store this for later
# horsepower may have been read as text because of the '?' markers; cast it
# explicitly rather than relying on downstream code to coerce strings.
Auto = Auto[['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
             'acceleration', 'year', 'origin']].astype({'horsepower': float})
Auto

In [None]:
# Centre every column and scale it to unit variance so that variables measured
# on large scales do not dominate the principal components.

scaler = StandardScaler(with_mean=True, with_std=True)
Auto_scaled = scaler.fit(Auto).transform(Auto)
pd.DataFrame(data=Auto_scaled)

In [None]:
# Add the column names back to make the standardized values easier to read.
# The labels are taken from the frame that was scaled instead of being typed
# out again, so they cannot drift from the columns that were actually used.

Auto = pd.DataFrame(Auto_scaled, columns=Auto.columns)
Auto

In [None]:
# Unfitted PCA estimator; n_components=None (the default) keeps every component.
PCA_Auto = PCA(n_components=None)

In [None]:
# Fit the principal components on the standardized Auto frame.
PCA_Auto.fit(Auto)

In [None]:
# Raw loadings array: one row per principal component, one column per input variable.
PCA_Auto.components_

In [None]:
# Clean this up a bit: show the loadings as a table whose columns carry the
# variable names. The names come from the frame the PCA was fitted on, so they
# always match the fitted columns without repeating the list by hand.

components = pd.DataFrame(PCA_Auto.components_,
                          columns=Auto.columns)
components

In [None]:
# We can calculate scores for each component for each record in our data.

pca_scores = PCA_Auto.transform(Auto)

# Build the table from the scores computed above (no second transform) and
# label the columns pc1..pcK from the actual number of components.
display_scores = pd.DataFrame(
    pca_scores,
    columns=['pc%d' % (n + 1) for n in range(pca_scores.shape[1])])

# Attach the vehicle names by position, not by index label. veh_names may
# still carry the row labels of the original CSV (with gaps where records
# were dropped), while the scores are numbered 0..n-1; an index-based merge
# would pair names with the wrong rows and silently drop unmatched ones.
# Assigning an array of a different length raises instead of misaligning.
display_scores['name'] = veh_names.to_numpy()
display_scores

In [None]:
# Biplot: observations plotted on two principal components, with one arrow per
# original variable showing its loading on those same two components.
i, j = 0, 1 # which components
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Use the selected components (i, j) rather than fixed columns 0 and 1 so the
# points stay consistent with the axis labels and arrows when i, j change.
ax.scatter(pca_scores[:,i], pca_scores[:,j])
ax.set_xlabel('PC%d' % (i+1))
ax.set_ylabel('PC%d' % (j+1))
for k in range(PCA_Auto.components_.shape[1]):
    # Arrow from the origin to variable k's loadings on components i and j.
    ax.arrow(0, 0, PCA_Auto.components_[i,k], PCA_Auto.components_[j,k])
    # Label the arrow tip with the variable name.
    ax.text(PCA_Auto.components_[i,k],
            PCA_Auto.components_[j,k],
            Auto.columns[k])

In [None]:
# Variance explained by each principal component, in the order the components were fitted.

PCA_Auto.explained_variance_

In [None]:
# The same variances expressed as ratios of the total variance
# (fractions, not percentages; multiply by 100 for percents).

PCA_Auto.explained_variance_ratio_

In [None]:
%%capture
# Two-panel figure; this cell fills the left panel with the proportion of
# variance explained by each component. The cell's %%capture magic holds the
# output back; the figure is shown by the later cell that ends with `fig`.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
ticks = np.arange(1, PCA_Auto.n_components_ + 1)  # component numbers 1..K
ax = axes[0]
ax.plot(ticks, PCA_Auto.explained_variance_ratio_, marker='o')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained')
ax.set_ylim(0, 1)
ax.set_xticks(ticks)

In [None]:
# Right panel: running total of the variance ratios across components,
# then display the finished two-panel figure.
ax = axes[1]
ax.plot(ticks, np.cumsum(PCA_Auto.explained_variance_ratio_), marker='o')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Cumulative Proportion of Variance Explained')
ax.set_ylim(0, 1)
ax.set_xticks(ticks)
fig

In [None]:
# Show the vehicle names that were set aside before the name column was dropped.
veh_names