# Principal Component Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from 'pca.csv' (a path relative to the working directory).
df = pd.read_csv('pca.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
df

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every column of df; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(df)
scaled_X = scaler.transform(df)
# Bare expression so the notebook shows the scaled array.
scaled_X

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the standardized data to its first two principal components.
pca = PCA(n_components=2)
# fit_transform learns the components from scaled_X and returns the data
# projected onto them (one column per component).
principal_components = pca.fit_transform(scaled_X)

In [None]:
# Scatter the samples in the plane spanned by the first two components.
first_pc = principal_components[:, 0]
second_pc = principal_components[:, 1]
plt.scatter(x=first_pc, y=second_pc)
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')

The number of components we requested

In [None]:
# n_components is the value that was passed to the PCA constructor.
pca.n_components

In [None]:
# Fitted principal axes: one row per component, one column per input feature.
pca.components_

In [None]:
# Tabulate the loadings: one row per principal component, one column per
# original feature. The row labels are derived from the fitted components
# instead of being hard-coded as ['PC1', 'PC2'], so the cell keeps working
# if `pca` currently holds a different number of components.
component_labels = [f'PC{i}' for i in range(1, len(pca.components_) + 1)]
df_comp = pd.DataFrame(pca.components_, index=component_labels, columns=df.columns)
# Bare expression so the notebook renders the table.
df_comp

In [None]:
# Wide, short figure: df_comp has one row per component and one column per feature.
plt.figure(figsize=(10, 2))
# annot=True writes each loading value inside its heatmap cell.
sns.heatmap(df_comp, annot=True)

We can see that the first component explains 33% of the variance in the data and the second explains 22%. Together they explain 55% of the variance.

In [None]:
# Fraction of the total variance captured by each fitted component.
pca.explained_variance_ratio_

In [None]:
# Total share of the variance captured by the fitted components together.
pca.explained_variance_ratio_.sum()

So if we set n_components equal to the number of features, we of course expect to explain 100% of the variance.

In [None]:
# The original DataFrame has 8 features, so n_components=8 keeps every component.
pca_8 = PCA(n_components=8)
# The bare fit call is the last expression, so the notebook shows the fitted estimator.
pca_8.fit(scaled_X)

In [None]:
# Per-component share of the total variance when all 8 components are kept.
pca_8.explained_variance_ratio_

In [None]:
# Total share of the variance captured when all 8 components are kept.
pca_8.explained_variance_ratio_.sum()

Below we show how much of the variance is explained for different values of n_components. This makes it easy to see what percentage of the variance the components explain for each **n_components** value.

In [None]:
# Cumulative share of variance explained by the first n components, n = 1..8.
#
# Each entry of explained_variance_ratio_ is that component's fraction of the
# total variance, so a single fit with all 8 components plus a running sum
# gives the same curve as refitting PCA once per n.
# This also leaves the 2-component `pca` from the earlier cells untouched:
# reusing that name as the loop variable made those cells fail (2 row labels
# vs. 8 components) when re-run after this one.
max_components = 8
explained_variance = list(
    np.cumsum(PCA(n_components=max_components).fit(scaled_X).explained_variance_ratio_)
)

In [None]:
# Plot cumulative explained variance against the number of components kept.
# The x values are derived from the length of explained_variance so they
# always line up with it, instead of repeating the sweep bound by hand.
component_counts = range(1, len(explained_variance) + 1)
plt.plot(component_counts, explained_variance)
plt.xlabel("Number of Components")
# Trailing semicolon suppresses the Text repr in the notebook output.
plt.ylabel("Variance Explained");