# PCA Recipe

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import colors
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the iris data set as pandas objects and split it into the feature
# table and the per-sample class labels.
dataset = load_iris(as_frame=True)
features, target = dataset["data"], dataset["target"]
# Bare expression kept last so the notebook renders the feature table.
features

## 1. Scale Data

In [None]:
# Standardize every feature column and keep the original column names.
scaler = StandardScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
)
# The manual PCA recipe below works on the two petal measurements only.
petal_columns = ["petal length (cm)", "petal width (cm)"]
scaled_features_2d = scaled_features[petal_columns]
# Summary statistics are shown as a sanity check of the scaling.
scaled_features_2d.describe()


## 2. Calculate Covariance Matrix

In [None]:
# Pairwise covariance of the two standardized petal features (a 2x2 table).
# NOTE(review): pandas' `cov` normalizes by N-1 by default, so the diagonal
# is presumably slightly above 1 if the scaler used the population standard
# deviation -- confirm if exact unit variance matters.
covariance = scaled_features_2d.cov()
# Bare expression so the notebook renders the matrix.
covariance

## 3. Calculate Eigenvectors

In [None]:
# Eigen-decomposition of the covariance matrix: each eigenvalue is the
# variance along one principal axis, and the matching axis is the
# corresponding *column* of `eigenvectors`.
eigenvalues, eigenvectors = np.linalg.eig(covariance)
index_1st = np.argmax(eigenvalues)
# NOTE: the name `index_2st` is kept unchanged because later cells use it.
index_2st = np.argmin(eigenvalues)
first_axis = eigenvectors[:, index_1st]
second_axis = eigenvectors[:, index_2st]

print(f"First principal component's variance is: {eigenvalues[index_1st]}")
print("Along new axis w_1 =\n",
      f"{first_axis[0]} * petal length\n",
      f"+ {first_axis[1]} * petal width")

cmap = colors.ListedColormap(["#1D4D4A", "#ED5654", "#764674"])
plt.scatter(scaled_features_2d["petal length (cm)"],
            scaled_features_2d["petal width (cm)"],
            c=target, cmap=cmap, label="Scaled Data")

# Draw each principal axis as a segment through the origin, reaching from
# -2 to +2 along the respective eigenvector.
axis_extent = np.linspace(-2, 2, 2)
# Plotting the 1st principal component
plt.plot(axis_extent * first_axis[0],
         axis_extent * first_axis[1],
         label="1st Principal Component", color="#1D4D4A")
# Plotting the 2nd principal component
plt.plot(axis_extent * second_axis[0],
         axis_extent * second_axis[1],
         label="2nd Principal Component", color="#ED5654")

# The plotted values are standardized, so they are no longer in centimetres.
plt.xlabel("Petal Length (standardized)")
plt.ylabel("Petal Width (standardized)")
# Equal scaling on both axes so the orthogonal principal axes also look
# perpendicular in the figure.
plt.axis("equal")

plt.legend()
plt.savefig("1_and_2_PCs.png", dpi=200)
plt.show()

## 4. Selecting First k Components

In [None]:
# Share (in percent) of the total variance of the two components that is
# captured by the first principal component.
explained_percent = (
    eigenvalues[index_1st] * 100
    / (eigenvalues[index_1st] + eigenvalues[index_2st])
)
print("The first principal component contains",
      f"{explained_percent:.1f}",
      "percent of the total variance\n",
      "->The first component is sufficient.")

## 5. Construct Projection Matrix
The projection matrix is already contained in `eigenvectors`: its columns are the principal axes.

## 6. Project Data onto Principal Axes
For visualization, both axes are displayed even though only one would be needed.

In [None]:
# Arrange the eigenvector columns so that the first column is the axis with
# the largest eigenvalue. np.linalg.eig does not promise any ordering, and
# the axis labels below assume x = 1st and y = 2nd principal component.
ordered_axes = eigenvectors[:, [index_1st, index_2st]]

projected_features = pd.DataFrame()
# Multiply the scaled features with the (ordered) eigenvector matrix.
# The original feature names are reused as column names; after the
# projection they hold the 1st and 2nd principal component scores.
projected_features[scaled_features_2d.columns] = np.matmul(
    scaled_features_2d.to_numpy(), ordered_axes
)

cmap = colors.ListedColormap(["#1D4D4A", "#ED5654", "#764674"])
plt.scatter(projected_features["petal length (cm)"],
            projected_features["petal width (cm)"],
            c=target, cmap=cmap, label="Projected Data")

# Each axis label spells out the linear combination that defines that axis.
plt.xlabel(f"{eigenvectors[:, index_1st][0]:.2f} * Petal Length + {eigenvectors[:, index_1st][1]:.2f} * Petal Width")
plt.ylabel(f"{eigenvectors[:, index_2st][0]:.2f} * Petal Length + {eigenvectors[:, index_2st][1]:.2f} * Petal Width")
plt.savefig("PCA_after_transform.png", dpi=200)
plt.show()

## Perform PCA with sklearn

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with all components on the full set of standardized features.
pca = PCA().fit(scaled_features)

# Bar chart of the fraction of variance explained by each component.
component_names = ["PC 1", "PC 2", "PC 3", "PC 4"]
plt.bar(x=component_names,
        height=pca.explained_variance_ratio_,
        color="#266662")

plt.savefig("PCA_explained_variance.png", dpi=200)

In [None]:
# Only the first two components are plotted; see the explained-variance bar
# chart for how much of the feature-space variance they capture.
transformed_features = pca.transform(scaled_features)

# Scatter of the first two principal component scores, coloured by class.
plt.scatter(transformed_features[:, 0], transformed_features[:, 1], c=target, cmap=cmap)

plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.savefig("PCA_full_iris.png", dpi=200)
plt.show()