In [None]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_style("darkgrid")
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
# Subject labels, in the order the columns appear in the file
subjects = ["C1", "C2", "C3", "C4", "C5"]

# Positional column index -> subject label
columns = dict(enumerate(subjects))

# Load the marks (read with header=None, so columns start out as 0..n-1)
# and relabel them with the subject names
df = pd.read_csv("./data.csv", header=None).rename(columns=columns)
df.head()

In [None]:
# Apply grade
def grade(marks: float) -> str:
  """Map a mark to its letter grade.

  Cut-offs are inclusive lower bounds, checked from highest to lowest:
  75 -> "S", 70 -> "A", 65 -> "B", 60 -> "C". Anything that clears none
  of them (including NaN, which fails every comparison) is graded "F".
  """
  cutoffs = ((75, "S"), (70, "A"), (65, "B"), (60, "C"))
  for lower_bound, letter in cutoffs:
    if marks >= lower_bound:
      return letter
  return "F"

# Row-wise mean across the subject columns, then the letter grade for that mean
average_marks = df[subjects].mean(axis=1)
df["Average"] = average_marks
df["Grade"] = average_marks.map(grade)

In [None]:
# Scatter matrix of the subject marks, coloured by letter grade
grade_order = ["S", "A", "B", "C", "F"]  # fixed hue/legend order
sns.pairplot(df[[*subjects, "Grade"]], hue="Grade", hue_order=grade_order)

In [None]:
# Correlation analysis: pairwise correlation between the subject columns
correlation = df[subjects].corr()

# Heatmap
# The theme is applied BEFORE the figure is created: matplotlib reads its
# rcParams when artists are built, so anything created together with the
# axes would otherwise keep the font sizes of whatever theme was active
# earlier in the kernel session.
sns.set(font_scale=3)
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation, annot=True, cmap="Greens", square=True, ax=ax)

In [None]:
# Covariance matrix between subjects.
# rowvar=False tells NumPy that each column (subject) is a variable and
# each row an observation, so no transpose is needed.
covariance_matrix = np.cov(df[subjects].to_numpy(), rowvar=False)
print(covariance_matrix)

In [None]:
# Principal component analysis
# The frame is transposed first, so each subject becomes one row (one PCA
# sample) of the matrix that is standardised and decomposed.
normalized_df = preprocessing.scale(df[subjects].T)

# fit() returns the fitted estimator, so construction and fitting can be chained
pca = PCA(n_components=5).fit(normalized_df)
pca_data = pca.transform(normalized_df)

# Despite its name, this holds the percentage of variance explained by each
# component, rounded to one decimal place.
principle_components = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
columns = [f"PC{number}" for number in range(1, len(principle_components) + 1)]
principle_components_df = pd.DataFrame([principle_components], columns=columns)

In [None]:
# PCA error
principal_features = pca.components_

# Reconstruct using PCA -> Compare with original
# inverse_transform maps the component scores back to feature space
# (scores @ components_ + mean_), one row per subject. A plain
# np.dot(components_.T, pca_data) is NOT equivalent: it contracts the
# component axis of components_ against the sample axis of pca_data and only
# executes because both happen to have length 5 here.
# The result is transposed so reconstructed_data lines up with normalized_df.T.
reconstructed_data = pca.inverse_transform(pca_data).T

# Mean squared error per subject (averaged over the feature axis)
reconstruction_error = np.mean(np.square(normalized_df.T - reconstructed_data), axis=0)
print(reconstruction_error)

In [None]:
# Scree plot
# Set the theme BEFORE creating the axes: axis-label artists are built with
# the axes and keep the font size that was active at that moment, so setting
# the scale afterwards would leave them at the previous theme's size.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(20, 10))

# Pass the frame by keyword and draw on the axes created above rather than
# relying on matplotlib's implicit "current axes".
sns.barplot(data=principle_components_df, ax=ax)
ax.set(xlabel="Principal Component", ylabel="Explained Variance (%)", title="Scree Plot");

In [None]:
# PCA association: one row of component scores per subject
subject_index = pd.Index(subjects, name="Subject")
pca_df = pd.DataFrame(pca_data, index=subject_index, columns=columns)
pca_df

In [None]:
# PCA plot: subjects positioned by their first two component scores
# Set the theme BEFORE creating the axes: axis-label artists are built with
# the axes and keep the font size that was active at that moment.
sns.set(font_scale=2)
fig, ax = plt.subplots(figsize=(20, 10))

# "Subject" is the name of pca_df's index; data is passed by keyword and the
# plot is drawn on the axes created above instead of the implicit current axes.
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Subject", s=10**3, ax=ax)
ax.set(xlabel=f"PC1: {principle_components[0]}%", ylabel=f"PC2: {principle_components[1]}%", title="PCA Plot");