### 3. Model Processing
This stage involves the application of a tuned clustering algorithm to the data. The clustering algorithm will group similar data points together, and evaluate the homogeneity of the classes of the clusters (the similarity of professors in the same departments). This step allows us to identify patterns and relationships within the data.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import SpectralClustering, AffinityPropagation, KMeans, MiniBatchKMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits import mplot3d
from matplotlib.animation import FuncAnimation
import numpy as np
from IPython.display import HTML
import warnings
# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings(action='ignore')

# Seed passed as random_state to the estimators in the cells below
random_seed = 0

# Load the combined data set: three numeric columns are clustered,
# while the 'College' column is kept aside for the reference plots
df = pd.read_csv("data/combined_data.csv")
feature_columns = ['Quality Score', 'Difficulty Score', 'GPA']
y = df['College']
X = df[feature_columns]

# Silhouette Scorer function
def silhouette_scorer(estimator, X, y=None):
    """Score a clustering estimator by the silhouette of its own labelling.

    The estimator is (re)fitted on ``X`` via ``fit_predict`` and the resulting
    labels are scored with ``silhouette_score`` in the space of ``X`` as
    passed in (for a pipeline this is the input *before* its transformers).

    Parameters
    ----------
    estimator : object exposing ``fit_predict(X)``
    X : array-like of samples to cluster and score
    y : ignored; accepted so the scorer also works when a target is supplied

    Returns
    -------
    float
        The silhouette score, or ``-1.0`` (the worst possible value) when the
        labelling is degenerate.
    """
    labels = estimator.fit_predict(X)

    # The silhouette is only defined for 2 <= n_labels <= n_samples - 1;
    # silhouette_score raises otherwise. Rank such candidates last instead
    # of letting the scorer fail.
    n_labels = len(np.unique(labels))
    if n_labels < 2 or n_labels >= len(labels):
        return -1.0

    return silhouette_score(X, labels)

# Function to update the angle of view for viewing it in 3d
def rotate(angle):
    """Turn the current 3D axes to azimuth ``angle`` (elevation fixed at 30).

    Intended as the per-frame callback of ``FuncAnimation``. It acts on
    whichever axes object is bound to the module-level name ``ax`` at call
    time. When that name does not exist, or is ``None``, a message is printed
    and nothing else happens.

    Parameters
    ----------
    angle : azimuth, in degrees, forwarded to ``ax.view_init``
    """
    # Look the name up dynamically: a bare reference to ``ax`` raises
    # NameError when no 3D plot has been created yet, which made the
    # fallback message unreachable in that situation.
    current_ax = globals().get("ax")
    if current_ax is None:
        print("No plot available to rotate.")
        return

    current_ax.view_init(elev=30, azim=angle)


#### Plot of Colleges for Reference

In [None]:
# Reference view: quality vs. difficulty, coloured by college (GPA not shown)
college_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=y,
    palette='bright',
    alpha=0.2,
)

# Legend is anchored outside the plotting area, to the right
college_axes.legend(bbox_to_anchor=(1.75, 1), loc='upper right')
college_axes.set(
    title="2D plot for College For Reference",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D representation of the college reference plot
# Filtering out rows with NaN values in y so every point has a college to colour by
valid_indices = y.notna()
X_valid = X[valid_indices]
y_valid = y[valid_indices]

# Encode the colleges once. The integer codes index into the category list,
# so tick k on the colour bar must be labelled with categories[k]. Using
# y_valid.unique() (order of appearance) could attach names to the wrong colours.
college_categories = y_valid.astype('category')
college_codes = college_categories.cat.codes
college_names = college_categories.cat.categories

# Set up the 3D figure; ``ax`` is the name the rotate() callback looks for
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# The scatter plot with three axes, coloured by college code
sc = ax.scatter(X_valid['Quality Score'], X_valid['Difficulty Score'], X_valid['GPA'],
                c=college_codes, cmap='tab10', alpha=0.3)

# Adding axis labels
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_zlim(0, 4)
ax.set_title("3D plot for College For Reference")

# Adding color bar whose ticks are named after the matching category
cbar = plt.colorbar(sc, ax=ax, label='College')
cbar.set_ticks(range(len(college_names)))
cbar.set_ticklabels(college_names)

plt.show()

# Creating an animation for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360, 1), interval=50)

# To display in Jupyter Notebook, we are going to use HTML display
HTML(ani.to_jshtml())



#### Spectral Clustering

In [None]:
# Spectral clustering on standardised, PCA-projected features
spectral_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('sc', SpectralClustering(random_state=random_seed)),
])

# Candidate settings explored by the grid search
param_grid = {
    "pca__n_components": [2, 3],  # keep two or all three principal components
    "sc__n_clusters": list(range(3, 11)),
    "sc__eigen_solver": ['arpack', 'lobpcg'],
    "sc__affinity": ['nearest_neighbors', 'rbf'],
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(
    estimator=spectral_pipeline,
    param_grid=param_grid,
    scoring=silhouette_scorer,
)
grid_search.fit(X)


In [None]:
# Spectral clustering with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
spectral_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('sc', SpectralClustering(
        n_clusters=3,
        eigen_solver='lobpcg',
        affinity='nearest_neighbors',
        random_state=random_seed,
    )),
]
best_spectral = Pipeline(steps=spectral_steps)
output = best_spectral.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by cluster label
spectral_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
spectral_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
spectral_axes.set(
    title="2D plot for Spectral Clustering Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D figure; ``ax`` is the name the rotate() callback acts on
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# One colour per cluster label, from the same seaborn palette as the 2D plot
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster] for cluster in output]

# Semi-transparent points so dense regions stay readable
scatter = ax.scatter(
    X['Quality Score'],
    X['Difficulty Score'],
    X['GPA'],
    c=colors,
    alpha=0.2,
)

# Axis labels, GPA range and title
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    title="3D plot for Spectral Clustering Results",
)
ax.set_zlim(0, 4)

# Empty scatters act as legend handles, one per palette colour
for cluster_id, rgb in enumerate(palette):
    ax.scatter([], [], [], color=rgb, label=str(cluster_id))
ax.legend(title="Cluster Labels", loc='upper right')

# Displaying the static plot
plt.show()

# Sweep the azimuth from 270 to 359 degrees for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Render the animation as HTML so it displays inside the notebook
HTML(ani.to_jshtml())
 

#### Affinity Propagation

In [None]:
# Affinity propagation on standardised, PCA-projected features
affinity_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ap', AffinityPropagation(random_state=random_seed)),
])

# Candidate settings explored by the grid search
param_grid = {
    "pca__n_components": [2, 3],  # keep two or all three principal components
    "ap__damping": [0.5, 0.6, 0.7, 0.8, 0.9],
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(
    estimator=affinity_pipeline,
    param_grid=param_grid,
    scoring=silhouette_scorer,
)
grid_search.fit(X)

In [None]:
# Affinity propagation with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
affinity_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('ap', AffinityPropagation(damping=0.8, random_state=random_seed)),
]
best_affinity = Pipeline(steps=affinity_steps)
output = best_affinity.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by cluster label
affinity_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
affinity_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
affinity_axes.set(
    title="2D plot for Affinity Propagation Clustering Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# Setting up the 3D figure; ``ax`` is the name the rotate() callback acts on
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Affinity propagation chooses its own number of clusters, so size the palette
# from the labels actually present instead of a hard-coded count. Each sorted
# label gets the next palette colour; for labels 0..k-1 this is exactly
# palette[label], and unexpected labels (e.g. -1) no longer index the
# palette out of range or wrap around.
cluster_ids = np.unique(output)
palette = sns.color_palette("bright", n_colors=len(cluster_ids))
color_by_cluster = dict(zip(cluster_ids, palette))
colors = [color_by_cluster[label] for label in output]

# Scatter plot with appropriate alpha
scatter = ax.scatter(X['Quality Score'], X['Difficulty Score'], X['GPA'], c=colors, alpha=0.2)

# Adding labels
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_zlim(0, 4)
ax.set_title("3D plot for Affinity Propagation Clustering Results")

# Adding legend: empty scatters act as handles, one per cluster found
for cluster_id, color in color_by_cluster.items():
    ax.scatter([], [], [], color=color, label=f"{cluster_id}")

ax.legend(bbox_to_anchor=(1.25, 1), title="Cluster Labels", loc='upper right')

plt.show()

# Creating an animation for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360, 1), interval=50)

# To display in Jupyter Notebook, we are going to use HTML display
HTML(ani.to_jshtml())

#### KMeans

In [None]:
# KMeans on standardised, PCA-projected features
kmeans_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('km', KMeans(random_state=random_seed)),
])

# Candidate settings explored by the grid search
param_grid = {
    "pca__n_components": [2, 3],
    "km__n_clusters": list(range(3, 11)),
    "km__algorithm": ['lloyd', 'elkan'],
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(
    estimator=kmeans_pipeline,
    param_grid=param_grid,
    scoring=silhouette_scorer,
)
grid_search.fit(X)

In [None]:
# KMeans with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
kmeans_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
]
best_kmeans = Pipeline(steps=kmeans_steps)
output = best_kmeans.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by cluster label
kmeans_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
kmeans_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
kmeans_axes.set(
    title="2D plot for KMeans Clustering Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D figure; ``ax`` is the name the rotate() callback acts on
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# One colour per cluster label, from the same seaborn palette as the 2D plot
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster] for cluster in output]

# Semi-transparent points so dense regions stay readable
scatter = ax.scatter(
    X['Quality Score'],
    X['Difficulty Score'],
    X['GPA'],
    c=colors,
    alpha=0.2,
)

# Axis labels, GPA range and title
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    title="3D plot for KMeans Clustering Results",
)
ax.set_zlim(0, 4)

# Empty scatters act as legend handles, one per palette colour
for cluster_id, rgb in enumerate(palette):
    ax.scatter([], [], [], color=rgb, label=str(cluster_id))
ax.legend(bbox_to_anchor=(1.15, 1), title="Cluster Labels", loc='upper right')

plt.show()

# Sweep the azimuth from 270 to 359 degrees for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Render the animation as HTML so it displays inside the notebook
HTML(ani.to_jshtml())

#### Mini Batch KMeans

In [None]:
# Mini-batch KMeans on standardised, PCA-projected features
mini_batch_kmeans_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('mbkm', MiniBatchKMeans(random_state=random_seed)),
])

# Candidate settings explored by the grid search
param_grid = {
    "pca__n_components": [2, 3],
    "mbkm__n_clusters": list(range(3, 11)),
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(
    estimator=mini_batch_kmeans_pipeline,
    param_grid=param_grid,
    scoring=silhouette_scorer,
)
grid_search.fit(X)

In [None]:
# Mini-batch KMeans with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
mbkmeans_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('mbkm', MiniBatchKMeans(n_clusters=3, random_state=random_seed)),
]
best_mbkmeans = Pipeline(steps=mbkmeans_steps)
output = best_mbkmeans.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by cluster label
mbkmeans_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
mbkmeans_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
mbkmeans_axes.set(
    title="2D plot for Mini Batch KMeans Clustering Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D figure; ``ax`` is the name the rotate() callback acts on
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# One colour per cluster label, from the same seaborn palette as the 2D plot
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster] for cluster in output]

# Semi-transparent points so dense regions stay readable
scatter = ax.scatter(
    X['Quality Score'],
    X['Difficulty Score'],
    X['GPA'],
    c=colors,
    alpha=0.2,
)

# Axis labels, GPA range and title
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    title="3D plot for Mini Batch KMeans Clustering Results",
)
ax.set_zlim(0, 4)

# Empty scatters act as legend handles, one per palette colour
for cluster_id, rgb in enumerate(palette):
    ax.scatter([], [], [], color=rgb, label=str(cluster_id))

# Legend is anchored just outside the upper-right corner
ax.legend(bbox_to_anchor=(1.15, 1), title="Cluster Labels", loc='upper right')

plt.show()

# Sweep the azimuth from 270 to 359 degrees for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Render the animation as HTML so it displays inside the notebook
HTML(ani.to_jshtml())

#### Agglomerative Clustering

In [None]:
# Agglomerative clustering on standardised, PCA-projected features
agglo_cluster_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ac', AgglomerativeClustering()),
])

# Candidate settings explored by the grid search
param_grid = {
    "pca__n_components": [2, 3],
    "ac__n_clusters": list(range(3, 11)),
    "ac__linkage": ['ward', 'complete', 'average', 'single'],
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(
    estimator=agglo_cluster_pipeline,
    param_grid=param_grid,
    scoring=silhouette_scorer,
)
grid_search.fit(X)

In [None]:
# Agglomerative clustering with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
agglo_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('ac', AgglomerativeClustering(linkage='single', n_clusters=3)),
]
best_agglo = Pipeline(steps=agglo_steps)
output = best_agglo.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by cluster label
agglo_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
agglo_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
agglo_axes.set(
    title="2D plot for Agglomerative Clustering Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D figure; ``ax`` is the name the rotate() callback acts on
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# One colour per cluster label, from the same seaborn palette as the 2D plot
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster] for cluster in output]

# Semi-transparent points so dense regions stay readable
scatter = ax.scatter(
    X['Quality Score'],
    X['Difficulty Score'],
    X['GPA'],
    c=colors,
    alpha=0.2,
)

# Axis labels, GPA range and title
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    title="3D plot for Agglomerative Clustering Results",
)
ax.set_zlim(0, 4)

# Empty scatters act as legend handles, one per palette colour
for cluster_id, rgb in enumerate(palette):
    ax.scatter([], [], [], color=rgb, label=str(cluster_id))

# Legend is anchored just outside the upper-right corner
ax.legend(bbox_to_anchor=(1.15, 1), title="Cluster Labels", loc='upper right')

plt.show()

# Sweep the azimuth from 270 to 359 degrees for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Render the animation as HTML so it displays inside the notebook
HTML(ani.to_jshtml())

#### Gaussian Mixtures

In [None]:
# Gaussian mixtures on standardised, PCA-projected features
gaussian_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('ga', GaussianMixture(random_state=random_seed))])

# Candidate settings explored by the grid search.
# A single mixture component is deliberately not searched: it assigns every
# sample the same label, and the silhouette score is undefined for fewer than
# two labels, so those candidates could only ever fail to score.
param_grid = {
    "pca__n_components": [2, 3],
    "ga__n_components": [2, 3],
    "ga__covariance_type": ['full', 'tied', 'diag', 'spherical'],
    "ga__init_params": ['kmeans', 'k-means++', 'random', 'random_from_data'],
}

# Rank every combination by silhouette (no target is passed to fit)
grid_search = GridSearchCV(gaussian_pipeline, param_grid, scoring=silhouette_scorer)
grid_search.fit(X)

In [None]:
# Gaussian mixture with fixed hyper-parameters
# (presumably the winners of the grid search above -- confirm before changing)
gaussian_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('ga', GaussianMixture(
        covariance_type='tied',
        n_components=2,
        random_state=random_seed,
    )),
]
best_gaussian = Pipeline(steps=gaussian_steps)
output = best_gaussian.fit_predict(X)

# 2D view of the assignment: quality vs. difficulty, coloured by component label
gaussian_axes = sns.scatterplot(
    x=X['Quality Score'],
    y=X['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
gaussian_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
gaussian_axes.set(
    title="2D plot for Gaussian Mixtures Results",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
plt.show()

In [None]:
# 3D figure; ``ax`` is the name the rotate() callback acts on
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# One colour per component label, from the same seaborn palette as the 2D plot
palette = sns.color_palette("bright", n_colors=2)
colors = [palette[cluster] for cluster in output]

# Semi-transparent points so dense regions stay readable
scatter = ax.scatter(
    X['Quality Score'],
    X['Difficulty Score'],
    X['GPA'],
    c=colors,
    alpha=0.2,
)

# Axis labels, GPA range and title
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    title="3D plot for Gaussian Mixtures Results",
)
ax.set_zlim(0, 4)

# Empty scatters act as legend handles, one per palette colour
for cluster_id, rgb in enumerate(palette):
    ax.scatter([], [], [], color=rgb, label=str(cluster_id))

# Legend is anchored just outside the upper-right corner
ax.legend(bbox_to_anchor=(1.15, 1), title="Cluster Labels", loc='upper right')

plt.show()

# Sweep the azimuth from 270 to 359 degrees for a better view
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Render the animation as HTML so it displays inside the notebook
HTML(ani.to_jshtml())