### 4. Clustering per College
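This stage runs KMeans clustering on each college's data (Quality Score, Difficulty Score, GPA) and generates 2D and 3D visuals of the clusters to analyze and find patterns in the data.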

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
import numpy as np
from IPython.display import HTML

# Seed handed to every KMeans below (random_state) so cluster labels are repeatable.
random_seed = 0

# Load the dataset, then split it into the college label and the three
# numeric columns that the per-college cells cluster on.
df = pd.read_csv("data/combined_data.csv")
y = df['College']
X = df[['Quality Score', 'Difficulty Score', 'GPA']]

# Silhouette Scorer function
def silhouette_scorer(estimator, X):
    """Fit ``estimator`` on ``X`` and score the resulting clustering.

    Args:
        estimator: Object exposing ``fit_predict(X)`` that returns one cluster
            label per row of ``X``.
        X: Samples to cluster; also the data the silhouette is computed on.

    Returns:
        The value of ``silhouette_score(X, labels)`` for the labels produced
        by ``estimator.fit_predict(X)``.
    """
    return silhouette_score(X, estimator.fit_predict(X))

# Function to update the angle of view for viewing it in 3d
def rotate(angle):
    """Point the current 3D axes at the given azimuth (animation callback).

    The axes are read from the module-level name ``ax``, which the 3D plotting
    cells rebind before building each animation.

    Args:
        angle: Azimuth passed to ``ax.view_init``; the elevation is fixed
            at 30.

    Returns:
        None. When no axes are available a message is printed instead.
    """
    # Look the name up instead of reading it directly: a bare ``ax`` raises
    # NameError when no 3D cell has run yet, which made the fallback message
    # unreachable in exactly the case it was written for.
    axes = globals().get("ax")
    if axes is None:
        print("No plot available to rotate.")
        return
    axes.view_init(elev=30, azim=angle)

#### College of Agriculture and Life Sciences

In [None]:
# Keep only the Agriculture and Life Sciences rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Agriculture and Life Sciences']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for Agriculture and Life Sciences",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for Agriculture and Life Sciences")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')

plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Design

In [None]:
# Keep only the College of Design rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Design']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Design",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Design")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')

plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Education

In [None]:
# Keep only the College of Education rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Education']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Education",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Education")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Engineering

In [None]:
# Keep only the College of Engineering rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Engineering']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Engineering",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Engineering")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Humanities and Social Sciences

In [None]:
# Keep only the Humanities and Social Sciences rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Humanities and Social Sciences']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Humanities and Social Sciences",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Humanities and Social Sciences")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Natural Resources

In [None]:
# Keep only the College of Natural Resources rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Natural Resources']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Natural Resources",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Natural Resources")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### College of Sciences

In [None]:
# Keep only the College of Sciences rows and the three clustering columns.
college_df = df.loc[df['College'] == 'College of Sciences']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for College of Sciences",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for College of Sciences")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### Poole College of Management

In [None]:
# Keep only the Poole College of Management rows and the three clustering columns.
college_df = df.loc[df['College'] == 'Poole College of Management']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize -> project onto 3 principal components -> 3-cluster KMeans.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
plt.gca().set(
    title="2D KMeans Clustering Results for Poole College of Management",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Reuse seaborn's "bright" palette so each cluster keeps its 2D plot color
palette = sns.color_palette("bright", n_colors=3)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for Poole College of Management")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())

#### Wilson College of Textiles

In [None]:
# Keep only the Wilson College of Textiles rows and the three clustering columns.
college_df = df.loc[df['College'] == 'Wilson College of Textiles']
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]
# Report (rows, columns) of the subset before clustering it.
print(college.shape)

# Unlike the other colleges, this one uses 2 principal components and 2 clusters.
best_kmeans = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('km', KMeans(n_clusters=2, random_state=random_seed)),
])
output = best_kmeans.fit_predict(college)

# Quality vs. difficulty, colored by the cluster label of each row.
sns.scatterplot(
    x=college['Quality Score'],
    y=college['Difficulty Score'],
    hue=output,
    palette='bright',
    alpha=0.2,
)
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
# Title, axis names and the fixed 1-5 range on both axes.
plt.gca().set(
    title="2D KMeans Clustering Results for Wilson College of Textiles",
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    xlim=(1.0, 5.0),
    ylim=(1.0, 5.0),
)
plt.show()

In [None]:
# 3D view of the clusters: quality and difficulty on the base, GPA as height
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Two-color "bright" palette, matching the two clusters of the 2D plot
palette = sns.color_palette("bright", n_colors=2)
colors = [palette[cluster_id] for cluster_id in output]

# One point per row of `college`, colored by its cluster label
scatter = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=colors,
    alpha=0.4,
)

# Axis names first, then the fixed axis ranges, then the title
ax.set_xlabel("Quality Score")
ax.set_ylabel("Difficulty Score")
ax.set_zlabel("GPA")
ax.set_xlim(1, 5)
ax.set_ylim(1, 5)
ax.set_zlim(0, 4)
ax.set_title("3D KMeans Clustering Results for Wilson College of Textiles")

# Empty scatters serve as legend entries, one per cluster color
for label, color in enumerate(palette):
    ax.scatter([], [], [], color=color, label=f"{label}")
ax.legend(title="Cluster Labels", bbox_to_anchor=(1.25, 1), loc='upper right')
plt.show()

# Sweep the azimuth over 270..359 degrees, one frame every 50 ms
ani = FuncAnimation(fig, rotate, frames=np.arange(270, 360), interval=50)

# Embed the animation in the notebook output as HTML/JavaScript
HTML(ani.to_jshtml())