### 4. Clustering per College
This stage clusters the ratings within each college using KMeans and generates visuals to analyze and find patterns in the data.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.animation import FuncAnimation
import numpy as np
from IPython.display import HTML

# Seed handed to every KMeans model in this notebook so cluster assignments are repeatable
random_seed = 0

# Load the combined dataset once; the per-college cells filter this frame
df = pd.read_csv("data/combined_data.csv")

# College label and the three numeric rating features, kept as separate objects
y = df['College']
X = df[[
    'Quality Score',
    'Difficulty Score',
    'GPA',
]]

# Silhouette Scorer function
def silhouette_scorer(estimator, X):
    """Fit ``estimator`` on ``X`` and score the resulting clustering.

    Parameters
    ----------
    estimator : object exposing ``fit_predict(X)``
        Called once on ``X``; note that this (re)fits the estimator.
    X : array-like
        Passed unchanged both to ``fit_predict`` and to ``silhouette_score``.

    Returns
    -------
    The value returned by ``silhouette_score(X, labels)``.
    """
    return silhouette_score(X, estimator.fit_predict(X))

# Function to update the angle of view for viewing it in 3d
def rotate(angle):
    """Turn the current 3D axes to a new azimuth (frame callback for FuncAnimation).

    Parameters
    ----------
    angle : number
        Azimuth passed to ``view_init``; the elevation is always 30.

    Notes
    -----
    The axes are taken from the notebook-level variable ``ax`` at call time, so the
    function always acts on whichever 3D cell assigned ``ax`` most recently. If ``ax``
    has not been defined yet (or is ``None``) a message is printed and nothing else
    happens.
    """
    # Look the name up instead of reading it directly: a bare `ax` raises NameError on a
    # fresh kernel, which would make the "no plot" branch below unreachable.
    current_ax = globals().get("ax")
    if current_ax is not None:
        current_ax.view_init(elev=30, azim=angle)
    else:
        print("No plot available to rotate.")

In [None]:
# TODO: Visualize data points and rating distributions for each college individually
# TODO: Visualize aggregation + average of data points and rating distributions for all colleges
# TODO: Visualize 

#### College of Agriculture and Life Sciences

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Agriculture and Life Sciences'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for Agriculture and Life Sciences",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Design

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Design'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Design",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Education

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Education'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Education",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Engineering

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Engineering'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Engineering",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Humanities and Social Sciences

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Humanities and Social Sciences'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Humanities and Social Sciences",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Natural Resources

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Natural Resources'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Natural Resources",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### College of Sciences

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'College of Sciences'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for College of Sciences",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### Poole College of Management

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'Poole College of Management'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]

# Standardize, apply PCA (all 3 components kept), then assign each row to one of 3 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('km', KMeans(n_clusters=3, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for Poole College of Management",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

#### Wilson College of Textiles

In [None]:
# Keep only this college's rows and the three rating features
college_name = 'Wilson College of Textiles'
college_df = df[df['College'] == college_name]
college = college_df[['Quality Score', 'Difficulty Score', 'GPA']]
# Show how many rows this college contributes
print(college.shape)

# Unlike the other colleges, this pipeline keeps 2 PCA components and fits 2 clusters
best_kmeans = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('km', KMeans(n_clusters=2, random_state=random_seed))])
output = best_kmeans.fit_predict(college)

# 2D view (Quality vs. Difficulty) coloured by cluster label
scatter_ax = sns.scatterplot(x=college['Quality Score'], y=college['Difficulty Score'],
                             hue=output, palette='bright', alpha=0.2)
scatter_ax.legend(bbox_to_anchor=(1.15, 1), loc='upper right')
# Name the college in the title so this figure can be told apart from the other colleges' plots
scatter_ax.set_title("KMeans Clustering Results for " + college_name)
scatter_ax.set_xlabel("Quality Score")
scatter_ax.set_ylabel("Difficulty Score")
# Fix both axes to the 1.0-5.0 range instead of autoscaling to the data
scatter_ax.set_xlim(1.0, 5.0)
scatter_ax.set_ylim(1.0, 5.0)
plt.show()

In [None]:
# 3D view of the clusters: Quality and Difficulty plus GPA on the z axis
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per row of `college`, coloured by its cluster label in `output`
sc = ax.scatter(
    college['Quality Score'],
    college['Difficulty Score'],
    college['GPA'],
    c=output,
    cmap='tab10',
    alpha=0.6,
)

# Axis labels, z range and title applied in a single call
ax.set(
    xlabel="Quality Score",
    ylabel="Difficulty Score",
    zlabel="GPA",
    zlim=(0, 4),
    title="3D KMeans Clustering Results for Wilson College of Textiles",
)

# Colour bar for the cluster labels, followed by the static render
cbar = fig.colorbar(sc, label='Cluster')
plt.show()

# Rotate the view through azimuths 0..359, one frame per degree, 50 ms between frames
ani = FuncAnimation(fig, rotate, frames=np.arange(360), interval=50)

# Last expression of the cell: the animation embedded as HTML/JS
HTML(ani.to_jshtml())

### 5. Interpretation
This stage involves computing metrics to derive insights from the data.

In [None]:
# TODO: Calculate and compare weighted averages of ratings for each college, compare to global averages
# TODO: Calculate metrics such as standard deviation, median, and outliers for each college and globally
# TODO: Compare expected salary with average ratings for each college