### Importing Class Similarities Data

First we will import some modules that we might need 😉

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier


## Part 1: Exploring the Data

Now let's read in the data from the ClassSimilarities_2025Spring.csv matrix. [Click here to download the data](https://drive.google.com/uc?download&id=1KHm51Dv09t9bcSaJbSn8eCBLY8ZUsdj2)

In [None]:
# read in data
# Opens Colab's file-upload dialog; upload the CSV that the next cell reads.
# NOTE(review): `google.colab` is presumably only importable inside Google
# Colab - when running elsewhere, skip this cell and put the CSV next to the
# notebook instead.
from google.colab import files
uploaded = files.upload()


In [None]:
# Load the survey responses into a DataFrame; the file name must match the
# CSV made available to the notebook.
sims = pd.read_csv('ClassSimilarities_2025Spring.csv')

# Display the last rows as a quick check that the file loaded as expected.
sims.tail()


Uncomment the line for your class and create the `class_df` data frame

Call your matrix "class_df"

In [None]:

# Choose your section by leaving exactly one `section` assignment uncommented.
section = 'Section 1 (MW 930am)'
#section = 'Section 2 (MW 11am)'
#section = 'Section 30 (W 6pm)'

# Keep only the rows for that section. The explicit .copy() makes class_df an
# independent frame, so later in-place changes to it (such as setting its
# index) neither touch `sims` nor trigger pandas' SettingWithCopyWarning.
class_df = sims[sims['Section'] == section].copy()

Explore the features of interest.  
Sort by the mean to see which ones are most (and least) popular.  Maybe even sort highest to lowest!

Plot barplots to visualize the distributions of each of the features.

Suggestion - define the columns of interest as `feature_cols` and use a for loop to iterate over all of the features of interest and plot a barplot.

Try and get the barplot to plot in order from 1 to 5 on the x-axis. (can use "reindex" for this)

In [None]:
# Columns that identify the respondent or serve as prediction targets; every
# remaining column is treated as a 1-5 preference feature.
non_feature_cols = ['NYUID', 'Timestamp', 'Section', 'Name', 'CoffeeTeaSoda', 'Sleephours']
feature_cols = class_df.columns.drop(non_feature_cols)

# Mean rating per feature, sorted from most to least popular.
# (No figure is created here: this cell only shows a table, and an unused
# plt.figure() would just leave an empty figure behind.)
mean_scores = class_df[feature_cols].mean().sort_values(ascending=False)
mean_scores.round(2)




In [None]:

# Features ordered from highest to lowest mean rating
sorted_features = mean_scores.sort_values(ascending=False).index

# Size the subplot grid from the number of features instead of hard-coding it,
# so the cell keeps working if survey questions are added or removed.
n_cols = 5
n_rows = max(1, int(np.ceil(len(sorted_features) / n_cols)))

# 2.5 inches of height per row (6 rows gives a 15x15 figure)
fig = plt.figure(figsize=(15, 2.5 * n_rows))

# One small bar chart per feature
for i, feature in enumerate(sorted_features):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)

    # Count each rating; reindex so every value 1 through 5 appears, in order,
    # even when nobody picked it
    counts = class_df[feature].value_counts().reindex(range(1, 6), fill_value=0)
    counts.plot(kind='bar', ax=ax)

    # One tick per bar (the five bars sit at positions 0-4)
    ax.set_xticks(range(0, 5))

    # Title each panel with the feature name and its mean rating
    avg = round(mean_scores[feature], 2)
    ax.set_title(f"{feature}: {avg}", fontsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('Count')

# Keep titles and axis labels of neighbouring panels from overlapping
fig.tight_layout()

plt.show()


Plot a heatmap of the correlations to see which features are most positively and negatively correlated.  Use the pandas `.corr()` method to compute the correlations and seaborn's `sns.heatmap` to plot them.  Any interesting results here?


*you can play with colorbrewer colors here!  My favorite is cmap='RdBu_r' and don't forget to set the limits to (-1,1) using `vmax` and `vmin`*

In [None]:
# heatmap of correlations
import seaborn as sns

# Pairwise correlations between the preference features
corr = class_df[feature_cols].corr()

# Draw the matrix on a 15x8 axes. The colour scale is pinned to [-1, 1] so
# the diverging palette is centred on zero correlation, and every cell is
# annotated with its value to two decimals.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(
    corr,
    ax=ax,
    cmap='RdBu_r',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt=".2f",
    cbar=True,
)

plt.show()

Now plot a histogram of coffee cups per week.

How many people don't drink any coffee? 😲

In [None]:

# Histogram of the CoffeeTeaSoda column, split into 10 bins
ax = sns.histplot(class_df['CoffeeTeaSoda'], bins=10)
ax.set_xlabel('Caffeine Cups per Week')
ax.set_ylabel('Count')
plt.show()

# Number of respondents who reported exactly zero cups
zero_cup_rows = class_df[class_df['CoffeeTeaSoda'] == 0]
no_coffee = len(zero_cup_rows)
print("number of no coffee drinkers =",no_coffee)


## Part 2: Looking at distances between students



One helpful suggestion would be to set *NYUID* as the index so that you can easily extract the data for any given NYUID.

In [None]:

# Set 'NYUID' as the index - this allows you to call the row by the NYUID,
# e.g. class_df.loc[some_nyuid].
#
# The column check makes this cell safe to re-run: once NYUID has been moved
# into the index it is no longer a column, and calling set_index again would
# raise a KeyError. Rebinding the name (instead of inplace=True) also avoids
# modifying a filtered slice of `sims` in place.
if 'NYUID' in class_df.columns:
    class_df = class_df.set_index('NYUID')
if 'NYUID' in sims.columns:
    sims = sims.set_index('NYUID')


Use `sklearn.metrics.pairwise_distances` to calculate a distance matrix "dist_matrix" (It may be helpful to convert the dist_matrix into a data frame using pd.DataFrame)

Use Manhattan distance because it will be more interpretable.

Identify your data with your NYUID and see if you can find the three most similar students to you and the three least similar students to you

In [None]:
from sklearn.metrics import pairwise_distances

# Manhattan distance between every pair of rows of the feature columns
feature_matrix = class_df[feature_cols].values
dist_matrix = pairwise_distances(feature_matrix, metric='manhattan')

# Wrap the square matrix in a DataFrame whose rows and columns both carry
# class_df's index, so a distance can be looked up by student id
student_ids = class_df.index
dist_df = pd.DataFrame(dist_matrix, index=student_ids, columns=student_ids)

# NYUID to query - put your own id here in place of the '#####' placeholder
input_NYUID = '#####'


Find the three most similar students to you and the three least similar students to you.

Remember that the most similar student will be the student themself!  (distance = 0) so you will have to account for that.

Also, you might want to check that the input_NYUID is actually in the index, or else you will get an error.

In [None]:
if input_NYUID in dist_df.index:

    # Distances from the query student to every *other* student.
    # The student's own entry is removed by label rather than by assuming it
    # sorts first: a classmate with identical answers is also at distance 0,
    # and the self-distance may have been overwritten (e.g. with inf) by
    # another cell, in which case it would show up among the largest values.
    distances = dist_df.loc[input_NYUID].drop(input_NYUID)

    # The 3 closest and the 3 farthest classmates
    most_similar = distances.nsmallest(3)
    least_similar = distances.nlargest(3)

    query_name = class_df.loc[input_NYUID, 'Name']

    # Print NYUID, name and distance for each match
    print("Most similar students to :", input_NYUID, query_name)
    for NYUID, dist in most_similar.items():
        print(NYUID, class_df.loc[NYUID, 'Name'], dist)

    print("\nLeast similar students to :", input_NYUID, query_name)
    for NYUID, dist in least_similar.items():
        print(NYUID, class_df.loc[NYUID, 'Name'], dist)

else:
    print('NYUID not found in the dataset')


Which two people are the closest? Because we use Manhattan distance, this distance is interpreted as the total sum of the absolute differences of the 24 features.


You can look for the smallest value in the matrix, but you need to account for the fact that the diagonals are zero...

this one is tricky, you need to use "unravel_index" to find the row and col in the matrix with the lowest value...(see solution)

In [None]:
# Work on a copy of the distances so dist_df itself is left untouched.
# Writing through dist_df.values is not guaranteed to modify the frame, and it
# fails when pandas hands back a read-only array (as it does under
# Copy-on-Write).
masked = dist_df.to_numpy(copy=True)

# Every student is at distance 0 from themself; replace the diagonal with inf
# so the minimum can only be a pair of two different students.
np.fill_diagonal(masked, np.inf)

# argmin returns a position in the flattened matrix; unravel_index converts it
# back into a (row, column) pair.
row, col = np.unravel_index(np.argmin(masked), masked.shape)

# NYUIDs of the two closest students
cls = dist_df.index[[row, col]]

print( class_df.loc[cls[0], 'Name'], ":", class_df.loc[cls[1], 'Name'], masked[row, col])


Which two people in class are the furthest?

In [None]:
# Work on a copy of the distances so dist_df itself is left untouched.
# Writing through dist_df.values is not guaranteed to modify the frame, and it
# fails when pandas hands back a read-only array (as it does under
# Copy-on-Write).
unmasked = dist_df.to_numpy(copy=True)

# Make sure the diagonal holds the true self-distance of 0 (it may have been
# set to inf while searching for the closest pair), so it can never be the
# maximum.
np.fill_diagonal(unmasked, 0)

# argmax returns a position in the flattened matrix; unravel_index converts it
# back into a (row, column) pair.
row, col = np.unravel_index(np.argmax(unmasked), unmasked.shape)

# NYUIDs of the two students who are the furthest apart
cls = dist_df.index[[row, col]]

print( class_df.loc[cls[0], 'Name'], ":", class_df.loc[cls[1], 'Name'], unmasked[row, col])

## Part 3: K nearest neighbors for predicting targets

We want to know if the preferences data has value in predicting the Coffee consumption or the Hours Sleep.

Since we don't have a lot of data in each class, use the entire `sims` data (both classes), split it 80/20 into training and test sets, and fit a knn with k=5 (aka `KNeighborsRegressor(n_neighbors=5)`).


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# Target to predict. The caffeine column is called 'CoffeeTeaSoda' everywhere
# else in this notebook (it is excluded from feature_cols and plotted in
# Part 1), so the same name is used here for both the outlier filter and the
# model target.
target = 'CoffeeTeaSoda'

# remove one outlier from troll in evening section :(
sims = sims[sims[target] != 69]

# 80/20 train/test split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    sims[feature_cols], sims[target], test_size=0.2, random_state=11
)

# k-nearest-neighbours regressor with k=5
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model to the training data
knn.fit(X_train, y_train)


Make a scatterplot of predicted values for the test set on the x-axis and actual values on the y-axis.

Does the prediction seem any good?

In [None]:

import matplotlib.pyplot as plt

# Predict the held-out test rows with the fitted model
y_pred = knn.predict(X_test)

# Predicted values on the x-axis, actual values on the y-axis
plt.scatter(x=y_pred, y=y_test, alpha=0.4)
plt.title('Predicted vs Actual Values for Coffee Consumption')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')

# Dashed y = x reference line spanning the range of the actual values;
# perfect predictions would fall on it
lo, hi = y_test.min(), y_test.max()
plt.plot([lo, hi], [lo, hi], 'k--', lw=1)

plt.show()

Using a for-loop - find the best k via RMSE.

(Optional - compare to a dumb model that uses just the average of the training set)

In [None]:
from sklearn.metrics import root_mean_squared_error

# The k=5 fit above is poor, but search for the best k anyway:
# refit for k = 1..20, score each model by RMSE on the test split, and keep
# track of the lowest RMSE seen and the k that produced it.
rmse_best = np.inf
for k in range(1, 21):
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    print('k=', k, 'RMSE:', round(rmse,3))
    # Strict comparison: on a tie the smaller k is kept
    if rmse < rmse_best:
        rmse_best, best_k = rmse, k
print("\nBest value of k: ", best_k," with RMSE= ",rmse_best)

In [None]:
# dumb_model - predict every test row with the mean of the training targets

# RMSE of using just the training mean to predict
rmse_dumb = np.sqrt(((y_test - y_train.mean())**2).mean())

# The model score to compare against is rmse_best, the lowest RMSE found in
# the k sweep. (Re-scoring the leftover y_pred here would measure the last k
# tried, not the best one, so no separate model RMSE is recomputed.)
print(f"RMSE(dumb) = {round(rmse_dumb,2)} and RMSE(model) = {round(rmse_best,2)}")


## Part 4: Clustering

Let's perform hierarchical clustering to find the "10-cluster solution" and print out the clusters.

Did any of you end up in clusters with your project team members?

All code provided...



In [None]:
from sklearn.cluster import AgglomerativeClustering

# Number of clusters to cut the hierarchy into.
# NOTE(review): the text above this cell asks for a "10-cluster solution";
# change this value if that is what is intended.
n_clusters = 8

# Note: this is a different way than used in the HierClust.ipynb notebook.  both are fine!
# Agglomerative (bottom-up) clustering with Ward linkage
cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')

# Fit on the feature columns and get one cluster label per student
cluster_labels = cluster.fit_predict(class_df[feature_cols])

# Copy of class_df with the labels attached as a new 'Cluster' column
student_clusters = class_df.assign(Cluster=cluster_labels)



In [None]:

# Print the member names of each cluster.
# The loop variable is deliberately not called `cluster`, so the fitted
# AgglomerativeClustering model bound to that name is not overwritten.
for cluster_id, members in student_clusters.groupby('Cluster'):
    print(f"Cluster {cluster_id}:")
    print(members['Name'].tolist())
    print()

Plot the dendrogram.

Play around with the following  parameters:
in the linkage function : `method` and `metric`

in the dendrogram: `orientation` - for the direction of the plot
                : `color_threshold` - for how the plot is colored showing different clusters

can use `labels=class_df.index` to show NYUID instead of names...


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward linkage matrix built from the students' feature rows
linked = linkage(class_df[feature_cols], 'ward')

# Pass the names as a plain list. dendrogram picks the label for leaf i as
# labels[i], i.e. by integer position; a Series indexed by NYUID could be
# indexed by label instead, giving wrong names or a KeyError.
leaf_names = class_df['Name'].tolist()

# Plot the dendrogram
plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    orientation='right',
    color_threshold=8,
    labels=leaf_names,
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.show()

For fun - we can also cluster the *features* to see which are most similar...by passing the feature correlation matrix into the `linkage` function




In [None]:
# Feature-by-feature correlation matrix
feature_frame = class_df[feature_cols]
corr = feature_frame.corr()

# Ward linkage with each row of the correlation matrix (one per feature)
# treated as an observation
linked = linkage(corr, method='ward')

# Horizontal dendrogram with one leaf per feature, labelled by column name
plt.figure(figsize=(10, 7))
dendrogram(
    linked,
    orientation='right',
    labels=corr.columns,
    color_threshold=2,
)
plt.show()
plt.show()