# K-Means Clustering Analysis
Imports and elbow method to determine the optimal number of clusters

In [None]:
from data_cleaning import ratings_df, watchlist_df 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

#? Column groups taken from ratings_df: the features used for clustering, and the genre columns
main_feature_columns = ['Your Rating', 'IMDb Rating', 'Runtime (mins)', 'Year', 'Num Votes', 'Day_Rated']
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History',
    'Horror', 'Music', 'Musical', 'Mystery', 'Reality-TV', 'Romance',
    'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western',
]
main_features = ratings_df[main_feature_columns]
genres = ratings_df[genre_columns]

#? Standardise the clustering features (author's note: the elbow plot looked borderline better without scaling)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(main_features)

#? Elbow method: fit k-means for every k in K_range and record the inertia of each fit
K_range = range(1, 20)
distortions = []
for k in K_range:
    # n_init is pinned to 10 so the results stay consistent if a scikit-learn update changes its default
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features)
    distortions.append(kmeans.inertia_)

#? Plot inertia against k so the "elbow" can be read off the curve
plt.plot(K_range, distortions, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Distortion')
plt.show()

#? Findings
print("We can see by inspecting the graph that the rate of decrease slows at k = 11, there 11 is the optimal number of clusters for our model")

## Exploration of Clusters through visualisation 

In [None]:

#? Creating k-means clustering model with optimal k=11
model = KMeans(n_clusters=11, random_state=42, n_init=10)
clusters = model.fit_predict(scaled_features)

#? Creating a cluster_df to highlight my mean cluster rating and example movies for each
# `clusters` is a plain positional array (one label per row of scaled_features).
# It is attached to ratings_df's own index, and the ratings are passed as a raw
# array, so every label is paired with its row by position. Building the labels
# on a fresh 0..n-1 index would let pandas re-align them by index label and
# silently mismatch (or NaN-fill) rows whenever ratings_df is not 0..n-1 indexed.
features_labels = pd.DataFrame({'Cluster': clusters}, index=ratings_df.index)
cluster_df = pd.DataFrame(
    {'Cluster': clusters, 'My Rating': ratings_df['Your Rating'].to_numpy()},
    index=ratings_df.index,
)
cluster_means = cluster_df.groupby('Cluster')['My Rating'].mean()
unique_clusters_sorted = sorted(cluster_df['Cluster'].unique())

# One table row per cluster: cluster id, mean of my ratings (1 d.p.), first three titles
table_data = []
for cluster in unique_clusters_sorted:
    # Skip clusters whose mean rating is undefined
    if pd.notna(cluster_means.get(cluster)):
        # Positional boolean mask, so the title lookup does not depend on index alignment
        in_cluster = (cluster_df['Cluster'] == cluster).to_numpy()
        cluster_movies = ratings_df.loc[in_cluster, 'Title'].tolist()

        avg_rating = np.round(cluster_means.loc[cluster], 1)
        example_movies = ', '.join(cluster_movies[:3])
        table_data.append([int(cluster), avg_rating, example_movies])

table_df = pd.DataFrame(table_data, columns=['Cluster', 'Mean Rating', 'Example Movies'])
print(table_df.to_string(index=False), '\n')

print("""Certain trends standout from the clusters example movies that would have been hard to determine from just boxplots such as:
      Cluster 3, clusters movies with low runtimes (eg. contains TV shows), 
      Cluster 6, clusters movies with low IMDb rating, 
      Cluster 7, clusters movies with High IMDb Ratings and high Number of votes""")

#? Boxplot visualisations to identify trends in the clusters
main_features_list = ["Your Rating", 'IMDb Rating', 'Runtime (mins)', 'Year', 'Num Votes', 'Day_Rated']

# Build the plotting frame once: the clustering features plus their cluster label,
# assigned positionally (assign returns a copy, main_features itself is untouched)
boxplot_data = main_features.assign(Cluster=clusters)

for feature in main_features_list:
    sns.boxplot(x='Cluster', y=feature, data=boxplot_data)
    plt.title(f'Distribution of {feature} Across Clusters')
    plt.show()

## Visualising relationships between features: Runtime, Year and Num Votes

In [None]:
#? 3D scatter of Year / Num Votes / Runtime, with each point coloured by my own rating
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw={'projection': '3d'})
scatter = ax.scatter(
    ratings_df['Year'],
    ratings_df['Num Votes'],
    ratings_df['Runtime (mins)'],
    c=ratings_df["Your Rating"],
    cmap='viridis',
)

# Axis labels first, then the title
ax.set_xlabel('Year')
ax.set_ylabel('Num Votes')
ax.set_zlabel('Runtime (mins)')
ax.set_title('Scatter Plot of Year, Num Votes, and Runtime with Ratings Color-coded')

# The colour bar maps the point colours back to rating values
cbar = plt.colorbar(scatter, label='My Rating')
plt.show()