In [5]:
# Notebook: 6_Final_Clustering.ipynb

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 1. Load the data and the matrix
df = pd.read_csv('processed_netflix_data.csv')
X = np.load('matrix_reduced.npy')

# 2. Apply K-Means with the chosen K (adjust k if your graph suggested otherwise)
chosen_k = 6 
kmeans = KMeans(n_clusters=chosen_k, init='k-means++', random_state=42)
cluster_labels = kmeans.fit_predict(X)

# 3. Add the cluster labels back to our original dataframe
df['cluster'] = cluster_labels

# 4. Calculate final Evaluation Metrics
score = silhouette_score(X, cluster_labels)
print(f"Final Silhouette Score for {chosen_k} clusters: {score:.4f}")

# 5. Save the results to a final CSV
df.to_csv('final_clustered_netflix_data.csv', index=False)
print("Results saved to 'final_clustered_netflix_data.csv'")

# 6. Quick Check: How many shows are in each cluster?
print("\n--- Distribution of Clusters ---")
print(df['cluster'].value_counts())

Final Silhouette Score for 6 clusters: 0.0493
Results saved to 'final_clustered_netflix_data.csv'

--- Distribution of Clusters ---
cluster
3    3824
4    1250
5     980
2     697
0     638
1     398
Name: count, dtype: int64


In [4]:
#other way around to create clusters
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# 1. Load the reduced matrix and the dataframe
X = np.load('matrix_reduced.npy')
df = pd.read_csv('processed_netflix_data.csv')

# 2. Strategy: Use K-Means++ and optimize 'k'
# We will use KMeans++ which places initial centers far apart to improve the DB Index.
# We are choosing 6 clusters based on your previous visualization.
n_clusters = 6
kmeans = KMeans(
    n_clusters=n_clusters, 
    init='k-means++',   # Better initialization for lower DB scores
    n_init=20,          # Runs the algorithm 20 times to find the best separation
    max_iter=500, 
    random_state=42
)

clusters = kmeans.fit_predict(X)

# 3. Save the results
df['cluster'] = clusters
df.to_csv('final_clustered_netflix_data.csv', index=False)

# 4. Detailed Evaluation
db_index = davies_bouldin_score(X, clusters)
sil_score = silhouette_score(X, clusters)

print(f"--- Updated Metrics ---")
print(f"New Davies-Bouldin Index: {db_index:.4f}")
print(f"Silhouette Score: {sil_score:.4f} (Closer to 1.0 is better)")

# 5. Check Cluster Balance
print("\nTitles per Cluster:")
print(df['cluster'].value_counts().sort_index())

--- Updated Metrics ---
New Davies-Bouldin Index: 3.9531
Silhouette Score: 0.0537 (Closer to 1.0 is better)

Titles per Cluster:
cluster
0     397
1    1601
2     515
3     892
4    1054
5    3328
Name: count, dtype: int64
