In [10]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer
import warnings
import random

# Suppress warnings
warnings.filterwarnings('ignore')

# ─── CONFIG ────────────────────────────────────────────────────────────────────
CONFIG = {
    'data_path':               r"C:\Users\PC\Downloads\merged_movies (3).csv",
    'embeddings_path':         "new_embeddings_distilroberta.npy",
    'output_dir':              "final_movies_clusters",
    'umap_components':         15,
    'umap_n_neighbors':        75,
    'umap_min_dist':           0.1,
    'umap_metric':             'cosine',
    'latent_dim':              64,
    'autoencoder_epochs':      50,
    'denoising_epochs':        15,
    'autoencoder_batch_size':  256,
    'denoising_noise_std':     0.05,
    # Original HDBSCAN parameters
    'hdbscan_min_cluster_size': 50,
    'hdbscan_min_samples':      5,
    'hdbscan_epsilon':          0.3,
    'hdbscan_method':           'eom',
    # Feature weights 
    'genre_weight':            1.5,
    'runtime_weight':          0.1,
    'year_weight':             0.3,
    'sbert_weight':            1.0,
    # Subsample for metrics
    'silhouette_sample_size':   10000,
    # Stricter noise reassignment threshold
    'noise_reassign_threshold': 90
}

os.makedirs(CONFIG['output_dir'], exist_ok=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ─── 1. LOAD & FILTER DATA ─────────────────────────────────────────────────────
df = pd.read_csv(CONFIG['data_path']).drop_duplicates('MovieID').reset_index(drop=True)
if 'Genre:(no genres listed)' in df.columns:
    df = df[df['Genre:(no genres listed)'] == False].reset_index(drop=True)
df['release_year'] = pd.to_numeric(df.get('release_year', np.nan), errors='coerce')
df['release_year'] = df['release_year'].fillna(df['release_year'].median()).clip(1900, 2025)
df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce').fillna(df['runtime'].median())

# Debug dataset-wide genre distribution
genre_cols = [c for c in df.columns if c.startswith('Genre:')]
print(f"Total genre columns in dataset: {len(genre_cols)}")
genre_counts = df[genre_cols].sum().sort_values(ascending=False)
print(f"Top 5 genres in dataset:")
for i in range(min(5, len(genre_counts))):
    print(f"{genre_counts.index[i]}: {int(genre_counts.iloc[i])} movies")
single_genre_movies = df[genre_cols].sum(axis=1) == 1
drama_only = (df['Genre:Drama'] == 1) & (df[genre_cols].sum(axis=1) == 1)
print(f"Movies with only one genre: {single_genre_movies.sum()} ({single_genre_movies.sum()/len(df)*100:.2f}%)")
print(f"Movies with only Drama genre: {drama_only.sum()} ({drama_only.sum()/len(df)*100:.2f}%)")

# ─── 2. LOAD / COMPUTE SBERT EMBEDDINGS ────────────────────────────────────────
if os.path.exists(CONFIG['embeddings_path']):
    sbert_embeddings = np.load(CONFIG['embeddings_path'])
else:
    st_model = SentenceTransformer('distilroberta-base', device=device)
    sbert_embeddings = st_model.encode(
        df['overview'].fillna(''),
        batch_size=CONFIG['autoencoder_batch_size'],
        show_progress_bar=True,
        convert_to_numpy=True
    )
    np.save(CONFIG['embeddings_path'], sbert_embeddings)

# ─── 3. DENOISING AUTOENCODER ────────────────────────────────────────────────────
class DenoiseAE(nn.Module):
    def __init__(self, inp, lat):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Linear(inp, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, lat), nn.ReLU()
        )
        self.dec = nn.Sequential(
            nn.Linear(lat, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, inp)
        )
    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

def train_denoiser(emb):
    model = DenoiseAE(emb.shape[1], CONFIG['latent_dim']).to(device)
    opt = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    loader = DataLoader(
        TensorDataset(torch.tensor(emb, dtype=torch.float32)),
        batch_size=CONFIG['autoencoder_batch_size'], shuffle=True
    )
    best_loss, patience, counter = float('inf'), 3, 0
    for epoch in range(CONFIG['denoising_epochs']):
        model.train()
        total = 0.0
        for (batch,) in loader:
            batch = batch.to(device)
            noisy = batch + torch.randn_like(batch) * CONFIG['denoising_noise_std']
            recon, _ = model(noisy)
            loss = nn.MSELoss()(recon, batch)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total += loss.item()
        avg = total / len(loader)
        if avg < best_loss:
            best_loss, counter = avg, 0
            torch.save(model.state_dict(), os.path.join(CONFIG['output_dir'], 'denoiser.pth'))
        else:
            counter += 1
            if counter >= patience:
                break
    model.load_state_dict(torch.load(os.path.join(CONFIG['output_dir'], 'denoiser.pth')))
    model.eval()
    with torch.no_grad():
        _, den = model(torch.tensor(emb, dtype=torch.float32).to(device))
    return den.cpu().numpy()

sbert_denoised = train_denoiser(sbert_embeddings)

# ─── 4. UMAP REDUCTION ─────────────────────────────────────────────────────────
um = umap.UMAP(
    n_components=CONFIG['umap_components'],
    n_neighbors=CONFIG['umap_n_neighbors'],
    min_dist=CONFIG['umap_min_dist'],
    metric=CONFIG['umap_metric'],
    random_state=42
)
umap_emb = um.fit_transform(sbert_denoised)
sbert_reduced = MinMaxScaler().fit_transform(umap_emb)

# Visualize UMAP projection for debugging
plt.figure(figsize=(10, 8))
sns.scatterplot(x=sbert_reduced[:10000, 0], y=sbert_reduced[:10000, 1], s=10, alpha=0.5)
plt.title('UMAP Projection of Denoised SBERT Embeddings (First 10,000 Movies)')
plt.savefig(os.path.join(CONFIG['output_dir'], 'umap_projection.png'))
plt.close()

# ─── 5. HYBRID FEATURE ENGINEERING ─────────────────────────────────────────────
def make_hybrid(df, sbert_red):
    gcols = [c for c in df.columns if c.startswith('Genre:')]
    # Normalize genre weights and reduce Drama dominance
    w = np.log(len(df) / (df[gcols].sum() + 1))
    w = MinMaxScaler().fit_transform(w.values.reshape(-1, 1)).flatten()  # Convert Series to array and normalize
    w[gcols.index('Genre:Drama')] *= 0.7  # Stronger reduction
    gf = df[gcols].values * w * CONFIG['genre_weight']
    y = df['release_year'].values.reshape(-1, 1)
    yf = MinMaxScaler().fit_transform(y) * CONFIG['year_weight']
    r = df['runtime'].values.reshape(-1, 1)
    rf = MinMaxScaler().fit_transform(r) * CONFIG['runtime_weight']
    return np.hstack([gf, yf, rf, sbert_red * CONFIG['sbert_weight']])

hybrid = make_hybrid(df, sbert_reduced)
hybrid = MinMaxScaler().fit_transform(hybrid)

# Validate feature variance
feature_variances = np.var(hybrid, axis=0)
print(f"Feature variances (min: {feature_variances.min():.4f}, max: {feature_variances.max():.4f}, mean: {feature_variances.mean():.4f})")
if feature_variances.min() < 1e-4:
    print("Warning: Some features have near-zero variance, consider removing them.")

# ─── 6. COMPRESS HYBRID FEATURES ───────────────────────────────────────────────
class HybridAE(nn.Module):
    def __init__(self, inp, lat):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Linear(inp, 256), nn.ReLU(),
            nn.Linear(256, lat), nn.ReLU()
        )
        self.dec = nn.Sequential(
            nn.Linear(lat, 256), nn.ReLU(),
            nn.Linear(256, inp)
        )
    def forward(self, x):
        z = self.enc(x)
        return self.dec(z), z

def train_hybrid(ftrs):
    model = HybridAE(ftrs.shape[1], CONFIG['latent_dim']).to(device)
    opt = optim.Adam(model.parameters(), lr=5e-4)
    loader = DataLoader(
        TensorDataset(torch.tensor(ftrs, dtype=torch.float32)),
        batch_size=CONFIG['autoencoder_batch_size'], shuffle=True
    )
    best, patience, counter = float('inf'), 5, 0
    for epoch in range(CONFIG['autoencoder_epochs']):
        model.train()
        total = 0.0
        for (batch,) in loader:
            batch = batch.to(device)
            recon, _ = model(batch)
            loss = nn.MSELoss()(recon, batch)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total += loss.item()
        avg = total / len(loader)
        if avg < best:
            best, counter = avg, 0
            torch.save(model.state_dict(), os.path.join(CONFIG['output_dir'], 'hybrid_ae.pth'))
        else:
            counter += 1
            if counter >= patience:
                break
    model.load_state_dict(torch.load(os.path.join(CONFIG['output_dir'], 'hybrid_ae.pth')))
    model.eval()
    with torch.no_grad():
        _, latent = model(torch.tensor(ftrs, dtype=torch.float32).to(device))
    return latent.cpu().numpy()

latent = train_hybrid(hybrid)

# ─── 7. HDBSCAN CLUSTERING ─────────────────────────────────────────────────────
ds_hdb = hdbscan.HDBSCAN(
    min_cluster_size=CONFIG['hdbscan_min_cluster_size'],
    min_samples=CONFIG['hdbscan_min_samples'],
    cluster_selection_epsilon=CONFIG['hdbscan_epsilon'],
    cluster_selection_method=CONFIG['hdbscan_method'],
    metric='euclidean',
    core_dist_n_jobs=-1
)
labels = ds_hdb.fit_predict(latent)
print(f"Noise after HDBSCAN: {np.mean(labels == -1)*100:.2f}%")

# ─── 8. SELECTIVE NOISE REASSIGNMENT ───────────────────────────────────────────
from sklearn.metrics.pairwise import euclidean_distances
mask_noise = labels == -1
noise_count = np.sum(mask_noise)
print(f"Noise points before reassignment: {noise_count} ({noise_count/len(labels)*100:.2f}%)")
if mask_noise.any():
    valid_labels = np.unique(labels[~mask_noise])
    centroids = np.vstack([latent[labels == c].mean(axis=0) for c in valid_labels])
    distances = euclidean_distances(latent[mask_noise], centroids)
    min_distances = distances.min(axis=1)
    threshold = np.percentile(min_distances, CONFIG['noise_reassign_threshold'])
    noise_indices = np.where(mask_noise)[0]
    reassign_count = 0
    for i, dist in enumerate(min_distances):
        if dist <= threshold:
            closest_cluster = valid_labels[np.argmin(distances[i])]
            labels[noise_indices[i]] = closest_cluster
            reassign_count += 1
    print(f"Reassigned {reassign_count} noise points")
else:
    print("No noise points to reassign")

final_noise_count = np.sum(labels == -1)
print(f"Noise after reassignment: {final_noise_count/len(labels)*100:.2f}%")

# ─── 9. CLUSTER SIZE REPORTING ─────────────────────────────────────────────────
cluster_sizes = pd.Series(labels[labels != -1]).value_counts().sort_values(ascending=False)
print("\nTop 5 largest clusters:")
for i in range(5):
    if i < len(cluster_sizes):
        print(f"Cluster {cluster_sizes.index[i]}: {cluster_sizes.iloc[i]} movies")

# Save largest cluster's movies to CSV
if len(cluster_sizes) > 0:
    largest_cluster_id = cluster_sizes.index[0]
    largest_cluster_movies = df[labels == largest_cluster_id][['MovieID', 'title', 'release_year', 'runtime'] + genre_cols]
    largest_cluster_movies.to_csv(os.path.join(CONFIG['output_dir'], f'cluster_{largest_cluster_id}_movies.csv'), index=False)
    print(f"Saved largest cluster (Cluster {largest_cluster_id}) movies to: {CONFIG['output_dir']}/cluster_{largest_cluster_id}_movies.csv")
    genre_counts = largest_cluster_movies[genre_cols].sum().sort_values(ascending=False)
    print(f"\nTotal genre columns in Cluster {largest_cluster_id}: {len(genre_cols)}")
    print(f"Non-zero genre columns in Cluster {largest_cluster_id}: {sum(genre_counts > 0)}")
    print(f"\nTop 5 genres in Cluster {largest_cluster_id}:")
    for i in range(min(5, len(genre_counts))):
        print(f"{genre_counts.index[i]}: {int(genre_counts.iloc[i])} movies")

# ─── 10. FINAL METRICS & SAVE ──────────────────────────────────────────────────
# Compute final metrics
np.random.seed(42)
idx = np.random.choice(len(latent), min(len(latent), CONFIG['silhouette_sample_size']), replace=False)
mask = labels[idx] != -1
if mask.any():
    sil_score = silhouette_score(latent[idx][mask], labels[idx][mask])
    db_index = davies_bouldin_score(latent[idx][mask], labels[idx][mask])
    ch_score = calinski_harabasz_score(latent[idx][mask], labels[idx][mask])
else:
    sil_score, db_index, ch_score = 0.0, np.inf, 0.0

# Compute cluster statistics
num_clusters = len(cluster_sizes)
total_movies = len(labels)
movies_in_clusters = total_movies - final_noise_count
percentage_in_clusters = (movies_in_clusters / total_movies) * 100
avg_size = cluster_sizes.mean() if num_clusters > 0 else 0
min_size = cluster_sizes.min() if num_clusters > 0 else 0
max_size = cluster_sizes.max() if num_clusters > 0 else 0
median_size = cluster_sizes.median() if num_clusters > 0 else 0

# Generate report
report_lines = []
report_lines.append("========== Movie Clustering Analysis Report ==========\n")
report_lines.append("--- Data Summary ---\n")
report_lines.append(f"Total number of movies: {total_movies}\n")
report_lines.append(f"Clustering performed on: {CONFIG['latent_dim']}-dimensional latent space from hybrid features\n")
report_lines.append("--- Clustering Summary ---\n")
report_lines.append(f"Number of clusters: {num_clusters}\n")
report_lines.append(f"Movies assigned to clusters: {movies_in_clusters} ({percentage_in_clusters:.2f}%)\n")
report_lines.append(f"Average cluster size: {avg_size:.1f}\n")
report_lines.append(f"Minimum cluster size: {min_size}\n")
report_lines.append(f"Maximum cluster size: {max_size}\n")
report_lines.append(f"Median cluster size: {median_size}\n")
report_lines.append("--- Top 5 Largest Clusters ---\n")
for i, (cluster, size) in enumerate(cluster_sizes.items()):
    if i < 5:
        report_lines.append(f"Cluster {cluster}: {size} movies\n")
report_lines.append("\n--- Top 5 Smallest Clusters ---\n")
for i, (cluster, size) in enumerate(cluster_sizes.sort_values().items()):
    if i < 5:
        report_lines.append(f"Cluster {cluster}: {size} movies\n")

# Sample movies
random.seed(42)
sample_clusters = random.sample(list(cluster_sizes.index), min(3, num_clusters))
report_lines.append("\n--- Sample Movies from 3 Random Clusters ---\n")
for cluster in sample_clusters:
    cluster_movies = df[labels == cluster]['title'].tolist()
    sample_titles = random.sample(cluster_movies, min(7, len(cluster_movies)))
    report_lines.append(f"\nCluster {cluster} (size: {cluster_sizes[cluster]}):\n")
    for title in sample_titles:
        report_lines.append(f" - {title}\n")

# Quality metrics
report_lines.append("\n--- Clustering Quality Metrics ---\n")
report_lines.append(f"{'Silhouette Score (subsample)':<30}: {sil_score:.4f}\n")
report_lines.append(f"{'Davies–Bouldin Index':<30}: {db_index:.4f}\n")
report_lines.append(f"{'Calinski–Harabasz Score':<30}: {ch_score:.2f}\n")
report_lines.append("==================================================\n")

# Print and save report
with open(os.path.join(CONFIG['output_dir'], 'clustering_report.txt'), 'w') as f:
    for line in report_lines:
        print(line, end='')
        f.write(line)

# Save clusters to CSV
pd.DataFrame({'MovieID': df['MovieID'], 'cluster': labels}).to_csv(
    os.path.join(CONFIG['output_dir'], 'movie_clusters_optimized.csv'), index=False
)
print(f"\nSaved clustering report to: {CONFIG['output_dir']}/clustering_report.txt")
print(f"Saved cluster assignments to: {CONFIG['output_dir']}/movie_clusters_optimized.csv")

Total genre columns in dataset: 20
Top 5 genres in dataset:
Genre:Drama: 19750 movies
Genre:Comedy: 12380 movies
Genre:Thriller: 8275 movies
Genre:Action: 6631 movies
Genre:Horror: 6335 movies
Movies with only one genre: 18774 (41.41%)
Movies with only Drama genre: 6667 (14.71%)
Feature variances (min: 0.0000, max: 0.1985, mean: 0.0512)
Noise after HDBSCAN: 6.95%
Noise points before reassignment: 3151 (6.95%)
Reassigned 2836 noise points
Noise after reassignment: 0.69%

Top 5 largest clusters:
Cluster 104: 6667 movies
Cluster 103: 5146 movies
Cluster 101: 3965 movies
Cluster 88: 2065 movies
Cluster 65: 2007 movies
Saved largest cluster (Cluster 104) movies to: final_movies_cluster1/cluster_104_movies.csv

Total genre columns in Cluster 104: 20
Non-zero genre columns in Cluster 104: 1

Top 5 genres in Cluster 104:
Genre:Drama: 6667 movies
Genre:IMAX: 0 movies
Genre:Thriller: 0 movies
Genre:Musical: 0 movies
Genre:Comedy: 0 movies
--- Data Summary ---
Total number of movies: 45335
Cluste

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import umap
import os
from matplotlib.colors import to_rgba

# Assume 'latent', 'labels', and 'CONFIG' are already defined from previous clustering steps
# Visualize clusters in 2D using UMAP with more distinct colors
print("Generating 2D UMAP visualization of clusters with more distinct colors...")
umap_2d = umap.UMAP(n_components=2, random_state=42)
latent_2d = umap_2d.fit_transform(latent)

# Get unique cluster labels, excluding noise
unique_labels = np.unique(labels[labels != -1])

# Combine multiple colormaps for more distinct colors
base_cmaps = ['tab20', 'tab20b', 'tab20c']  # Each has 20 colors, totaling 60 unique colors
base_colors = np.vstack([plt.get_cmap(cmap)(np.linspace(0, 1, 20)) for cmap in base_cmaps])

# Repeat base colors if needed to cover all clusters
num_repeats = int(np.ceil(len(unique_labels) / len(base_colors)))
colors = np.tile(base_colors, (num_repeats, 1))[:len(unique_labels)]

# Create a custom colormap for clusters
cluster_cmap = ListedColormap(colors)

# Convert noise color to RGBA for consistency
noise_color = to_rgba('gray')  # Shape (4,) for RGBA

# Create a color array for all points
plot_colors = np.zeros((len(labels), 4))  # Initialize array for RGBA colors
for i, label in enumerate(labels):
    if label == -1:
        plot_colors[i] = noise_color
    else:
        plot_colors[i] = cluster_cmap(unique_labels.tolist().index(label))

# Plot with distinct colors and noise legend
plt.figure(figsize=(12, 8))
scatter = plt.scatter(latent_2d[:, 0], latent_2d[:, 1], c=plot_colors, s=5, alpha=0.5)
plt.title('2D UMAP Visualization of Movie Clusters')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')

# Add legend for noise points
from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], marker='o', color='w', label='Noise', markerfacecolor='gray', markersize=5)]
plt.legend(handles=legend_elements, loc='upper right')

# Save the plot
plt.savefig(os.path.join(CONFIG['output_dir'], 'cluster_visualization_2d_more_distinct.png'), dpi=300)
plt.close()
print(f"Saved 2D cluster visualization with more distinct colors to: {CONFIG['output_dir']}/cluster_visualization_2d_more_distinct.png")

Generating 2D UMAP visualization of clusters with more distinct colors...
Saved 2D cluster visualization with more distinct colors to: final_movies_cluster1/cluster_visualization_2d_more_distinct.png
