In [2]:
import os
import shutil
import numpy as np
import pandas as pd
from pathlib import Path

from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.metrics import (
    silhouette_score, silhouette_samples,
    davies_bouldin_score, calinski_harabasz_score
)
from sklearn.preprocessing import StandardScaler
import hdbscan
import matplotlib.pyplot as plt

# Define output directory relative to notebook location
OUTPUT_DIR = Path.cwd() / 'users_clusters_final'
OUTPUT_DIR.mkdir(exist_ok=True)

# Utility to save figures
def save_fig(fig, name):
    """Save a matplotlib figure to the output directory and close it."""
    path = OUTPUT_DIR / name
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)

# Step 1: Load and preprocess data
def load_data(ratings_path, movies_path, users_path):
    """Load MovieLens 1M files and drop unused columns."""
    ratings = pd.read_csv(
        ratings_path, sep='::', engine='python',
        names=['userId','movieId','rating','timestamp'],
        usecols=['userId','movieId','rating']
    )
    movies = pd.read_csv(
        movies_path, sep='::', engine='python', encoding='latin-1',
        names=['movieId','title','genres']
    )
    users = pd.read_csv(
        users_path, sep='::', engine='python',
        names=['userId','gender','age','occupation','zip_code'],
        usecols=['userId','gender','age','occupation']
    )
    movies['genres'] = movies['genres'].str.split('|')
    return ratings, movies, users

# Step 2: Normalize ratings using z-score per user
def normalize_ratings(ratings):
    """Return DataFrame with normalized ratings (z-score) per user."""
    stats = ratings.groupby('userId')['rating'].agg(['mean','std'])
    stats['std'].replace(0, 1, inplace=True)
    merged = ratings.merge(stats, on='userId')
    merged['norm_rating'] = (
        (merged['rating'] - merged['mean']) / merged['std']
    ).fillna(0)
    return merged[['userId','movieId','norm_rating']]

# Step 3: Build features: TF-IDF genre + demographics
def build_features(ratings_norm, movies, users, min_ratings=50):
    """Construct user features: genre TF-IDF and encoded demographics."""
    df = ratings_norm.merge(movies, on='movieId')
    exploded = df.explode('genres')
    genre_mean = (
        exploded.groupby(['userId','genres'])['norm_rating']
        .mean().unstack(fill_value=0)
    )
    counts = ratings_norm['userId'].value_counts()
    valid = counts[counts >= min_ratings].index
    genre_mean = genre_mean.loc[valid]
    N = len(genre_mean)
    idf = np.log((N + 3) / (3 + (genre_mean != 0).sum())) + 1
    genre_tfidf = genre_mean * idf

    demo = users[users['userId'].isin(valid)].copy()
    demo = pd.get_dummies(demo, columns=['gender','occupation'], dtype=float)
    demo['age'] = demo['age'].astype(float)
    demo_index = demo['userId']
    demo_feat = demo.drop(columns=['userId'])
    scaler = StandardScaler()
    demo_scaled = pd.DataFrame(
        scaler.fit_transform(demo_feat),
        index=demo_index,
        columns=demo_feat.columns
    )

    features = pd.DataFrame(
        genre_tfidf.values,
        index=genre_tfidf.index,
        columns=genre_tfidf.columns
    ).join(demo_scaled, how='left').fillna(0)
    return features

# Step 4: PCA reduction and standardization
def reduce_dim(features, variance_threshold=0.85):
    """Apply PCA to retain variance_threshold of variance, then scale and impute."""
    pca = PCA(n_components=variance_threshold)
    reduced = pca.fit_transform(features)
    scaled = StandardScaler().fit_transform(reduced)
    imputed = SimpleImputer(strategy='mean').fit_transform(scaled)
    return imputed

# Step 5: Cluster users with HDBSCAN, fallback to KMeans
def cluster_users(data, min_cluster_size=150, fallback_k=8):
    """Cluster data with HDBSCAN; if fewer than 2 clusters, fallback to KMeans."""
    hdb = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=5,
        cluster_selection_method='eom'
    )
    labels = hdb.fit_predict(data)
    if len(set(labels) - {-1}) < 2:
        labels = KMeans(n_clusters=fallback_k, random_state=42).fit_predict(data)
    return labels

# Step 6: Evaluate clusters, save plots, and display metrics
def evaluate_and_plot(data, labels):
    """Compute metrics, silhouette and t-SNE plots, save and print them."""
    valid_idx = labels >= 0
    X, y = data[valid_idx], labels[valid_idx]

    scores = {
        'silhouette': silhouette_score(X, y),
        'davies_bouldin': davies_bouldin_score(X, y),
        'calinski_harabasz': calinski_harabasz_score(X, y)
    }

    # Print metrics
    print("Cluster Evaluation Metrics:")
    for name, val in scores.items():
        print(f"  {name}: {val:.4f}")

    # Silhouette plot
    sil_vals = silhouette_samples(X, y)
    fig, ax = plt.subplots(figsize=(8,6))
    y_lower = 10
    for cluster in np.unique(y):
        vals = np.sort(sil_vals[y == cluster])
        size = len(vals)
        y_upper = y_lower + size
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, vals, alpha=0.7)
        y_lower = y_upper + 10
    ax.axvline(scores['silhouette'], linestyle='--')
    ax.set(title='Silhouette Plot', xlabel='Coefficient')
    save_fig(fig, 'silhouette_plot.png')

    # t-SNE visualization
    tsne = TSNE(n_components=2, random_state=42)
    emb = tsne.fit_transform(X)
    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(emb[:,0], emb[:,1], c=y, s=5)
    ax.set(title='t-SNE Projection')
    save_fig(fig, 'tsne_plot.png')

    return scores

# Main execution
if __name__ == '__main__':
    RATINGS = Path.cwd() / 'data/ratings.dat'
    MOVIES  = Path.cwd() / 'data/movies.dat'
    USERS   = Path.cwd() / 'data/users.dat'

    # Process data
    ratings, movies, users = load_data(RATINGS, MOVIES, USERS)
    ratings_norm   = normalize_ratings(ratings)
    features       = build_features(ratings_norm, movies, users)
    reduced        = reduce_dim(features)
    labels         = cluster_users(reduced)

    # Save cluster assignments
    cluster_df = pd.DataFrame({'userId': features.index, 'cluster': labels})
    user_csv   = OUTPUT_DIR / 'user_clusters.csv'
    cluster_df.to_csv(user_csv, index=False)
    print(f"Saved user_clusters.csv to {user_csv}")

    # Evaluate and plot
    metrics     = evaluate_and_plot(reduced, labels)
    metrics_csv = OUTPUT_DIR / 'cluster_metrics.csv'
    pd.Series(metrics).to_csv(metrics_csv)
    print(f"Saved cluster_metrics.csv to {metrics_csv}")




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stats['std'].replace(0, 1, inplace=True)


Saved user_clusters.csv to C:\Users\PC\Desktop\users-clusters\users_clusters_final\user_clusters.csv
Cluster Evaluation Metrics:
  silhouette: 0.5317
  davies_bouldin: 0.7462
  calinski_harabasz: 1278.0854
Saved cluster_metrics.csv to C:\Users\PC\Desktop\users-clusters\users_clusters_final\cluster_metrics.csv
