In [1]:
#Imports 
import pandas as pd
import numpy as np
from datetime import datetime
from sqlalchemy import text
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns
from Database.database import engine, SessionLocal
from Database.models import FactUserAnalyticsSnapshot

# Configuration of display
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("Imported successfully")

Imported successfully


In [2]:
#Loading data for K-means

snapshot_date_key = int(datetime.now().strftime("%Y%m%d"))

print("="*80)
print("LOADING DATA FOR K-MEANS CLUSTERING (ALL USERS)")
print("="*80)

print("Loading data from fact_user_analytics_snapshot...")
with SessionLocal() as session:
    records = session.query(FactUserAnalyticsSnapshot).filter(
        FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key
    ).all()
    
    data = [{
        'user_key': r.user_key,
        'subscription_plan_key': r.subscription_plan_key,
        'rfm_recency': r.rfm_recency,
        'rfm_frequency': r.rfm_frequency,
        'rfm_monetary': r.rfm_monetary,
        'rfm_segment': r.rfm_segment,
        'segment_label': r.segment_label,
        'engagement_level': r.engagement_level,
        'churn_probability': r.churn_probability if r.churn_probability is not None else 0.0,
        'churn_risk_band': r.churn_risk_band
    } for r in records]

df = pd.DataFrame(data)

df['is_free_tier'] = (df['subscription_plan_key'] == 1).astype(int)
df['is_premium_tier'] = (df['subscription_plan_key'].isin([4, 5])).astype(int)
df['is_standard_tier'] = (df['subscription_plan_key'].isin([2, 3])).astype(int)

print(f"Loaded {len(df):,} total users")

print("\nUser Distribution by Tier:")
tier_counts = df['subscription_plan_key'].value_counts().sort_index()
tier_names = {1: 'Free', 2: 'Standard Monthly', 3: 'Standard Annual', 
              4: 'Premium Monthly', 5: 'Premium Annual'}
for tier, count in tier_counts.items():
    pct = count / len(df) * 100
    print(f"  {tier_names.get(tier, f'Tier {tier}'):20s}: {count:,} ({pct:.1f}%)")

print("\nCurrent Data Summary:")
print(f"  RFM Recency range:     {df['rfm_recency'].min():.0f} - {df['rfm_recency'].max():.0f} days")
print(f"  RFM Frequency range:   {df['rfm_frequency'].min():.0f} - {df['rfm_frequency'].max():.0f} interactions")
print(f"  RFM Monetary range:    ${df['rfm_monetary'].min():.2f} - ${df['rfm_monetary'].max():.2f}")
print(f"  Avg Churn Probability: {df['churn_probability'].mean():.3f}")


LOADING DATA FOR K-MEANS CLUSTERING (ALL USERS)
Loading data from fact_user_analytics_snapshot...
Loaded 1,000 total users

User Distribution by Tier:
  Free                : 198 (19.8%)
  Standard Monthly    : 204 (20.4%)
  Standard Annual     : 211 (21.1%)
  Premium Monthly     : 195 (19.5%)
  Premium Annual      : 192 (19.2%)

Current Data Summary:
  RFM Recency range:     0 - 60 days
  RFM Frequency range:   0 - 30 interactions
  RFM Monetary range:    $509.95 - $629.90
  Avg Churn Probability: 0.351


In [3]:
#Feature selection and scaling
print("="*80)
print("PREPARING FEATURES FOR K-MEANS (FREE + PREMIUM)")
print("="*80)

clustering_features = [
    'rfm_recency',         
    'rfm_frequency',       
    'rfm_monetary',        
    'churn_probability',  
    'is_free_tier',        
    'is_premium_tier'     
]

X = df[clustering_features].copy()
X = X.fillna(0)

print(f"Selected {len(clustering_features)} features for clustering:")
for feat in clustering_features:
    print(f"  - {feat}")

print(f"\nFeature Statistics:")
print(X.describe())

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nFeatures standardized (mean=0, std=1)")
print(f"   Shape: {X_scaled.shape}")
print(f"   Free users:    {df['is_free_tier'].sum():,}")
print(f"   Premium users: {(df['is_premium_tier'] | df['is_standard_tier']).sum():,}")


PREPARING FEATURES FOR K-MEANS (FREE + PREMIUM)
Selected 6 features for clustering:
  - rfm_recency
  - rfm_frequency
  - rfm_monetary
  - churn_probability
  - is_free_tier
  - is_premium_tier

Feature Statistics:
       rfm_recency  rfm_frequency  rfm_monetary  churn_probability  is_free_tier  is_premium_tier
count    1000.0000    1000.000000   1000.000000        1000.000000   1000.000000      1000.000000
mean       13.8520      13.010000    591.576960           0.351000      0.198000         0.387000
std        17.2563       8.647119     16.925079           0.317854      0.398692         0.487307
min         0.0000       0.000000    509.950000           0.000000      0.000000         0.000000
25%         3.0000       5.000000    584.920000           0.122539      0.000000         0.000000
50%         6.0000      12.000000    584.920000           0.228911      0.000000         0.000000
75%        21.2500      20.000000    599.910000           0.568582      0.000000         1.000000
m

In [4]:
# Determining optimal K using Elbow method
print("="*80)
print("FINDING OPTIMAL NUMBER OF CLUSTERS")
print("="*80)

k_range = range(2, 11)
inertias = []
silhouette_scores = []
davies_bouldin_scores = []

print("Testing K from 2 to 10...")
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))
    davies_bouldin_scores.append(davies_bouldin_score(X_scaled, kmeans.labels_))
    
    print(f"  K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={silhouette_scores[-1]:.3f}, DB={davies_bouldin_scores[-1]:.3f}")

results_df = pd.DataFrame({
    'K': list(k_range),
    'Inertia': inertias,
    'Silhouette_Score': silhouette_scores,
    'Davies_Bouldin_Score': davies_bouldin_scores
})

print("\nCLUSTERING METRICS SUMMARY:")
print(results_df.to_string(index=False))

# Higher Silhouette = better (closer to 1)
# Lower Davies-Bouldin = better (closer to 0)
optimal_k_silhouette = results_df.loc[results_df['Silhouette_Score'].idxmax(), 'K']
optimal_k_db = results_df.loc[results_df['Davies_Bouldin_Score'].idxmin(), 'K']

print(f"\nRECOMMENDED K:")
print(f"  Based on Silhouette Score: K = {optimal_k_silhouette}")
print(f"  Based on Davies-Bouldin:   K = {optimal_k_db}")
print(f"\nSuggestion: Use K = {optimal_k_silhouette} or K = {optimal_k_db}")

recommended_k = 5
print(f"\nWe'll use K = {recommended_k} for business segmentation")


FINDING OPTIMAL NUMBER OF CLUSTERS
Testing K from 2 to 10...
  K=2: Inertia=4514.74, Silhouette=0.285, DB=1.316
  K=3: Inertia=3241.24, Silhouette=0.342, DB=1.193
  K=4: Inertia=2619.20, Silhouette=0.337, DB=1.216
  K=5: Inertia=2355.29, Silhouette=0.317, DB=1.130
  K=6: Inertia=2102.41, Silhouette=0.317, DB=1.134
  K=7: Inertia=1884.23, Silhouette=0.334, DB=1.157
  K=8: Inertia=1671.73, Silhouette=0.334, DB=1.192
  K=9: Inertia=1463.18, Silhouette=0.353, DB=1.060
  K=10: Inertia=1358.81, Silhouette=0.351, DB=1.076

CLUSTERING METRICS SUMMARY:
 K     Inertia  Silhouette_Score  Davies_Bouldin_Score
 2 4514.742032          0.284688              1.315752
 3 3241.240919          0.341999              1.192736
 4 2619.196245          0.337357              1.216405
 5 2355.294648          0.316829              1.130391
 6 2102.411773          0.316901              1.133734
 7 1884.229405          0.334447              1.156864
 8 1671.733024          0.334302              1.191963
 9 1463.17

In [5]:
#Final K-Means Model
print("="*80)
print(f"TRAINING K-MEANS MODEL (K={recommended_k})")
print("="*80)

kmeans_final = KMeans(
    n_clusters=recommended_k, 
    random_state=42, 
    n_init=20, 
    max_iter=300
)

kmeans_final.fit(X_scaled)

df['kmeans_cluster'] = kmeans_final.labels_

print(f"K-Means clustering complete!")
print(f"   Clusters: {recommended_k}")
print(f"   Inertia: {kmeans_final.inertia_:.2f}")

print("\nCLUSTER DISTRIBUTION:")
cluster_counts = df['kmeans_cluster'].value_counts().sort_index()
for cluster, count in cluster_counts.items():
    pct = count / len(df) * 100
    print(f"  Cluster {cluster}: {count:,} users ({pct:.1f}%)")

print("\nCLUSTER CHARACTERISTICS:")
cluster_stats = df.groupby('kmeans_cluster')[clustering_features].mean()
print(cluster_stats.round(2))


TRAINING K-MEANS MODEL (K=5)
K-Means clustering complete!
   Clusters: 5
   Inertia: 2355.29

CLUSTER DISTRIBUTION:
  Cluster 0: 192 users (19.2%)
  Cluster 1: 297 users (29.7%)
  Cluster 2: 198 users (19.8%)
  Cluster 3: 145 users (14.5%)
  Cluster 4: 168 users (16.8%)

CLUSTER CHARACTERISTICS:
                rfm_recency  rfm_frequency  rfm_monetary  churn_probability  is_free_tier  is_premium_tier
kmeans_cluster                                                                                            
0                     40.55           6.93        592.18               0.88           0.0             0.39
1                      3.65          15.24        591.13               0.30           0.0             0.00
2                     15.38          12.93        592.34               0.00           1.0             0.00
3                      3.71          22.39        596.91               0.19           0.0             1.00
4                      8.33           8.01        586.17     

In [6]:
# Assigning business labels 
print("="*80)
print("LABELING CLUSTERS (FREE + PREMIUM USERS)")
print("="*80)

cluster_profiles = df.groupby('kmeans_cluster').agg({
    'rfm_recency': 'mean',
    'rfm_frequency': 'mean',
    'rfm_monetary': 'mean',
    'churn_probability': 'mean',
    'is_free_tier': 'sum',
    'is_premium_tier': 'sum',
    'user_key': 'count'
}).round(2)

cluster_profiles.columns = [
    'Avg_Recency', 'Avg_Frequency', 'Avg_Monetary', 
    'Avg_Churn_Prob', 'Free_Users', 'Premium_Users', 'Total_Users'
]

print("\nCLUSTER PROFILES:")
print(cluster_profiles)

def assign_cluster_label(row):
    """
    Assign labels considering free vs premium users
    """
    recency = row['rfm_recency']
    frequency = row['rfm_frequency']
    monetary = row['rfm_monetary']
    churn_prob = row['churn_probability']
    is_free = row['is_free_tier']
    
    if is_free == 1:
        if recency < 15 and frequency > 50:
            return 'Active Free Users (High Conversion Potential)'
        elif recency < 30 and frequency > 20:
            return 'Engaged Free Users'
        elif recency > 60 or frequency < 5:
            return 'Dormant Free Users'
        else:
            return 'Casual Free Users'
    
    else:
        if recency < 15 and frequency > 100 and monetary > 200 and churn_prob < 0.3:
            return 'Champions (Premium)'
        elif recency < 30 and frequency > 50 and monetary > 100 and churn_prob < 0.5:
            return 'Loyal Customers (Premium)'
        elif recency < 30 and churn_prob < 0.5:
            return 'Promising Premium Users'
        elif churn_prob > 0.6 or recency > 60:
            return 'At Risk Premium (Retention Focus)'
        else:
            return 'Standard Premium Users'

df['kmeans_segment_label'] = df.apply(assign_cluster_label, axis=1)

print("\nCLUSTER LABELS ASSIGNED:")
label_dist = df['kmeans_segment_label'].value_counts()
for label, count in label_dist.items():
    pct = count / len(df) * 100
    print(f"  {label:45s}: {count:,} ({pct:.1f}%)")

print("\nCLUSTER COMPOSITION (Free vs Premium):")
cluster_composition = df.groupby(['kmeans_cluster', 'is_free_tier']).size().unstack(fill_value=0)
cluster_composition.columns = ['Premium', 'Free']
cluster_composition['Total'] = cluster_composition.sum(axis=1)
cluster_composition['% Free'] = (cluster_composition['Free'] / cluster_composition['Total'] * 100).round(1)
print(cluster_composition)

LABELING CLUSTERS (FREE + PREMIUM USERS)

CLUSTER PROFILES:
                Avg_Recency  Avg_Frequency  Avg_Monetary  Avg_Churn_Prob  Free_Users  Premium_Users  Total_Users
kmeans_cluster                                                                                                  
0                     40.55           6.93        592.18            0.88           0             74          192
1                      3.65          15.24        591.13            0.30           0              0          297
2                     15.38          12.93        592.34            0.00         198              0          198
3                      3.71          22.39        596.91            0.19           0            145          145
4                      8.33           8.01        586.17            0.39           0            168          168

CLUSTER LABELS ASSIGNED:
  Promising Premium Users                      : 500 (50.0%)
  At Risk Premium (Retention Focus)            : 224 (22.4%)
 

In [7]:
#Updating fact_user_analytics_snapshot with K-Means 
print("="*80)
print("UPDATING DATABASE WITH K-MEANS CLUSTERS")
print("="*80)

update_df = df[['user_key', 'kmeans_cluster', 'kmeans_segment_label']].copy()

print(f"Updating {len(update_df):,} user records...")

updated_count = 0
with SessionLocal() as session:
    for idx, row in update_df.iterrows():
        record = session.query(FactUserAnalyticsSnapshot).filter(
            FactUserAnalyticsSnapshot.user_key == int(row['user_key']),
            FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key
        ).first()
        
        if record:
            record.kmeans_cluster = int(row['kmeans_cluster'])
            record.kmeans_segment_label = row['kmeans_segment_label']
            updated_count += 1
            
            if updated_count % 500 == 0:
                session.commit()  # Commit in batches
                print(f"  Updated {updated_count:,} records...")
        
    session.commit()

print(f"\nUpdated {updated_count:,} records in fact_user_analytics_snapshot")

# Verifying updates
with SessionLocal() as session:
    total_records = session.query(FactUserAnalyticsSnapshot).filter(
        FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key
    ).count()
    
    records_with_cluster = session.query(FactUserAnalyticsSnapshot).filter(
        FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key,
        FactUserAnalyticsSnapshot.kmeans_cluster.isnot(None)
    ).count()
    
    records_with_label = session.query(FactUserAnalyticsSnapshot).filter(
        FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key,
        FactUserAnalyticsSnapshot.kmeans_segment_label.isnot(None)
    ).count()
    
    print("\nVerification:")
    print(f"  Total records:         {total_records:,}")
    print(f"  With kmeans_cluster:   {records_with_cluster:,}")
    print(f"  With segment label:    {records_with_label:,}")

print("\nDatabase update complete!")


UPDATING DATABASE WITH K-MEANS CLUSTERS
Updating 1,000 user records...
  Updated 500 records...
  Updated 1,000 records...

Updated 1,000 records in fact_user_analytics_snapshot

Verification:
  Total records:         1,000
  With kmeans_cluster:   1,000
  With segment label:    1,000

Database update complete!
