In [1]:
#Imports
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from sqlalchemy import text
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from Database.database import engine, SessionLocal
from sqlalchemy.orm import Session
from Database.models import FactUserAnalyticsSnapshot, FactUserDailyActivity, ModelPerformanceMetrics
import matplotlib.pyplot as plt
import seaborn as sns

#Display Configuration
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("Imported successfully")

Imported successfully


In [2]:
#Loading RFM + Activity data using ORM

snapshot_date_key = int(datetime.now().strftime("%Y%m%d"))
start_date = datetime.now() - timedelta(days=90)
start_date_key = int(start_date.strftime("%Y%m%d"))

print("="*80)
print("LOADING RFM + ACTIVITY DATA FROM DATABASE")
print("="*80)

#Including users with subscriptions only
print("Loading RFM data...")
with SessionLocal() as session:
    rfm_records = session.query(FactUserAnalyticsSnapshot).filter(
        FactUserAnalyticsSnapshot.snapshot_date_key == snapshot_date_key,
        FactUserAnalyticsSnapshot.subscription_plan_key.in_([2, 3, 4, 5])
    ).all()
    
    rfm_data = [{
        'user_key': r.user_key,
        'snapshot_date_key': r.snapshot_date_key,
        'subscription_plan_key': r.subscription_plan_key,
        'rfm_recency': r.rfm_recency,
        'rfm_frequency': r.rfm_frequency,
        'rfm_monetary': r.rfm_monetary,
        'rfm_r_score': r.rfm_r_score,
        'rfm_f_score': r.rfm_f_score,
        'rfm_m_score': r.rfm_m_score,
        'rfm_segment': r.rfm_segment,
        'segment_label': r.segment_label,
        'engagement_level': r.engagement_level
    } for r in rfm_records]

rfm_df = pd.DataFrame(rfm_data)
print(f"Loaded {len(rfm_df):,} RFM records")

print("Loading activity data from fact_user_daily_activity...")

with SessionLocal() as session:
    activity_records = session.query(FactUserDailyActivity).all()
    activity_data = [{
        'user_key': r.user_key,
        'date_key': r.date_key,
        'subscription_plan_key': r.subscription_plan_key,
        'campaign_key': r.campaign_key,
        'logins_count': r.logins_count,
        'sessions_count': r.sessions_count,
        'minutes_watched': r.minutes_watched,
        'lessons_completed': r.lessons_completed,
        'quizzes_attempted': r.quizzes_attempted,
        'distinct_courses_accessed': r.distinct_courses_accessed,
        'active_days_last_30d': r.active_days_last_30d,
        'days_since_last_login': r.days_since_last_login,
        'is_inactive_7d_flag': r.is_inactive_7d_flag,
        'active_courses_count': r.active_courses_count,
        'completed_courses_total': r.completed_courses_total,
    } for r in activity_records]

activity_df = pd.DataFrame(activity_data)

print("Activity DataFrame columns:", activity_df.columns.tolist())
print("Activity DataFrame shape:", activity_df.shape)
print(activity_df.head())

print("Aggregating activity metrics...")
activity_agg = activity_df.groupby('user_key').agg({
    'logins_count': 'sum',
    'sessions_count': 'sum',
    'minutes_watched': 'sum',
    'lessons_completed': 'sum',
    'quizzes_attempted': 'sum',
    'distinct_courses_accessed': 'max',
    'active_days_last_30d': 'mean',
    'days_since_last_login': 'min',
    'is_inactive_7d_flag': 'sum',
    'active_courses_count': 'max',
    'completed_courses_total': 'max'
}).reset_index()

activity_agg.columns = [
    'user_key',
    'logins_90d',
    'sessions_90d',
    'minutes_watched_90d',
    'lessons_completed_90d',
    'quizzes_attempted_90d',
    'courses_accessed',
    'avg_active_days_30d',
    'days_since_last_login',
    'inactive_7d_count',
    'active_courses',
    'completed_courses'
]

print("Detecting subscription changes...")

user_subscription_changes = activity_df.groupby('user_key').agg({
    'subscription_plan_key': ['first', 'last', 'min', 'max'],
    'date_key': ['min', 'max']
}).reset_index()

user_subscription_changes.columns = [
    'user_key', 
    'first_plan', 'last_plan', 
    'min_plan', 'max_plan',
    'first_date', 'last_date'
]

user_subscription_changes['has_downgraded'] = (
    user_subscription_changes['last_plan'] < user_subscription_changes['max_plan']
).astype(int)

downgrade_df = user_subscription_changes[['user_key', 'has_downgraded']]

print(f"Detected downgrades:")
print(f"  Users with downgrades: {(downgrade_df['has_downgraded'] == 1).sum():,}")
print(f"  Users without downgrades: {(downgrade_df['has_downgraded'] == 0).sum():,}")

print("Merging datasets...")
df = rfm_df.merge(activity_agg, on='user_key', how='left')
df = df.merge(downgrade_df, on='user_key', how='left')

df['logins_90d'] = df['logins_90d'].fillna(0)
df['sessions_90d'] = df['sessions_90d'].fillna(0)
df['minutes_watched_90d'] = df['minutes_watched_90d'].fillna(0)
df['lessons_completed_90d'] = df['lessons_completed_90d'].fillna(0)
df['quizzes_attempted_90d'] = df['quizzes_attempted_90d'].fillna(0)
df['courses_accessed'] = df['courses_accessed'].fillna(0)
df['avg_active_days_30d'] = df['avg_active_days_30d'].fillna(0)
df['days_since_last_login'] = df['days_since_last_login'].fillna(999)
df['inactive_7d_count'] = df['inactive_7d_count'].fillna(0)
df['active_courses'] = df['active_courses'].fillna(0)
df['completed_courses'] = df['completed_courses'].fillna(0)
df['has_downgraded'] = df['has_downgraded'].fillna(0).astype(int)

print(f"Loaded {len(df):,} premium users with activity data")

#To be considered a churned customer, multiple criteria must be met as defined below:
df['is_churned'] = (
    (
        (df['rfm_recency'] > 45) | 
        (df['segment_label'] == 'Recently Churned')  
    ) |
    (
        (df['has_downgraded'] == 1) & (df['rfm_recency'] > 14) 
    ) |
    (
        (df['logins_90d'] == 0) & (df['lessons_completed_90d'] == 0) 
    )
).astype(int)

print("\nChurn Distribution:")
churn_counts = df['is_churned'].value_counts()
print(f"  Retained (0):  {churn_counts.get(0, 0):,} ({churn_counts.get(0, 0)/len(df)*100:.1f}%)")
print(f"  Churned (1):   {churn_counts.get(1, 0):,} ({churn_counts.get(1, 0)/len(df)*100:.1f}%)")

print("\nChurn Breakdown by Reason:")
print(f"  Inactive (>30 days):          {(df['rfm_recency'] > 30).sum():,}")
print(f"  In churned segments:          {df['segment_label'].isin(['Recently Churned', 'Dormant Premium']).sum():,}")
print(f"  Downgraded subscription:      {(df['has_downgraded'] == 1).sum():,}")
print(f"  Very low engagement:          {((df['logins_90d'] < 2) & (df['lessons_completed_90d'] == 0)).sum():,}")
print(f"  Total churned (with overlap): {df['is_churned'].sum():,}")

print("\nActivity Statistics:")
print(f"  Avg logins (90d):             {df['logins_90d'].mean():.1f}")
print(f"  Avg minutes watched (90d):    {df['minutes_watched_90d'].mean():.1f}")
print(f"  Avg lessons completed (90d):  {df['lessons_completed_90d'].mean():.1f}")
print(f"  Avg active courses:           {df['active_courses'].mean():.1f}")

print("\nChurn target variable created with activity features")

rfm_df = df.copy()


LOADING RFM + ACTIVITY DATA FROM DATABASE
Loading RFM data...
Loaded 802 RFM records
Loading activity data from fact_user_daily_activity...
Activity DataFrame columns: ['user_key', 'date_key', 'subscription_plan_key', 'campaign_key', 'logins_count', 'sessions_count', 'minutes_watched', 'lessons_completed', 'quizzes_attempted', 'distinct_courses_accessed', 'active_days_last_30d', 'days_since_last_login', 'is_inactive_7d_flag', 'active_courses_count', 'completed_courses_total']
Activity DataFrame shape: (49565, 15)
   user_key  date_key  subscription_plan_key  campaign_key  logins_count  sessions_count  minutes_watched  lessons_completed  quizzes_attempted  distinct_courses_accessed  active_days_last_30d  days_since_last_login  is_inactive_7d_flag  active_courses_count  completed_courses_total
0       788  20250830                      1            18             2               2              121                  3                  5                          3                    19     

In [3]:
#Feature engineering
print("="*80)
print("FEATURE ENGINEERING (90-DAY PERIOD)")
print("="*80)

# RFM derived features
rfm_df['rfm_score_total'] = rfm_df['rfm_r_score'] + rfm_df['rfm_f_score'] + rfm_df['rfm_m_score']
rfm_df['rfm_score_avg'] = rfm_df['rfm_score_total'] / 3

# Activity-based features
rfm_df['avg_session_duration'] = np.where(
    rfm_df['sessions_90d'] > 0,
    rfm_df['minutes_watched_90d'] / rfm_df['sessions_90d'],
    0
)

rfm_df['login_frequency'] = rfm_df['logins_90d'] / 90 
rfm_df['lesson_completion_rate'] = np.where(
    rfm_df['sessions_90d'] > 0,
    rfm_df['lessons_completed_90d'] / rfm_df['sessions_90d'],
    0
)

rfm_df['quiz_engagement_rate'] = np.where(
    rfm_df['lessons_completed_90d'] > 0,
    rfm_df['quizzes_attempted_90d'] / rfm_df['lessons_completed_90d'],
    0
)

rfm_df['course_completion_ratio'] = np.where(
    rfm_df['courses_accessed'] > 0,
    rfm_df['completed_courses'] / rfm_df['courses_accessed'],
    0
)

# Engagement score (0-100)
rfm_df['engagement_score'] = (
    (rfm_df['logins_90d'] / rfm_df['logins_90d'].max() * 100) * 0.2 +
    (rfm_df['minutes_watched_90d'] / rfm_df['minutes_watched_90d'].max() * 100) * 0.3 +
    (rfm_df['lessons_completed_90d'] / rfm_df['lessons_completed_90d'].max() * 100) * 0.3 +
    (rfm_df['active_courses'] / rfm_df['active_courses'].max() * 100) * 0.2
).fillna(0)

# Risk flags 
rfm_df['high_recency_risk'] = (rfm_df['rfm_recency'] > 45).astype(int) 
rfm_df['low_activity_risk'] = (rfm_df['logins_90d'] < 5).astype(int)   
rfm_df['no_lessons_risk'] = (rfm_df['lessons_completed_90d'] == 0).astype(int)
rfm_df['no_active_courses_risk'] = (rfm_df['active_courses'] == 0).astype(int)
rfm_df['low_watch_time_risk'] = (rfm_df['minutes_watched_90d'] < 120).astype(int) 

# Subscription features
rfm_df['is_premium_tier'] = rfm_df['subscription_plan_key'].isin([4, 5]).astype(int)
rfm_df['is_annual'] = rfm_df['subscription_plan_key'].isin([3, 5]).astype(int)

# Interaction features
rfm_df['engagement_x_frequency'] = rfm_df['engagement_score'] * rfm_df['rfm_frequency']
rfm_df['recency_x_logins'] = rfm_df['rfm_recency'] * rfm_df['logins_90d']

print(f"Total features available: {len(rfm_df.columns)}")

FEATURE ENGINEERING (90-DAY PERIOD)
Total features available: 42


In [4]:
#Prepare features for training
print("="*80)
print("PREPARING TRAINING DATA (90-DAY FEATURES)")
print("="*80)

# Select features for model 
feature_columns = [
    'rfm_recency',
    'rfm_frequency',
    'rfm_monetary',
    
    # Activity features
    'logins_90d',
    'sessions_90d',
    'minutes_watched_90d',
    'lessons_completed_90d',
    'quizzes_attempted_90d',
    'courses_accessed',
    'avg_active_days_30d',
    'days_since_last_login',
    'active_courses',
    'completed_courses',
    
    # Derived activity features
    'avg_session_duration',
    'login_frequency',
    'lesson_completion_rate',
    'quiz_engagement_rate',
    'course_completion_ratio',
    'engagement_score',
    
    # Risk flags
    'high_recency_risk',
    'low_activity_risk',
    'no_lessons_risk',
    'no_active_courses_risk',
    'low_watch_time_risk',
    
    # Subscription features
    'is_premium_tier',
    'is_annual',
    'has_downgraded'
]

X = rfm_df[feature_columns].copy()
y = rfm_df['is_churned'].copy()

X = X.replace([np.inf, -np.inf], 0)
X = X.fillna(0)

print(f"Features prepared:")
print(f"  Shape: {X.shape}")
print(f"  Features: {len(feature_columns)}")
print(f"  Target: {len(y)} samples")
print(f"\nFeature Categories:")
print(f"  RFM raw features:      3 (recency, frequency, monetary)")
print(f"  Activity features:    10 (90-day window)")
print(f"  Derived features:      6")
print(f"  Risk flags:            5")
print(f"  Subscription:          3")
print(f"  Total:                {len(feature_columns)}")

# Making the split into train and test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\nTrain/Test Split:")
print(f"  Training set:   {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"  Test set:       {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"\n  Train - Churned: {y_train.sum():,} ({y_train.sum()/len(y_train)*100:.1f}%)")
print(f"  Test - Churned:  {y_test.sum():,} ({y_test.sum()/len(y_test)*100:.1f}%)")

PREPARING TRAINING DATA (90-DAY FEATURES)
Features prepared:
  Shape: (802, 27)
  Features: 27
  Target: 802 samples

Feature Categories:
  RFM raw features:      3 (recency, frequency, monetary)
  Activity features:    10 (90-day window)
  Derived features:      6
  Risk flags:            5
  Subscription:          3
  Total:                27

Train/Test Split:
  Training set:   641 samples (79.9%)
  Test set:       161 samples (20.1%)

  Train - Churned: 254 (39.6%)
  Test - Churned:  64 (39.8%)


In [5]:
#We will be using Random Forest Classifier for predictions
print("="*80)
print("TRAINING CHURN PREDICTION MODEL")
print("="*80)

model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    class_weight='balanced', 
    n_jobs=-1
)

print("Training Random Forest Classifier...")
print(f"  n_estimators: 100")
print(f"  max_depth: 10")
print(f"  class_weight: balanced")
print()

model.fit(X_train, y_train)
print("Model training complete!")

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

y_train_proba = model.predict_proba(X_train)[:, 1]
y_test_proba = model.predict_proba(X_test)[:, 1]

print("Predictions generated")


TRAINING CHURN PREDICTION MODEL
Training Random Forest Classifier...
  n_estimators: 100
  max_depth: 10
  class_weight: balanced

Model training complete!
Predictions generated


In [6]:
#Model performance evaluation
print("="*80)
print("MODEL PERFORMANCE EVALUATION")
print("="*80)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

train_precision = precision_score(y_train, y_train_pred)
test_precision = precision_score(y_test, y_test_pred)

train_recall = recall_score(y_train, y_train_pred)
test_recall = recall_score(y_test, y_test_pred)

train_f1 = f1_score(y_train, y_train_pred)
test_f1 = f1_score(y_test, y_test_pred)

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print("\nTRAINING SET PERFORMANCE:")
print(f"  Accuracy:   {train_accuracy*100:.1f}%")
print(f"  Precision:  {train_precision*100:.1f}%")
print(f"  Recall:     {train_recall*100:.1f}%")
print(f"  F1-Score:   {train_f1*100:.1f}%")
print(f"  AUC-ROC:    {train_auc:.3f}")

print("\nTEST SET PERFORMANCE:")
print(f"  Accuracy:   {test_accuracy*100:.1f}%")
print(f"  Precision:  {test_precision*100:.1f}%")
print(f"  Recall:     {test_recall*100:.1f}%")
print(f"  F1-Score:   {test_f1*100:.1f}%")
print(f"  AUC-ROC:    {test_auc:.3f}")

print("\nCONFUSION MATRIX (Test Set):")
cm = confusion_matrix(y_test, y_test_pred)
print(f"  True Negatives:  {cm[0,0]:,} (Correctly predicted retained)")
print(f"  False Positives: {cm[0,1]:,} (Predicted churned, actually retained)")
print(f"  False Negatives: {cm[1,0]:,} (Predicted retained, actually churned)")
print(f"  True Positives:  {cm[1,1]:,} (Correctly predicted churned)")

print("\nDETAILED CLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred, target_names=['Retained', 'Churned']))

# Store metrics to database
with SessionLocal() as session:
    metrics = ModelPerformanceMetrics(
        snapshot_date_key=snapshot_date_key,
        model_type='churn_prediction',
        model_version='v1.0',
        accuracy=round(test_accuracy, 4),
        precision=round(test_precision, 4),
        recall=round(test_recall, 4),
        f1_score=round(test_f1, 4),
        auc_roc=round(test_auc, 4),
        train_samples=len(X_train),
        test_samples=len(X_test),
        true_negatives=int(cm[0, 0]),
        false_positives=int(cm[0, 1]),
        false_negatives=int(cm[1, 0]),
        true_positives=int(cm[1, 1])
    )
    session.add(metrics)
    session.commit()
    print(f"Saved model metrics (ID: {metrics.model_performance_id})")

print("\nModel evaluation complete!")


MODEL PERFORMANCE EVALUATION

TRAINING SET PERFORMANCE:
  Accuracy:   93.6%
  Precision:  94.9%
  Recall:     88.6%
  F1-Score:   91.6%
  AUC-ROC:    0.983

TEST SET PERFORMANCE:
  Accuracy:   83.2%
  Precision:  81.4%
  Recall:     75.0%
  F1-Score:   78.0%
  AUC-ROC:    0.936

CONFUSION MATRIX (Test Set):
  True Negatives:  86 (Correctly predicted retained)
  False Positives: 11 (Predicted churned, actually retained)
  False Negatives: 16 (Predicted retained, actually churned)
  True Positives:  48 (Correctly predicted churned)

DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Retained       0.84      0.89      0.86        97
     Churned       0.81      0.75      0.78        64

    accuracy                           0.83       161
   macro avg       0.83      0.82      0.82       161
weighted avg       0.83      0.83      0.83       161

Saved model metrics (ID: 1)

Model evaluation complete!


In [7]:
# Analyzing and saving feature importance
print("="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

feature_importance = pd.DataFrame({
    'feature_name': feature_columns,
    'importance_score': model.feature_importances_
}).sort_values('importance_score', ascending=False)

feature_importance['importance_score'] = (
    feature_importance['importance_score'] / feature_importance['importance_score'].sum() * 100
)

feature_importance['importance_rank'] = range(1, len(feature_importance) + 1)

print("\nTOP 15 MOST IMPORTANT FEATURES:")
print(feature_importance.head(15).to_string(index=False))

feature_importance_db = feature_importance.copy()
feature_importance_db['snapshot_date_key'] = snapshot_date_key
feature_importance_db['model_type'] = 'churn_prediction'
feature_importance_db['model_version'] = 'v1.0'

with engine.begin() as conn:
    feature_importance_db.to_sql(
        'feature_importance',
        con=conn,
        if_exists='append',
        index=False
    )

print(f"\nFeature importance saved to database ({len(feature_importance_db)} features)")


FEATURE IMPORTANCE ANALYSIS

TOP 15 MOST IMPORTANT FEATURES:
          feature_name  importance_score  importance_rank
           rfm_recency         38.379097                1
         rfm_frequency         16.904763                2
          rfm_monetary          9.930012                3
     high_recency_risk          7.223636                4
        has_downgraded          3.742220                5
   minutes_watched_90d          2.767173                6
  quiz_engagement_rate          2.420689                7
 quizzes_attempted_90d          2.190667                8
      engagement_score          2.058060                9
lesson_completion_rate          2.051757               10
  avg_session_duration          1.989870               11
   avg_active_days_30d          1.856666               12
 lessons_completed_90d          1.703641               13
            logins_90d          1.469900               14
          sessions_90d          1.465102               15

Feature im

In [8]:
#Predicting churn probability for all users
print("="*80)
print("PREDICTING CHURN FOR ALL USERS")
print("="*80)

X_full = rfm_df[feature_columns].copy()
X_full = X_full.replace([np.inf, -np.inf], 0).fillna(0)

churn_predictions = model.predict_proba(X_full)[:, 1]
rfm_df['churn_probability_predicted'] = churn_predictions

def classify_churn_risk(prob):
    if prob >= 0.7:
        return 'High Risk'
    elif prob >= 0.4:
        return 'Medium Risk'
    elif prob >= 0.2:
        return 'Low Risk'
    else:
        return 'Minimal Risk'

rfm_df['churn_risk_band_predicted'] = rfm_df['churn_probability_predicted'].apply(classify_churn_risk)

print(f"Churn predictions generated for {len(rfm_df):,} users")

print("\nChurn Risk Distribution:")
risk_dist = rfm_df['churn_risk_band_predicted'].value_counts()
for risk, count in risk_dist.items():
    pct = count / len(rfm_df) * 100
    print(f"  {risk:20s}: {count:,} ({pct:.1f}%)")

print("\nChurn Probability Statistics:")
print(f"  Mean:    {rfm_df['churn_probability_predicted'].mean():.3f}")
print(f"  Median:  {rfm_df['churn_probability_predicted'].median():.3f}")
print(f"  Min:     {rfm_df['churn_probability_predicted'].min():.3f}")
print(f"  Max:     {rfm_df['churn_probability_predicted'].max():.3f}")


PREDICTING CHURN FOR ALL USERS
Churn predictions generated for 802 users

Churn Risk Distribution:
  Minimal Risk        : 246 (30.7%)
  High Risk           : 198 (24.7%)
  Low Risk            : 194 (24.2%)
  Medium Risk         : 164 (20.4%)

Churn Probability Statistics:
  Mean:    0.438
  Median:  0.309
  Min:     0.047
  Max:     0.989


In [12]:
#Updating fact_user_analytics_snapshot with churn predictions
print("="*80)
print("UPDATING DATABASE WITH CHURN PREDICTIONS")
print("="*80)

update_df = rfm_df[['user_key', 'churn_probability_predicted', 'churn_risk_band_predicted']].copy()

print(f"Updating {len(update_df):,} user records...")

updated_count = 0
with engine.begin() as conn:
    for idx, row in update_df.iterrows():
        conn.execute(text("""
            UPDATE fact_user_analytics_snapshot
            SET churn_probability = :churn_prob,
                churn_risk_band = :risk_band
            WHERE user_key = :user_key
              AND snapshot_date_key = :snap_date
        """), {
            'churn_prob': float(row['churn_probability_predicted']),
            'risk_band': row['churn_risk_band_predicted'],
            'user_key': int(row['user_key']),
            'snap_date': snapshot_date_key
        })
        updated_count += 1
        
        if updated_count % 500 == 0:
            print(f"Updated {updated_count:,} records...")

print(f"\nUpdated {updated_count:,} records in fact_user_analytics_snapshot")

# Verifying to see if updates took place
with engine.connect() as conn:
    verify_query = text("""
        SELECT 
            COUNT(*) as total,
            COUNT(churn_probability) as has_churn_prob,
            COUNT(churn_risk_band) as has_risk_band
        FROM fact_user_analytics_snapshot
        WHERE snapshot_date_key = :snap_date
    """)
    
    verify_df = pd.read_sql(verify_query, conn, params={'snap_date': snapshot_date_key})
    
    print("\nVerification:")
    print(f"  Total records:         {verify_df['total'].iloc[0]:,}")
    print(f"  With churn_probability: {verify_df['has_churn_prob'].iloc[0]:,}")
    print(f"  With churn_risk_band:  {verify_df['has_risk_band'].iloc[0]:,}")

print("\nDatabase update complete!")


UPDATING DATABASE WITH CHURN PREDICTIONS
Updating 802 user records...
Updated 500 records...

Updated 802 records in fact_user_analytics_snapshot

Verification:
  Total records:         1,000
  With churn_probability: 802
  With churn_risk_band:  802

Database update complete!


In [13]:
#Analyzing churn reasons and save to database
print("="*80)
print("ANALYZING CHURN REASONS")
print("="*80)

#Identifying at-risk users (High + Medium risk)
at_risk_users = rfm_df[rfm_df['churn_risk_band_predicted'].isin(['High Risk', 'Medium Risk'])].copy()

print(f"Analyzing {len(at_risk_users):,} at-risk users...")

def classify_churn_reason(row):
    reasons = []
    
    if row['rfm_recency'] > 30:
        reasons.append('Inactivity')
    if row['logins_90d'] < 3:
        reasons.append('Low Engagement')
    if row['lessons_completed_90d'] == 0:
        reasons.append('Course Dropped')
    if row['active_courses'] == 0:
        reasons.append('No Active Courses')
    if row['minutes_watched_90d'] < 60:
        reasons.append('Low Watch Time')
    if row['has_downgraded'] == 1:
        reasons.append('Downgraded Plan')
    if row['quiz_engagement_rate'] == 0 and row['lessons_completed_90d'] > 0:
        reasons.append('Low Quiz Engagement')
        
    return reasons[0] if reasons else 'Other'

at_risk_users['primary_churn_reason'] = at_risk_users.apply(classify_churn_reason, axis=1)

churn_reasons_agg = at_risk_users.groupby('primary_churn_reason').agg({
    'user_key': 'count',
    'churn_probability_predicted': 'mean'
}).reset_index()

churn_reasons_agg.columns = ['reason_category', 'reason_count', 'avg_churn_probability']
churn_reasons_agg['reason_pct'] = (churn_reasons_agg['reason_count'] / len(at_risk_users) * 100).round(1)

reason_display_map = {
    'Inactivity': 'Prolonged Inactivity (30+ days)',
    'Low Engagement': 'Low Platform Engagement',
    'Course Dropped': 'No Lessons Completed',
    'No Active Courses': 'No Active Courses',
    'Low Watch Time': 'Minimal Watch Time',
    'Downgraded Plan': 'Subscription Downgrade',
    'Low Quiz Engagement': 'Low Quiz Participation',
    'Other': 'Other Factors'
}

churn_reasons_agg['reason_display_name'] = churn_reasons_agg['reason_category'].map(reason_display_map)

def assign_severity(avg_prob):
    if avg_prob >= 0.7:
        return 'High'
    elif avg_prob >= 0.4:
        return 'Medium'
    else:
        return 'Low'

churn_reasons_agg['severity_level'] = churn_reasons_agg['avg_churn_probability'].apply(assign_severity)

churn_reasons_agg['snapshot_date_key'] = snapshot_date_key

print("\nCHURN REASONS BREAKDOWN:")
print(churn_reasons_agg.sort_values('reason_count', ascending=False).to_string(index=False))

with engine.begin() as conn:
    churn_reasons_agg.to_sql(
        'churn_reasons',
        con=conn,
        if_exists='append',
        index=False
    )

print(f"\nChurn reasons saved to database ({len(churn_reasons_agg)} categories)")


ANALYZING CHURN REASONS
Analyzing 362 at-risk users...

CHURN REASONS BREAKDOWN:
reason_category  reason_count  avg_churn_probability  reason_pct             reason_display_name severity_level  snapshot_date_key
Downgraded Plan           156               0.652966        43.1          Subscription Downgrade         Medium           20251128
     Inactivity           155               0.868262        42.8 Prolonged Inactivity (30+ days)           High           20251128
          Other            51               0.512238        14.1                   Other Factors         Medium           20251128

Churn reasons saved to database (3 categories)
