# Feature Drift Analysis (Pandas Version)

## Overview
Comprehensive drift analysis using **pandas** with memory-efficient processing. This notebook performs drift detection for both **numerical features (PSI)** and **categorical features (Chi-Square)**, along with monthly trend analysis.

### Analysis Types:
1. **PSI Drift Analysis (Numerical Features)**: Population Stability Index comparing in-time vs out-of-time (OOT) distributions
2. **Chi-Square Drift Analysis (Categorical Features)**: Chi-square statistic comparing categorical distributions
3. **Monthly Drift Trends**: Temporal evolution of drift for OOT months vs in-time baseline
4. **Monthly Statistics Trends**: Median and average trends for numerical features across all months

### Pandas Optimizations:
- **Sampled full-table processing**: Load complete tables with sampling for accurate statistics
- **Efficient binning**: Vectorized operations for PSI calculation
- **Memory management**: Process one table at a time, free memory between tables
- **Hybrid Spark/Pandas**: Use Spark for efficient Parquet reading, pandas for analysis

### Drift Thresholds:
**PSI (Numerical Features)**:
- PSI < 0.1: Insignificant drift
- 0.1 ≤ PSI < 0.25: Moderate drift (monitor)
- PSI ≥ 0.25: Significant drift (investigate)

**Chi-Square (Categorical Features)**:
- Chi-square < 10.0: Insignificant drift
- 10.0 ≤ Chi-square < 25.0: Moderate drift (monitor)
- Chi-square ≥ 25.0: Significant drift (investigate)

### Outputs:
- **Numerical Features (PSI)**:
  - PSI scores CSV (in-time vs OOT): `psi_overall_intime_vs_oot.csv`
  - Monthly PSI CSVs per table: `psi_monthly_trends_{table_name}.csv`
  - Individual monthly trend plots for each feature
- **Categorical Features (Chi-Square)**:
  - Chi-square scores CSV (in-time vs OOT): `chi_square_overall_intime_vs_oot.csv`
  - Monthly chi-square CSVs per table: `chi_square_monthly_trends_{table_name}.csv`
  - Individual monthly trend plots for each feature
- **Monthly Statistics Trends**:
  - Monthly median/average CSV per table: `monthly_statistics_trends_{table_name}.csv`
  - Contains median and average for all numerical features across ALL months (in-time + OOT)
- Overall drift visualizations: `plots/psi_distribution.png`, `plots/chi_square_distribution.png`

---


In [None]:
%pip install --upgrade pandas==2 -i https://repo.td.com/repository/pypi-all/simple

In [None]:
# Import optimization libraries
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')

# Optimized monthly trend plot creation (reusable for PSI and Chi-Square)
def create_monthly_trend_plot(feature, feature_data, threshold_moderate, threshold_significant, 
                               table_folder_name, table_trend_folder, metric_name='PSI'):
    """Plot one feature's monthly drift metric and save it as ``{feature}.png``.

    The function relies on the module-level names ``plt`` and
    ``save_plot_to_adls``; both must be defined before it is *called*.

    NOTE(review): this definition sits in a cell that runs before
    ``dbutils.library.restartPython()``. That restart presumably discards the
    Python state, so the function would need to be re-defined afterwards before
    any later cell can use it -- confirm against the Databricks documentation.

    Args:
        feature: Feature name; used in the plot title and as the PNG file name.
        feature_data: Table-like object with a ``month`` column and the metric
            column (``psi`` when ``metric_name == 'PSI'``, else ``chi_square``).
        threshold_moderate: y-value of the orange dashed reference line.
        threshold_significant: y-value of the red dashed reference line.
        table_folder_name: Label shown in the second line of the plot title.
        table_trend_folder: Destination prefix. The file name is appended
            directly, so the prefix must already end with a path separator.
        metric_name: Display name of the metric; also selects the metric column.

    Returns:
        True when the plot was saved; False when ``feature_data`` is empty or
        any step failed (the failure is reported with a printed warning).
    """
    fig = None
    try:
        if len(feature_data) == 0:
            return False

        metric_col = 'psi' if metric_name == 'PSI' else 'chi_square'
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(feature_data['month'], feature_data[metric_col],
                marker='o', linewidth=2, markersize=6, color='steelblue')
        ax.axhline(y=threshold_moderate, color='orange', linestyle='--', linewidth=1.5,
                   label=f'Moderate ({threshold_moderate})')
        ax.axhline(y=threshold_significant, color='red', linestyle='--', linewidth=1.5,
                   label=f'Significant ({threshold_significant})')
        ax.set_title(f'Monthly {metric_name} Trend: {feature}\n({table_folder_name})',
                     fontsize=12, fontweight='bold')
        ax.set_ylabel(metric_name, fontsize=10)
        ax.set_xlabel('Month', fontsize=10)
        ax.legend(fontsize=9, loc='best')
        ax.grid(True, alpha=0.3, linestyle='--')
        ax.tick_params(axis='x', rotation=45)
        # Lay out this specific figure rather than whichever one pyplot
        # currently considers "current".
        fig.tight_layout()
        plot_file = f"{table_trend_folder}{feature}.png"
        save_plot_to_adls(fig, plot_file, dpi=150)
        return True
    except Exception as exc:
        # Best-effort: one bad feature must not abort a batch of plots, but the
        # reason is reported instead of being swallowed silently.
        print(f"    Warning: Could not plot {feature}: {exc}")
        return False
    finally:
        # Close the figure on every path so a failed save does not leak it.
        if fig is not None:
            plt.close(fig)

print("✓ Optimization functions loaded")


In [None]:
# Restart the Python process; presumably done so the pandas version installed
# by the %pip cell above is the one imported by the cells that follow.
# NOTE(review): a restart is expected to discard all Python state, including
# create_monthly_trend_plot and the joblib import defined in the cell above --
# confirm, and re-run or move those definitions after this cell if so.
dbutils.library.restartPython()

In [None]:
import pandas
print(pandas.__version__)

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
import gc

# Helper functions
def save_pandas_to_csv_adls(df_pandas, adls_path):
    """Serialize ``df_pandas`` to CSV text (no index column) and write it to ``adls_path``.

    The target is overwritten if it already exists. The whole CSV is built as
    one in-memory string before being handed to ``dbutils.fs.put``.
    """
    dbutils.fs.put(adls_path, df_pandas.to_csv(index=False), overwrite=True)
    print(f"✓ Saved CSV to {adls_path}")

def save_plot_to_adls(fig, adls_path, dpi=150):
    """Render ``fig`` as a PNG and copy it to ``adls_path``.

    The figure is written to a local temporary file, which is then copied to
    the destination with ``dbutils.fs.cp``. The temporary file is removed on
    every path, including when rendering or the copy fails.

    Args:
        fig: Object exposing ``savefig`` (a matplotlib figure in this notebook).
        adls_path: Destination path passed to ``dbutils.fs.cp``.
        dpi: Resolution forwarded to ``savefig``.

    Raises:
        Whatever ``fig.savefig`` or ``dbutils.fs.cp`` raises; nothing is
        swallowed here.
    """
    import os
    import tempfile

    tmp_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so that
        # dbutils can read it by path after the handle is closed.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as tmp:
            tmp_path = tmp.name
            fig.savefig(tmp, format='png', dpi=dpi, bbox_inches='tight')
        dbutils.fs.cp(f"file:{tmp_path}", adls_path)
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"✓ Saved plot to {adls_path}")

def calculate_psi(expected, actual, num_bins=10):
    """Calculate the Population Stability Index between two numeric samples.

    Bin edges are the percentiles of ``expected`` (the baseline). The outermost
    edges are opened to -inf / +inf so that ``actual`` values falling outside
    the baseline's observed range are counted in the first / last bin instead
    of being dropped -- drift into the tails is exactly what PSI must detect.

    Args:
        expected: pandas Series with the baseline values; NaNs are ignored.
        actual: pandas Series with the comparison values; NaNs are ignored.
        num_bins: Number of percentile bins requested. Duplicate percentile
            edges are merged, so fewer bins may actually be used.

    Returns:
        The PSI as a float, or None when either sample is empty after dropping
        NaNs, when the baseline has no spread (a single distinct edge), or when
        the computation fails (e.g. non-numeric data).
    """
    try:
        expected_clean = expected.dropna()
        actual_clean = actual.dropna()
        if len(expected_clean) == 0 or len(actual_clean) == 0:
            return None

        breakpoints = np.unique(
            np.percentile(expected_clean, np.linspace(0, 100, num_bins + 1))
        ).astype(float)
        if len(breakpoints) <= 1:
            return None

        # Open-ended outer bins. Baseline membership is unchanged because every
        # baseline value already lies within [min, max].
        breakpoints[0] = -np.inf
        breakpoints[-1] = np.inf

        expected_binned = pd.cut(expected_clean, bins=breakpoints, include_lowest=True, duplicates='drop')
        actual_binned = pd.cut(actual_clean, bins=breakpoints, include_lowest=True, duplicates='drop')

        # Additive smoothing keeps the log ratio finite for empty bins.
        expected_pct = expected_binned.value_counts(normalize=True, sort=False) + 0.0001
        actual_pct = actual_binned.value_counts(normalize=True, sort=False) + 0.0001
        expected_pct, actual_pct = expected_pct.align(actual_pct, fill_value=0.0001)

        psi_terms = (actual_pct - expected_pct) * np.log(actual_pct / expected_pct)
        return float(psi_terms.sum())
    except Exception:
        # Deliberate best-effort: a feature that cannot be binned is reported
        # as "no result" so that one bad column does not abort the whole run.
        return None

def calculate_chi_square(expected, actual):
    """Calculate the chi-square statistic between two categorical samples.

    A 2 x N contingency table (row 0 = ``expected`` counts, row 1 = ``actual``
    counts, one column per category seen in either sample) is passed to
    ``scipy.stats.chi2_contingency``; only the statistic is returned.

    Args:
        expected: pandas Series with the baseline categories; NaNs are ignored.
        actual: pandas Series with the comparison categories; NaNs are ignored.

    Returns:
        The chi-square statistic as a float, or None when either sample is
        empty after dropping NaNs, when at most one distinct category exists
        across both samples, or when the computation fails.

    Raises:
        ImportError: if SciPy is not installed. The import is kept outside the
            catch-all below so that a missing dependency fails loudly instead
            of silently turning every feature into "no result".
    """
    from scipy.stats import chi2_contingency

    try:
        expected_clean = expected.dropna()
        actual_clean = actual.dropna()
        if len(expected_clean) == 0 or len(actual_clean) == 0:
            return None

        # Union of categories from both samples; a list gives reindex an
        # explicit (if arbitrary) column order shared by both rows.
        all_categories = list(set(expected_clean.unique()) | set(actual_clean.unique()))
        if len(all_categories) <= 1:
            return None  # No variation to measure

        expected_counts = expected_clean.value_counts().reindex(all_categories, fill_value=0)
        actual_counts = actual_clean.value_counts().reindex(all_categories, fill_value=0)
        contingency = np.array([expected_counts.values, actual_counts.values])

        # chi2_contingency returns (statistic, p-value, dof, expected freqs);
        # only the statistic is used by this analysis.
        chi2 = chi2_contingency(contingency)[0]
        return float(chi2)
    except Exception:
        # Deliberate best-effort: a feature that cannot be tabulated is
        # reported as "no result" rather than aborting the whole run.
        return None

print("✓ Setup complete")


In [None]:
# Configuration
# Root locations on ADLS. Both paths end with "/"; OUTPUT_PATH and PLOT_PATH
# are concatenated directly with file names by the cells below.
# NOTE(review): consumers build f"{DATA_PATH}/feature/...", which yields a
# double slash after "data/" -- presumably tolerated by the ABFS driver; confirm.
DATA_PATH = "abfss://home@edaaaazepcalayelaye0001.dfs.core.windows.net/MD_Artifacts/money-out/data/"
OUTPUT_PATH = "abfss://home@edaaaazepcalayelaye0001.dfs.core.windows.net/MD_Artifacts/money-out/mv/eda_validation/drift_analysis/"
PLOT_PATH = OUTPUT_PATH + "plots/"
# Create the output folders up front so later writes have a target.
dbutils.fs.mkdirs(OUTPUT_PATH)
dbutils.fs.mkdirs(PLOT_PATH)

# Fraction of rows kept by Spark's .sample(); sampling is skipped when >= 1.0.
SAMPLING_RATIO = 0.01
# Rows with efectv_dt before this date are "in-time"; rows on/after it are OOT.
OOT_START_DATE = '2024-01-01'
PSI_THRESHOLD_MODERATE = 0.1
PSI_THRESHOLD_SIGNIFICANT = 0.25
CHI_SQUARE_THRESHOLD_MODERATE = 10.0  # Moderate drift threshold
CHI_SQUARE_THRESHOLD_SIGNIFICANT = 25.0  # Significant drift threshold

# Each entry is (fam_name, table, fam). When fam is falsy ('' here) the table
# is read from feature/{table}/parquet and looked up in the metadata as
# {table}; otherwise feature/{table}_{fam}/parquet and {table}_{fam} are used.
TABLES = [
    ("cust", "cust_basic_sumary", ''),
    ("cust", "batch_credit_bureau", ''),
    ("dem", "acct", 2438),
    ("cc", "acct", 2444),
]

# Load metadata
# All lines of the file are joined and parsed as ONE JSON document (despite the
# .jsonl extension). The result is indexed as
# feature_metadata[fam_name][table_key] and read for the keys "num_features"
# (iterated as a list) and "cat_features" (a mapping; only its keys are used).
feature_metadata_rows = spark.read.text(f"{DATA_PATH}/feature/feature_metadata.jsonl").collect()
feature_metadata = json.loads('\n'.join([row.value for row in feature_metadata_rows]))

print("✓ Config loaded")


## Processing Strategy: Sampled Full-Table (Accuracy Prioritized)

### Why This Approach?
**PSI** (Population Stability Index) and **Chi-Square** both compare **complete distributions** between two time periods. These metrics **CANNOT be calculated incrementally** - we need to see all values to calculate accurate statistics.

### Memory Efficiency:
- **Memory usage**: Scales with SAMPLING_RATIO
  - 1% sampling: ~2 GB per table
  - 10% sampling: ~5 GB per table
  - 100% sampling: ~15-20 GB per table
- **Mitigation**: Process one table at a time (4-10 tables total), free memory between tables
- **Recommendation**: Use `SAMPLING_RATIO = 0.01` (1%) for accurate results with manageable memory

### How Drift Metrics Work:

**PSI (Numerical Features)**:
```
PSI = Σ (actual% - expected%) × ln(actual% / expected%)

Steps:
  1. Load FULL table (sampled)
  2. Split into in-time and OOT periods
  3. For each numerical feature:
     a. Create bins based on in-time distribution percentiles
     b. Calculate % of values in each bin for in-time (expected)
     c. Calculate % of values in each bin for OOT (actual)
     d. Compute PSI across bins
```

**Chi-Square (Categorical Features)**:
```
Chi-Square = Σ (observed - expected)² / expected

Steps:
  1. Load FULL table (sampled)
  2. Split into in-time and OOT periods
  3. For each categorical feature:
     a. Get all unique categories from both periods
     b. Create contingency table (2 rows × N categories)
     c. Calculate expected frequencies from the row and column totals of the contingency table (in-time and OOT pooled)
     d. Compute chi-square statistic
```

### Why This Requires Full Data:
- ❌ **PSI**: Cannot bin incrementally - bins must span full value range
- ❌ **PSI**: Cannot calculate percentages without seeing all values
- ❌ **Chi-Square**: Need complete category frequency distributions
- ❌ **Chi-Square**: Contingency table requires all categories from both periods
- ✅ Must load all sampled values for both time periods
- ✅ 1% sampling gives accurate drift metrics on representative sample

### Drift Interpretation:

**PSI (Numerical)**:
- **PSI < 0.1**: Insignificant drift (no action needed)
- **0.1 ≤ PSI < 0.25**: Moderate drift (monitor)
- **PSI ≥ 0.25**: Significant drift (investigate feature)

**Chi-Square (Categorical)**:
- **Chi-square < 10.0**: Insignificant drift (no action needed)
- **10.0 ≤ Chi-square < 25.0**: Moderate drift (monitor)
- **Chi-square ≥ 25.0**: Significant drift (investigate feature)

### Implementation:
```
For each table:
  1. Load full table: spark.read.parquet().sample(0.01)
  2. Convert to pandas: .toPandas()
  3. Split: df_intime = df[df['efectv_dt'] < '2024-01-01']
           df_oot = df[df['efectv_dt'] >= '2024-01-01']
  4. For numerical features: Calculate PSI
  5. For categorical features: Calculate Chi-Square
  6. For monthly trends: Compare each OOT month vs in-time baseline
  7. Free memory: del df, df_intime, df_oot; gc.collect()
```

---


## Success Criteria and Drift Assessment

### ✅ **Analysis Succeeds If**:
- **PSI calculated** for all numerical features
- **Chi-Square calculated** for all categorical features
- Drift levels classified (Insignificant/Moderate/Significant) for both types
- Features with significant drift are flagged:
  - Numerical: PSI ≥ 0.25
  - Categorical: Chi-Square ≥ 25.0
- Monthly drift trends show patterns (not random noise)
- Monthly statistics trends (median/average) calculated for all numerical features
- Results saved successfully to ADLS

### 📊 **Drift Interpretation Guide**:

**PSI (Numerical Features)**:
| PSI Range | Drift Level | Action Required | Typical Cause |
|-----------|-------------|-----------------|---------------|
| < 0.1 | Insignificant | No action | Normal variation |
| 0.1 - 0.25 | Moderate | Monitor | Slight distribution shift |
| ≥ 0.25 | Significant | Investigate | Major distribution change |

**Chi-Square (Categorical Features)**:
| Chi-Square Range | Drift Level | Action Required | Typical Cause |
|------------------|-------------|-----------------|---------------|
| < 10.0 | Insignificant | No action | Normal variation |
| 10.0 - 25.0 | Moderate | Monitor | Slight category distribution shift |
| ≥ 25.0 | Significant | Investigate | Major category distribution change |

### 🔍 **Features to Flag**:
After analysis, pay special attention to:
- **PSI ≥ 0.25**: Significant numerical drift - may impact model performance
- **Chi-Square ≥ 25.0**: Significant categorical drift - category distributions changed
- **Increasing drift trends**: Growing drift over OOT months (both metrics)
- **Sudden drift spikes**: Abrupt changes in specific months
- **Bureau features**: Often have higher drift due to external factors
- **High-cardinality categoricals**: May show higher chi-square due to sparse categories

### 📈 **Expected Patterns**:
- **Most numerical features**: PSI < 0.1 (stable distributions)
- **Most categorical features**: Chi-Square < 10.0 (stable category frequencies)
- **Some features**: Moderate drift (0.1 ≤ PSI < 0.25 or 10 ≤ Chi-Square < 25) - acceptable
- **Few features**: Significant drift (PSI ≥ 0.25 or Chi-Square ≥ 25.0) - investigate
- **Bureau features**: May have higher drift (external data source)
- **Transaction features**: Usually low, stable drift (internal data source)
- **Monthly statistics**: Should show gradual trends, not abrupt jumps

### 📊 **Monthly Statistics Trends**:
- **Median trends**: Should be relatively stable across months
- **Average trends**: May fluctuate more than median (sensitive to outliers)
- **In-Time vs OOT**: Compare trends before/after 2024 boundary
- **Gradual changes**: Expected (business cycles, seasonality)
- **Abrupt changes**: Investigate (data quality issues, feature engineering changes)

### ⚠️ **When to Investigate**:
- More than 20% of features have significant drift
- Core features (demographics, balances) show high drift
- Drift increases month-over-month in OOT period
- Monthly statistics show abrupt jumps (not gradual trends)
- Drift patterns differ significantly by table/family
- Categorical features with very high chi-square (possible data issues)

---


In [None]:
# Overall PSI drift: for every table in TABLES, compare each numerical
# feature's in-time distribution against its OOT distribution, then write the
# per-feature scores, a one-row summary CSV and a two-panel overview plot.
print("="*80)
print("PSI DRIFT ANALYSIS")
print("="*80)

# One dict per (table, feature) for which a PSI value could be computed.
all_psi_results = []

for fam_name, table, fam in TABLES:
    print(f"\nProcessing: {fam_name}-{table}")
    
    # A falsy fam means the table has no "_{fam}" suffix in its path / metadata key.
    table_path = f"{DATA_PATH}/feature/{table}/parquet" if not fam else f"{DATA_PATH}/feature/{table}_{fam}/parquet"
    table_meta_key = table if not fam else f"{table}_{fam}"
    
    # Tables absent from the metadata are skipped without a message.
    if fam_name not in feature_metadata or table_meta_key not in feature_metadata[fam_name]:
        continue
    
    num_features = feature_metadata[fam_name][table_meta_key].get("num_features", [])
    
    # Load via spark then convert to pandas
    df_spark = spark.read.format("parquet").load(table_path)
    # Without the date column there is no way to split in-time from OOT.
    if 'efectv_dt' not in df_spark.columns:
        continue
    
    # Fixed seed so repeated runs draw the same sample.
    if SAMPLING_RATIO < 1.0:
        df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)
    
    # Only the date column and the numerical features that actually exist in
    # the table are pulled into pandas.
    df = df_spark.select(['efectv_dt'] + [c for c in num_features if c in df_spark.columns]).toPandas()
    df['efectv_dt'] = pd.to_datetime(df['efectv_dt'])
    
    # Split data
    # Convert OOT_START_DATE to datetime for comparison
    oot_date = pd.to_datetime(OOT_START_DATE)

    df_intime = df[df['efectv_dt'] < oot_date]
    df_oot = df[df['efectv_dt'] >= oot_date]
    
    # Calculate PSI for each feature
    for feature in num_features:
        # Features listed in the metadata but missing from the table are skipped.
        if feature not in df.columns:
            continue
        psi = calculate_psi(df_intime[feature], df_oot[feature])
        # calculate_psi returns None when no score could be computed; such
        # features are left out of the results entirely.
        if psi is not None:
            drift_level = 'Significant' if psi >= PSI_THRESHOLD_SIGNIFICANT else \
                         'Moderate' if psi >= PSI_THRESHOLD_MODERATE else 'Insignificant'
            all_psi_results.append({
                # Note: the label is built from fam_name and table only; the
                # fam suffix is not included.
                'table': f"{fam_name}_{table}",
                'feature': feature,
                'psi': psi,
                'drift_level': drift_level
            })
    
    # Drop the references before the next table is loaded.
    del df, df_spark, df_intime, df_oot
    gc.collect()

# Save results and create visualizations
if all_psi_results:
    # Highest PSI first.
    psi_df = pd.DataFrame(all_psi_results).sort_values('psi', ascending=False)
    save_pandas_to_csv_adls(psi_df, OUTPUT_PATH + "psi_overall_intime_vs_oot.csv")
    
    # Create drift analysis summary
    summary = {
        'analysis_date': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        'sampling_ratio': SAMPLING_RATIO,
        'total_features': len(psi_df),
        'insignificant_drift': len(psi_df[psi_df['drift_level'] == 'Insignificant']),
        'moderate_drift': len(psi_df[psi_df['drift_level'] == 'Moderate']),
        'significant_drift': len(psi_df[psi_df['drift_level'] == 'Significant']),
        'mean_psi': float(psi_df['psi'].mean()),
        'median_psi': float(psi_df['psi'].median()),
        'max_psi': float(psi_df['psi'].max()),
        'min_psi': float(psi_df['psi'].min()),
    }
    # Written as a single-row CSV.
    summary_df = pd.DataFrame([summary])
    save_pandas_to_csv_adls(summary_df, OUTPUT_PATH + "drift_analysis_summary.csv")
    
    # PSI distribution visualization
    # Left panel: histogram of PSI values with both thresholds marked.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    axes[0].hist(psi_df['psi'], bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(x=PSI_THRESHOLD_MODERATE, color='orange', linestyle='--', linewidth=2, label='Moderate')
    axes[0].axvline(x=PSI_THRESHOLD_SIGNIFICANT, color='red', linestyle='--', linewidth=2, label='Significant')
    axes[0].set_xlabel('PSI Value', fontsize=12)
    axes[0].set_ylabel('Frequency', fontsize=12)
    axes[0].set_title('PSI Distribution', fontsize=14, fontweight='bold')
    axes[0].legend(fontsize=10)
    
    # Right panel: number of features per drift level, colour-coded by level.
    drift_counts = psi_df['drift_level'].value_counts()
    colors = {'Insignificant': 'green', 'Moderate': 'orange', 'Significant': 'red'}
    bar_colors = [colors.get(x, 'gray') for x in drift_counts.index]
    axes[1].bar(drift_counts.index, drift_counts.values, color=bar_colors, edgecolor='black', alpha=0.7)
    axes[1].set_xlabel('Drift Level', fontsize=12)
    axes[1].set_ylabel('Count', fontsize=12)
    axes[1].set_title('Features by Drift Level', fontsize=14, fontweight='bold')
    
    plt.tight_layout()
    save_plot_to_adls(fig, PLOT_PATH + "psi_distribution.png", dpi=150)
    plt.close(fig)
    
    print(f"\n✓ Analyzed {len(psi_df)} features")
    print(f"  Insignificant drift: {summary['insignificant_drift']}")
    print(f"  Moderate drift: {summary['moderate_drift']}")
    print(f"  Significant drift: {summary['significant_drift']}")
    print(f"  Mean PSI: {summary['mean_psi']:.4f}")

# Printed even when no result was produced (the block above is then skipped).
print("\n✓ Drift analysis complete")


In [None]:
# Overall Chi-square drift: for every table in TABLES, compare each
# categorical feature's in-time category counts against its OOT counts, then
# write the per-feature scores and a two-panel overview plot.
print("="*80)
print("CHI-SQUARE DRIFT ANALYSIS (CATEGORICAL FEATURES)")
print("="*80)

# One dict per (table, feature) for which a statistic could be computed.
all_chi_square_results = []

for fam_name, table, fam in TABLES:
    print(f"\nProcessing: {fam_name}-{table}")
    
    # A falsy fam means the table has no "_{fam}" suffix in its path / metadata key.
    table_path = f"{DATA_PATH}/feature/{table}/parquet" if not fam else f"{DATA_PATH}/feature/{table}_{fam}/parquet"
    table_meta_key = table if not fam else f"{table}_{fam}"
    
    # Tables absent from the metadata are skipped without a message.
    if fam_name not in feature_metadata or table_meta_key not in feature_metadata[fam_name]:
        continue
    
    # "cat_features" is a mapping in the metadata; only its keys (feature
    # names) are used here.
    cat_features = list(feature_metadata[fam_name][table_meta_key].get("cat_features", {}).keys())
    
    # Nothing to analyse: skip before touching Spark.
    if len(cat_features) == 0:
        continue
    
    # Load via spark then convert to pandas
    df_spark = spark.read.format("parquet").load(table_path)
    # Without the date column there is no way to split in-time from OOT.
    if 'efectv_dt' not in df_spark.columns:
        continue
    
    # Fixed seed so repeated runs draw the same sample.
    if SAMPLING_RATIO < 1.0:
        df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)
    
    # Only the date column and the categorical features that actually exist in
    # the table are pulled into pandas.
    df = df_spark.select(['efectv_dt'] + [c for c in cat_features if c in df_spark.columns]).toPandas()
    df['efectv_dt'] = pd.to_datetime(df['efectv_dt'])
    
    # Convert OOT_START_DATE to datetime for comparison
    oot_date = pd.to_datetime(OOT_START_DATE)
    
    # Split data
    df_intime = df[df['efectv_dt'] < oot_date]
    df_oot = df[df['efectv_dt'] >= oot_date]
    
    # Calculate Chi-square for each categorical feature
    for feature in cat_features:
        # Features listed in the metadata but missing from the table are skipped.
        if feature not in df.columns:
            continue
        chi2 = calculate_chi_square(df_intime[feature], df_oot[feature])
        # calculate_chi_square returns None when no statistic could be
        # computed; such features are left out of the results entirely.
        if chi2 is not None:
            drift_level = 'Significant' if chi2 >= CHI_SQUARE_THRESHOLD_SIGNIFICANT else \
                         'Moderate' if chi2 >= CHI_SQUARE_THRESHOLD_MODERATE else 'Insignificant'
            all_chi_square_results.append({
                # Note: the label is built from fam_name and table only; the
                # fam suffix is not included.
                'table': f"{fam_name}_{table}",
                'feature': feature,
                'chi_square': chi2,
                'drift_level': drift_level
            })
    
    # Drop the references before the next table is loaded.
    del df, df_spark, df_intime, df_oot
    gc.collect()

# Save results and create visualizations
if all_chi_square_results:
    # Highest statistic first.
    chi2_df = pd.DataFrame(all_chi_square_results).sort_values('chi_square', ascending=False)
    save_pandas_to_csv_adls(chi2_df, OUTPUT_PATH + "chi_square_overall_intime_vs_oot.csv")
    
    # Chi-square distribution visualization
    # Left panel: histogram of the statistic with both thresholds marked.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    axes[0].hist(chi2_df['chi_square'], bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(x=CHI_SQUARE_THRESHOLD_MODERATE, color='orange', linestyle='--', linewidth=2, label='Moderate')
    axes[0].axvline(x=CHI_SQUARE_THRESHOLD_SIGNIFICANT, color='red', linestyle='--', linewidth=2, label='Significant')
    axes[0].set_xlabel('Chi-Square Value', fontsize=12)
    axes[0].set_ylabel('Frequency', fontsize=12)
    axes[0].set_title('Chi-Square Distribution', fontsize=14, fontweight='bold')
    axes[0].legend(fontsize=10)
    
    # Right panel: number of features per drift level, colour-coded by level.
    drift_counts = chi2_df['drift_level'].value_counts()
    colors = {'Insignificant': 'green', 'Moderate': 'orange', 'Significant': 'red'}
    bar_colors = [colors.get(x, 'gray') for x in drift_counts.index]
    axes[1].bar(drift_counts.index, drift_counts.values, color=bar_colors, edgecolor='black', alpha=0.7)
    axes[1].set_xlabel('Drift Level', fontsize=12)
    axes[1].set_ylabel('Count', fontsize=12)
    axes[1].set_title('Features by Drift Level', fontsize=14, fontweight='bold')
    
    plt.tight_layout()
    save_plot_to_adls(fig, PLOT_PATH + "chi_square_distribution.png", dpi=150)
    plt.close(fig)
    
    print(f"\n✓ Analyzed {len(chi2_df)} categorical features")
    print(f"  Insignificant drift: {len(chi2_df[chi2_df['drift_level'] == 'Insignificant'])}")
    print(f"  Moderate drift: {len(chi2_df[chi2_df['drift_level'] == 'Moderate'])}")
    print(f"  Significant drift: {len(chi2_df[chi2_df['drift_level'] == 'Significant'])}")
    print(f"  Mean Chi-square: {chi2_df['chi_square'].mean():.4f}")

# Printed even when no result was produced (the block above is then skipped).
print("\n✓ Chi-square drift analysis complete")


## Monthly PSI Trends (Numerical Features)

Calculate PSI for each OOT month to show temporal evolution of drift for all numerical features.

### Output Structure:
- Individual trend plots for each feature (one PNG per feature)
- Saved in table-specific folders: `plots/monthly_trends/{table_name}/`
- Each file named: `{feature_name}.png`
- Monthly PSI CSV per table: `psi_monthly_trends_{table_name}.csv`


## Monthly Chi-Square Trends (Categorical Features)

Calculate Chi-square for each OOT month to show temporal evolution of drift for all categorical features.

### Output Structure:
- Individual trend plots for each feature (one PNG per feature)
- Saved in table-specific folders: `plots/monthly_trends_chi_square/{table_name}/`
- Each file named: `{feature_name}.png`
- Monthly Chi-square CSV per table: `chi_square_monthly_trends_{table_name}.csv`


In [None]:
# Monthly Chi-Square trends for all tables (categorical features).
#
# For every table in TABLES: load a sample, split it at OOT_START_DATE into the
# in-time baseline and the OOT rows, compute
# calculate_chi_square(baseline, month) for every OOT calendar month and every
# categorical feature, write the scores to one CSV per table, and save one
# trend PNG per feature. A failure in one table is reported and the loop moves
# on to the next table.
print("\n" + "="*80)
print("MONTHLY CHI-SQUARE TREND ANALYSIS (CATEGORICAL FEATURES)")
print("="*80)

MONTHLY_TRENDS_CHI_PLOT_PATH = PLOT_PATH + "monthly_trends_chi_square/"
dbutils.fs.mkdirs(MONTHLY_TRENDS_CHI_PLOT_PATH)

for fam_name, table, fam in TABLES:
    print(f"\nProcessing monthly chi-square trends: {fam_name}-{table}")

    # A falsy fam means the table has no "_{fam}" suffix in its path / metadata key.
    table_path = f"{DATA_PATH}/feature/{table}/parquet" if not fam else f"{DATA_PATH}/feature/{table}_{fam}/parquet"
    table_meta_key = table if not fam else f"{table}_{fam}"

    if fam_name not in feature_metadata or table_meta_key not in feature_metadata[fam_name]:
        print("  Skipping (not in metadata)")
        continue

    try:
        # Load table
        df_spark = spark.read.format("parquet").load(table_path)

        if 'efectv_dt' not in df_spark.columns:
            print("  Skipping (no efectv_dt column)")
            del df_spark
            gc.collect()
            continue

        # Fixed seed so repeated runs draw the same sample.
        if SAMPLING_RATIO < 1.0:
            df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)

        # Get ALL categorical features that are actually present in the table
        cat_features = list(feature_metadata[fam_name][table_meta_key].get("cat_features", {}).keys())
        cat_features = [f for f in cat_features if f in df_spark.columns]

        if len(cat_features) == 0:
            print("  Skipping (no categorical features)")
            del df_spark
            gc.collect()
            continue

        df = df_spark.select(['efectv_dt'] + cat_features).toPandas()
        df['efectv_dt'] = pd.to_datetime(df['efectv_dt'])

        # Convert OOT_START_DATE to datetime for comparison
        oot_date = pd.to_datetime(OOT_START_DATE)

        # Separate in-time and OOT
        df_intime = df[df['efectv_dt'] < oot_date]
        df_oot = df[df['efectv_dt'] >= oot_date]

        if len(df_intime) == 0 or len(df_oot) == 0:
            print("  Skipping (insufficient data)")
            del df, df_intime, df_oot, df_spark
            gc.collect()
            continue

        # Month of every OOT row, computed once and reused for each month's
        # row selection below instead of being recomputed per month.
        oot_row_month = df_oot['efectv_dt'].dt.to_period('M')
        oot_months = oot_row_month.unique()

        if len(oot_months) == 0:
            print("  Skipping (no OOT months)")
            del df, df_intime, df_oot, df_spark
            gc.collect()
            continue

        # Create folder for this table's monthly trend plots
        table_folder_name = f"{fam_name}_{table}" if not fam else f"{fam_name}_{table}_{fam}"
        table_trend_folder = f"{MONTHLY_TRENDS_CHI_PLOT_PATH}{table_folder_name}/"
        dbutils.fs.mkdirs(table_trend_folder)

        # Calculate Chi-square for each month and feature; combinations for
        # which calculate_chi_square returns None are left out.
        monthly_chi2_results = []
        print(f"  Calculating Chi-square for {len(cat_features)} features across {len(oot_months)} months...")

        for month in sorted(oot_months):
            df_month = df_oot[oot_row_month == month]
            for feature in cat_features:
                chi2 = calculate_chi_square(df_intime[feature], df_month[feature])
                if chi2 is not None:
                    monthly_chi2_results.append({
                        'month': str(month),
                        'feature': feature,
                        'chi_square': chi2
                    })

        # Save monthly Chi-square results CSV per table
        if monthly_chi2_results:
            monthly_chi2_df = pd.DataFrame(monthly_chi2_results)
            csv_file = f"{OUTPUT_PATH}chi_square_monthly_trends_{table_folder_name}.csv"
            save_pandas_to_csv_adls(monthly_chi2_df, csv_file)

            # Create individual trend plots for each feature
            print(f"  Creating {len(cat_features)} individual trend plots...")
            saved_count = 0
            failed_count = 0

            for feature in cat_features:
                fig = None
                try:
                    # Get data for this feature ('month' is a YYYY-MM string,
                    # so the lexical sort is chronological)
                    feature_data = monthly_chi2_df[monthly_chi2_df['feature'] == feature].sort_values('month')

                    if len(feature_data) == 0:
                        # No figure has been created at this point.
                        failed_count += 1
                        print(f"    Warning: Skipped {feature} (no data)")
                        continue

                    # Create individual figure for each feature
                    fig, ax = plt.subplots(figsize=(12, 6))

                    # Plot trend
                    ax.plot(feature_data['month'], feature_data['chi_square'],
                            marker='o', linewidth=2, markersize=6, color='steelblue')

                    # Add threshold lines; labels are derived from the
                    # constants so they cannot drift from the plotted values.
                    ax.axhline(y=CHI_SQUARE_THRESHOLD_MODERATE, color='orange',
                               linestyle='--', linewidth=1.5,
                               label=f'Moderate ({CHI_SQUARE_THRESHOLD_MODERATE})')
                    ax.axhline(y=CHI_SQUARE_THRESHOLD_SIGNIFICANT, color='red',
                               linestyle='--', linewidth=1.5,
                               label=f'Significant ({CHI_SQUARE_THRESHOLD_SIGNIFICANT})')

                    # Style the plot
                    ax.set_title(f'Monthly Chi-Square Trend: {feature}\n({table_folder_name})',
                                 fontsize=12, fontweight='bold')
                    ax.set_ylabel('Chi-Square', fontsize=10)
                    ax.set_xlabel('Month', fontsize=10)
                    ax.legend(fontsize=9, loc='best')
                    ax.grid(True, alpha=0.3, linestyle='--')
                    ax.tick_params(axis='x', rotation=45)

                    plt.tight_layout()

                    # Save individual plot (no display - saved directly to ADLS)
                    plot_file = f"{table_trend_folder}{feature}.png"
                    save_plot_to_adls(fig, plot_file, dpi=150)
                    saved_count += 1

                except Exception as e:
                    # Best-effort per feature: report and carry on with the rest.
                    failed_count += 1
                    print(f"    Warning: Could not plot {feature}: {str(e)}")
                finally:
                    # Single place where the figure is released, on every path.
                    if fig is not None:
                        plt.close(fig)

            print(f"  ✓ Monthly trend plots saved: {saved_count} successful, {failed_count} failed")
            print(f"    Location: {table_trend_folder}")

        del df, df_intime, df_oot, df_spark
        gc.collect()

    except Exception as e:
        print(f"  ✗ Error processing {fam_name}-{table}: {str(e)}")
        try:
            del df_spark
            gc.collect()
        except NameError:
            # df_spark was never bound (the failure happened while loading).
            pass

print("\n✓ Monthly Chi-Square trend analysis complete")


## Monthly Statistics Trends (Median & Average for Numerical Features)

Calculate monthly median and average for ALL numerical features across ALL months (both in-time and OOT).

### Output Format:
- CSV per table: `monthly_statistics_trends_{table_name}.csv`
- Columns: `feature_name`, `stat_method`, `month1`, `month2`, `month3`, ...
- Rows: Each feature has two rows (median and average)
- Format example:
  - `feature1, median, val_month1, val_month2, ...`
  - `feature1, average, val_month1, val_month2, ...`
  - `feature2, median, val_month1, val_month2, ...`
  - `feature2, average, val_month1, val_month2, ...`


In [None]:
# Monthly statistics trends (median and average) for all numerical features across ALL months.
#
# For every (fam_name, table, fam) entry in TABLES this cell:
#   1. loads the table's Parquet data through Spark (sampled when SAMPLING_RATIO < 1.0),
#   2. converts `efectv_dt` plus the metadata-listed numerical features to pandas,
#   3. computes the median and mean of each feature per calendar month (NaNs dropped;
#      a month with no non-null values yields None),
#   4. saves one wide CSV per table: two rows per feature (median, average) and one
#      column per month, in chronological order.
# Errors are reported per table and do not stop the loop.
print("\n" + "="*80)
print("MONTHLY STATISTICS TRENDS (MEDIAN & AVERAGE)")
print("="*80)

for fam_name, table, fam in TABLES:
    print(f"\nProcessing monthly statistics: {fam_name}-{table}")

    table_path = f"{DATA_PATH}/feature/{table}/parquet" if not fam else f"{DATA_PATH}/feature/{table}_{fam}/parquet"
    table_meta_key = table if not fam else f"{table}_{fam}"

    if fam_name not in feature_metadata or table_meta_key not in feature_metadata[fam_name]:
        print(f"  Skipping (not in metadata)")
        continue

    # Pre-bind so the `finally` clause can release both frames on every exit
    # path (early `continue`, success, or exception).
    df_spark = None
    df = None

    try:
        # Load table
        df_spark = spark.read.format("parquet").load(table_path)

        if 'efectv_dt' not in df_spark.columns:
            print(f"  Skipping (no efectv_dt column)")
            continue

        if SAMPLING_RATIO < 1.0:
            df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)

        # Get ALL numerical features that actually exist in the table
        num_features = feature_metadata[fam_name][table_meta_key].get("num_features", [])
        num_features = [f for f in num_features if f in df_spark.columns]

        if len(num_features) == 0:
            print(f"  Skipping (no numerical features)")
            continue

        df = df_spark.select(['efectv_dt'] + num_features).toPandas()
        df['efectv_dt'] = pd.to_datetime(df['efectv_dt'])

        # Get ALL unique months (both in-time and OOT)
        df['month'] = df['efectv_dt'].dt.to_period('M')
        all_months = sorted(df['month'].unique())

        if len(all_months) == 0:
            print(f"  Skipping (no months found)")
            continue

        print(f"  Calculating statistics for {len(num_features)} features across {len(all_months)} months...")

        month_labels = [str(m) for m in all_months]
        features = [f for f in num_features if f in df.columns]

        # feature -> {month label -> statistic}
        median_values = {feature: {} for feature in features}
        average_values = {feature: {} for feature in features}

        # Slice each month once and reuse the slice for every feature, instead
        # of rebuilding the same boolean mask per (feature, month) pair.
        for month, label in zip(all_months, month_labels):
            df_month = df[df['month'] == month]
            for feature in features:
                month_data = df_month[feature].dropna()
                if len(month_data) > 0:
                    median_values[feature][label] = month_data.median()
                    average_values[feature][label] = month_data.mean()
                else:
                    median_values[feature][label] = None
                    average_values[feature][label] = None

        # Build statistics table: a median row followed by an average row per feature
        stats_rows = []
        for feature in features:
            median_row = {'feature_name': feature, 'stat_method': 'median'}
            median_row.update(median_values[feature])
            stats_rows.append(median_row)

            average_row = {'feature_name': feature, 'stat_method': 'average'}
            average_row.update(average_values[feature])
            stats_rows.append(average_row)

        # Convert to DataFrame and save
        if stats_rows:
            stats_df = pd.DataFrame(stats_rows)
            # Reorder columns: feature_name, stat_method, then months in order
            col_order = ['feature_name', 'stat_method'] + month_labels
            stats_df = stats_df[col_order]

            table_folder_name = f"{fam_name}_{table}" if not fam else f"{fam_name}_{table}_{fam}"
            csv_file = f"{OUTPUT_PATH}monthly_statistics_trends_{table_folder_name}.csv"
            save_pandas_to_csv_adls(stats_df, csv_file)
            print(f"  ✓ Saved monthly statistics trends for {len(num_features)} features")

    except Exception as e:
        print(f"  ✗ Error processing {fam_name}-{table}: {str(e)}")

    finally:
        # Free the (potentially large) frames before the next table is loaded
        del df, df_spark
        gc.collect()

print("\n✓ Monthly statistics trends analysis complete")


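## Monthly PSI Trend Analysis (OOT Months vs In-Time Baseline)

Calculate PSI for each numerical feature in every OOT month against the in-time baseline, save the results per table (`psi_monthly_trends_{table_name}.csv`), and plot one trend chart per feature.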


In [None]:
# Monthly PSI trends for all tables.
#
# For every (fam_name, table, fam) entry in TABLES this cell:
#   1. loads the table's Parquet data through Spark (sampled when SAMPLING_RATIO < 1.0),
#   2. splits the pandas frame into in-time (efectv_dt < OOT_START_DATE) and OOT rows,
#   3. calls calculate_psi(in-time values, month values) for each OOT month and each
#      numerical feature, skipping results that come back as None,
#   4. saves the long-format results (month, feature, psi) as one CSV per table,
#   5. saves one PSI-vs-month line plot per feature with the two threshold lines.
# Errors are reported per table (and per plot) and do not stop the loop.
print("\n" + "="*80)
print("MONTHLY PSI TREND ANALYSIS")
print("="*80)

MONTHLY_TRENDS_PLOT_PATH = PLOT_PATH + "monthly_trends/"
dbutils.fs.mkdirs(MONTHLY_TRENDS_PLOT_PATH)

for fam_name, table, fam in TABLES:
    print(f"\nProcessing monthly trends: {fam_name}-{table}")

    table_path = f"{DATA_PATH}/feature/{table}/parquet" if not fam else f"{DATA_PATH}/feature/{table}_{fam}/parquet"
    table_meta_key = table if not fam else f"{table}_{fam}"

    if fam_name not in feature_metadata or table_meta_key not in feature_metadata[fam_name]:
        print(f"  Skipping (not in metadata)")
        continue

    # Pre-bind so the `finally` clause can release every frame on every exit
    # path (early `continue`, success, or exception).
    df_spark = None
    df = None
    df_intime = None
    df_oot = None

    try:
        # Load table
        df_spark = spark.read.format("parquet").load(table_path)

        if 'efectv_dt' not in df_spark.columns:
            print(f"  Skipping (no efectv_dt column)")
            continue

        if SAMPLING_RATIO < 1.0:
            df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)

        # Get ALL numerical features (not just first 10)
        num_features = feature_metadata[fam_name][table_meta_key].get("num_features", [])
        num_features = [f for f in num_features if f in df_spark.columns]

        if len(num_features) == 0:
            print(f"  Skipping (no numerical features)")
            continue

        df = df_spark.select(['efectv_dt'] + num_features).toPandas()
        df['efectv_dt'] = pd.to_datetime(df['efectv_dt'])

        # Convert OOT_START_DATE to datetime for comparison
        oot_date = pd.to_datetime(OOT_START_DATE)

        # Separate in-time and OOT
        df_intime = df[df['efectv_dt'] < oot_date]
        df_oot = df[df['efectv_dt'] >= oot_date]

        if len(df_intime) == 0 or len(df_oot) == 0:
            print(f"  Skipping (insufficient data)")
            continue

        # Month of every OOT row, computed once and reused for each month slice
        oot_row_months = df_oot['efectv_dt'].dt.to_period('M')
        oot_months = oot_row_months.unique()

        if len(oot_months) == 0:
            print(f"  Skipping (no OOT months)")
            continue

        # Create folder for this table's monthly trend plots
        table_folder_name = f"{fam_name}_{table}" if not fam else f"{fam_name}_{table}_{fam}"
        table_trend_folder = f"{MONTHLY_TRENDS_PLOT_PATH}{table_folder_name}/"
        dbutils.fs.mkdirs(table_trend_folder)

        # Calculate PSI for each month and feature
        monthly_psi_results = []
        print(f"  Calculating PSI for {len(num_features)} features across {len(oot_months)} months...")

        for month in sorted(oot_months):
            df_month = df_oot[oot_row_months == month]
            for feature in num_features:
                psi = calculate_psi(df_intime[feature], df_month[feature])
                if psi is not None:
                    monthly_psi_results.append({
                        'month': str(month),
                        'feature': feature,
                        'psi': psi
                    })

        # Save monthly PSI results CSV per table
        if monthly_psi_results:
            monthly_psi_df = pd.DataFrame(monthly_psi_results)
            csv_file = f"{OUTPUT_PATH}psi_monthly_trends_{table_folder_name}.csv"
            save_pandas_to_csv_adls(monthly_psi_df, csv_file)

            # Create individual trend plots for each feature
            print(f"  Creating {len(num_features)} individual trend plots...")
            saved_count = 0
            failed_count = 0

            for feature in num_features:
                fig = None
                try:
                    # Get data for this feature
                    feature_data = monthly_psi_df[monthly_psi_df['feature'] == feature].sort_values('month')

                    if len(feature_data) > 0:
                        # Create individual figure for each feature
                        fig, ax = plt.subplots(figsize=(12, 6))

                        # Plot trend
                        ax.plot(feature_data['month'], feature_data['psi'],
                                marker='o', linewidth=2, markersize=6, color='steelblue')

                        # Add threshold lines; labels are derived from the
                        # constants so the legend cannot disagree with the lines
                        ax.axhline(y=PSI_THRESHOLD_MODERATE, color='orange',
                                   linestyle='--', linewidth=1.5,
                                   label=f'Moderate ({PSI_THRESHOLD_MODERATE})')
                        ax.axhline(y=PSI_THRESHOLD_SIGNIFICANT, color='red',
                                   linestyle='--', linewidth=1.5,
                                   label=f'Significant ({PSI_THRESHOLD_SIGNIFICANT})')

                        # Style the plot
                        ax.set_title(f'Monthly PSI Trend: {feature}\n({table_folder_name})',
                                     fontsize=12, fontweight='bold')
                        ax.set_ylabel('PSI', fontsize=10)
                        ax.set_xlabel('Month', fontsize=10)
                        ax.legend(fontsize=9, loc='best')
                        ax.grid(True, alpha=0.3, linestyle='--')
                        ax.tick_params(axis='x', rotation=45)

                        fig.tight_layout()

                        # Save individual plot (no display - saved directly to ADLS)
                        plot_file = f"{table_trend_folder}{feature}.png"
                        save_plot_to_adls(fig, plot_file, dpi=150)
                        saved_count += 1
                    else:
                        # No PSI rows were recorded for this feature
                        failed_count += 1
                        print(f"    Warning: Skipped {feature} (no data)")

                except Exception as e:
                    failed_count += 1
                    print(f"    Warning: Could not plot {feature}: {str(e)}")

                finally:
                    # Always close the figure, success or failure, to free memory
                    if fig is not None:
                        plt.close(fig)

            print(f"  ✓ Monthly trend plots saved: {saved_count} successful, {failed_count} failed")
            print(f"    Location: {table_trend_folder}")

    except Exception as e:
        print(f"  ✗ Error processing {fam_name}-{table}: {str(e)}")

    finally:
        # Free the (potentially large) frames before the next table is loaded
        del df, df_intime, df_oot, df_spark
        gc.collect()

print("\n✓ Monthly PSI trend analysis complete")
