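# Feature Profiling by Table (Spark Profiling, Pandas Output)

## Overview
Comprehensive feature profiling that computes statistics with **Spark** on a sampled table and uses **pandas** only for the small result tables and plot samples.

### Optimizations:
- **Spark-level sampling**: Each table is down-sampled with `.sample(fraction=SAMPLING_RATIO)` before profiling
- **Distributed statistics**: Calculate stats in Spark without loading the full table into driver memory
- **Memory efficient**: Cache one sampled table at a time and release it explicitly (`unpersist()`, `gc.collect()`)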

### Statistics Calculated:
- Data type, % zeros, n_unique
- Most frequent value and percentage
- Distribution summary: min, 1%, 50% (median), 99%, max, and mean

### Outputs:
- Feature profiling CSVs per table with **separate statistics for In-Time vs OOT**
  - Each feature has two rows: one for 'In-Time' period, one for 'OOT' period
  - Includes `time_period` column to distinguish periods
- Individual boxplots for each numerical feature (one PNG per feature)
  - Saved in table-specific folders: `plots/{table_name}/`
  - Each file named: `{feature_name}.png`
  - Comparing OOT vs in-time distributions

---


In [None]:
%pip install --upgrade pandas==2 -i https://repo.td.com/repository/pypi-all/simple

In [None]:
# Restart the Python process after the %pip install cell; presumably needed so
# the upgraded pandas is the version that gets imported — confirm against the
# Databricks library utility docs.
dbutils.library.restartPython()

In [None]:
# Print the pandas version that is active in this Python process.
import pandas
print(pandas.__version__)

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
import gc
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
import warnings
warnings.filterwarnings('ignore')

# Helper functions
def save_pandas_to_csv_adls(df_pandas, adls_path):
    """Write a pandas DataFrame to ``adls_path`` as CSV text.

    The frame is rendered without its index and passed to ``dbutils.fs.put``
    with ``overwrite=True``; a confirmation line is printed afterwards.
    """
    dbutils.fs.put(adls_path, df_pandas.to_csv(index=False), overwrite=True)
    print(f"✓ Saved CSV to {adls_path}")

def save_plot_to_adls(fig, adls_path, dpi=150):
    """Render a figure to PNG and copy it to ``adls_path``.

    The PNG is written to a local temporary file first because
    ``dbutils.fs.cp`` is given a ``file:`` source path, then copied to the
    destination. The temporary file is always removed, including when
    rendering or the copy raises.

    Args:
        fig: Object exposing matplotlib's ``savefig`` method.
        adls_path: Destination path passed to ``dbutils.fs.cp``.
        dpi: Resolution forwarded to ``savefig``.
    """
    import os
    import tempfile

    # delete=False: the file has to outlive this ``with`` block so it can be
    # re-opened by path below; it is removed explicitly in the ``finally``.
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.png', delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # Render straight to the temp file instead of staging the bytes in an
        # in-memory buffer first.
        fig.savefig(tmp_path, format='png', dpi=dpi, bbox_inches='tight')
        dbutils.fs.cp(f"file:{tmp_path}", adls_path)
    finally:
        # Without this, every failed copy would leave a PNG behind on disk.
        os.remove(tmp_path)
    # No print here on purpose: one line per plot was removed to reduce I/O congestion

# Spark-based profiling function (one result row per feature)
def profile_features_spark(df_spark, features, num_features, cat_features, time_period=None):
    """Profile features with Spark aggregations and return one dict per feature.

    Features that are not columns of ``df_spark`` are skipped. Every returned
    dict carries the keys ``feature``, ``data_type``, ``time_period``,
    ``pct_zero``, ``n_unique``, ``most_frequent_value``, ``pct_most_frequent``,
    ``min``, ``max``, ``p1``, ``median``, ``p99`` and ``mean``.

    Notes on the values:
        * ``pct_zero`` is ``zero_count / total_rows`` (a fraction, nulls
          included in the denominator) and is ``None`` for categoricals.
        * ``pct_most_frequent`` is a fraction of the non-null rows.
        * ``p1``/``median``/``p99`` come from ``approxQuantile`` with a
          relative error of 0.01 and are ``None`` for categoricals.

    Args:
        df_spark: Spark DataFrame to profile.
        features: Column names to profile, in output order.
        num_features: Not used by this function; kept so existing callers
            keep working.
        cat_features: Names treated as categorical; every other feature is
            treated as numerical.
        time_period: Label stored in each row; falsy values become ``'All'``.

    Returns:
        list[dict]: One statistics dict per feature found in ``df_spark``.
    """
    available = set(df_spark.columns)
    present = [name for name in features if name in available]
    if not present:
        return []

    categorical = set(cat_features)
    period_label = time_period or 'All'
    # The row count is the same for every feature: run this Spark job once
    # instead of once per feature.
    total_count = df_spark.count()

    results = []
    for feature in present:
        is_cat = feature in categorical
        stats = {
            'feature': feature,
            'data_type': 'categorical' if is_cat else 'numerical',
            'time_period': period_label,
        }

        df_feat = df_spark.select(feature).filter(F.col(feature).isNotNull())
        # An empty frame cannot hold non-null values, so skip the extra job.
        non_null_count = df_feat.count() if total_count else 0

        if non_null_count == 0:
            stats.update({
                'pct_zero': None, 'n_unique': 0, 'most_frequent_value': None,
                'pct_most_frequent': None, 'min': None, 'max': None,
                'p1': None, 'median': None, 'p99': None, 'mean': None,
            })
            results.append(stats)
            continue

        # Share of zeros (numerical features only)
        if is_cat:
            stats['pct_zero'] = None
        else:
            zero_count = df_spark.filter(F.col(feature) == 0).count()
            stats['pct_zero'] = zero_count / total_count

        # Number of distinct non-null values
        stats['n_unique'] = df_feat.distinct().count()

        # Most frequent non-null value and its share of the non-null rows
        top_rows = (
            df_feat.groupBy(feature).count()
            .orderBy(F.desc('count'))
            .limit(1)
            .collect()
        )
        if top_rows:
            stats['most_frequent_value'] = top_rows[0][0]
            stats['pct_most_frequent'] = top_rows[0][1] / non_null_count
        else:
            stats['most_frequent_value'] = None
            stats['pct_most_frequent'] = None

        if is_cat:
            _add_categorical_range(stats, df_feat, feature)
        else:
            _add_numeric_summary(stats, df_feat, feature)

        results.append(stats)

    return results


def _to_float(value):
    """Return ``value`` as a float, passing ``None`` through unchanged."""
    return None if value is None else float(value)


def _add_numeric_summary(stats, df_feat, feature):
    """Add min/max/mean and p1/median/p99 of a numerical feature to ``stats``.

    On any Spark or conversion error all six values are set to ``None`` and a
    one-line notice is printed, so one bad column does not abort the table.
    """
    try:
        # Cast to double so the aggregations also work on non-double columns
        as_double = df_feat.select(F.col(feature).cast('double').alias(feature))

        summary = as_double.select(
            F.min(feature).alias('min'),
            F.max(feature).alias('max'),
            F.mean(feature).alias('mean'),
        ).collect()[0]
        stats['min'] = _to_float(summary['min'])
        stats['max'] = _to_float(summary['max'])
        stats['mean'] = _to_float(summary['mean'])

        # Approximate quantiles (relative error 0.01). Indexing an empty
        # result raises and is handled by the fallback below.
        quantiles = as_double.approxQuantile(feature, [0.01, 0.5, 0.99], 0.01)
        stats['p1'] = _to_float(quantiles[0])
        stats['median'] = _to_float(quantiles[1])
        stats['p99'] = _to_float(quantiles[2])
    except Exception as exc:
        # Was a bare ``except:``; keep the best-effort fallback but stop
        # trapping KeyboardInterrupt/SystemExit and leave a trace of the failure.
        print(f"    ! Numeric summary unavailable for {feature} ({type(exc).__name__})")
        stats.update({'min': None, 'p1': None, 'median': None, 'p99': None, 'max': None, 'mean': None})


def _add_categorical_range(stats, df_feat, feature):
    """Add min/max of a categorical feature to ``stats``; percentiles and mean are ``None``."""
    try:
        min_max = df_feat.select(
            F.min(feature).alias('min'),
            F.max(feature).alias('max'),
        ).collect()[0]
        stats['min'] = min_max['min']
        stats['max'] = min_max['max']
        stats.update({'p1': None, 'median': None, 'p99': None, 'mean': None})
    except Exception as exc:
        print(f"    ! Min/max unavailable for {feature} ({type(exc).__name__})")
        stats.update({'min': None, 'max': None, 'p1': None, 'median': None, 'p99': None, 'mean': None})

# Batch boxplot creation from a capped per-feature extract of the Spark DataFrame
def create_boxplots_batch_spark(df_spark, features, table_folder_name, table_plot_folder,
                                oot_date, plot_sample_size=10000):
    """Create one In-Time vs OOT boxplot PNG per feature.

    For every feature a capped number of non-null rows is pulled to pandas,
    coerced to numeric, split on ``efectv_dt`` at ``oot_date`` and drawn as a
    horizontal two-box plot saved to ``{table_plot_folder}{feature}.png``.
    A feature counts as failed when it has no usable rows, when either period
    is empty, or when any step raises; failures never abort the batch.

    Args:
        df_spark: Spark DataFrame containing ``efectv_dt`` and the features.
        features: Feature column names to plot.
        table_folder_name: Label shown in the plot title.
        table_plot_folder: Destination folder prefix (used as-is, so it is
            expected to end with a path separator).
        oot_date: Cut-off; rows with ``efectv_dt >= oot_date`` are OOT.
        plot_sample_size: Maximum number of rows pulled per feature.

    Returns:
        tuple[int, int]: ``(saved_count, failed_count)``.
    """
    saved_count = 0
    failed_count = 0

    for feature in features:
        fig = None
        try:
            df_sample = _collect_plot_sample(df_spark, feature, oot_date, plot_sample_size)

            if len(df_sample) == 0:
                failed_count += 1
                continue

            # Convert to numeric; values that cannot be parsed become NaN and are dropped
            df_sample[feature] = pd.to_numeric(df_sample[feature], errors='coerce')
            df_sample = df_sample.dropna()

            if len(df_sample) == 0:
                failed_count += 1
                continue

            # Split by time period
            df_sample['efectv_dt'] = pd.to_datetime(df_sample['efectv_dt'])
            intime_data = df_sample[df_sample['efectv_dt'] < oot_date][feature].values
            oot_data = df_sample[df_sample['efectv_dt'] >= oot_date][feature].values

            if len(intime_data) == 0 or len(oot_data) == 0:
                # A comparison plot needs data on both sides of the cut-off
                failed_count += 1
                continue

            fig, ax = plt.subplots(figsize=(10, 6))

            bp = ax.boxplot([intime_data, oot_data],
                            labels=['In-Time', 'OOT'],
                            vert=False,
                            patch_artist=True,
                            showmeans=False,
                            showfliers=True)

            # Style
            colors = ['lightblue', 'lightcoral']
            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)
                patch.set_alpha(0.7)

            for flier in bp['fliers']:
                flier.set_marker('o')
                flier.set_markerfacecolor('black')
                flier.set_markersize(3)
                flier.set_alpha(0.1)

            ax.set_title(f'{feature}\n({table_folder_name})', fontsize=12, fontweight='bold')
            ax.set_xlabel('Value', fontsize=10)
            ax.set_ylabel('Time Period', fontsize=10)
            ax.grid(True, alpha=0.3, axis='x', linestyle='--')

            plt.tight_layout()

            # Save
            plot_file = f"{table_plot_folder}{feature}.png"
            save_plot_to_adls(fig, plot_file, dpi=150)

            saved_count += 1

        except Exception as exc:
            # Best effort: keep going, but say which feature failed and why
            failed_count += 1
            print(f"    ! Boxplot failed for {feature}: {type(exc).__name__}: {str(exc)[:200]}")
        finally:
            # Close the figure on every path; otherwise a failed save leaves
            # it open and figures pile up over a long feature list.
            if fig is not None:
                plt.close(fig)

    return saved_count, failed_count


def _collect_plot_sample(df_spark, feature, oot_date, sample_size):
    """Pull up to ``sample_size`` non-null rows of ``feature`` to pandas, half per period.

    A single ``limit`` over the whole frame returns whichever rows Spark reads
    first; on data laid out by date that can be one period only, which makes
    the comparison plot impossible. Capping each period separately keeps both
    sides represented. Rows are still the first ones found within each period,
    not a random draw.
    """
    non_null = df_spark.select('efectv_dt', feature).filter(F.col(feature).isNotNull())
    per_period = max(1, sample_size // 2)
    cutoff = F.lit(oot_date)

    intime_rows = non_null.filter(F.col('efectv_dt') < cutoff).limit(per_period)
    oot_rows = non_null.filter(F.col('efectv_dt') >= cutoff).limit(per_period)
    return intime_rows.unionByName(oot_rows).toPandas()

# Printed once the helper definitions in this cell have been executed.
print("✓ Setup complete with Spark optimizations")


In [None]:
# Configuration
# Input data root, plus output locations for profiling CSVs and boxplot PNGs
DATA_PATH = "abfss://home@edaaaazepcalayelaye0001.dfs.core.windows.net/MD_Artifacts/money-out/data/"
OUTPUT_PATH = "abfss://home@edaaaazepcalayelaye0001.dfs.core.windows.net/MD_Artifacts/money-out/mv/eda_validation/feature_profiling/"
PLOT_PATH = f"{OUTPUT_PATH}plots/"

# Create both output folders up front
dbutils.fs.mkdirs(OUTPUT_PATH)
dbutils.fs.mkdirs(PLOT_PATH)

# Row-sampling fractions and the first date of the out-of-time (OOT) period
SAMPLING_RATIO = 0.01
PLOT_SAMPLING_RATIO = 0.01
OOT_START_DATE = '2024-01-01'

# Feature tables to analyze, as (family name, table name, family id) tuples;
# an empty string as the third element means the table has no family suffix
TABLES = [
    ("cust", "batch_credit_bureau", ''),
    ("cust", "cust_basic_sumary", ''),
    ("dem", "acct_trans", 2438),
    ("cc", "acct_trans", 2444),
    ("dem", "acct", 2438),
    ("cc", "acct", 2444),
    ("loc", "acct", 2442),
    ("loan", "acct", 2439),
    ("mtg", "acct", 2440),
    ("inv", "acct", 1331),
]

# Load metadata: read the file line by line, then parse the re-joined text
# as a single JSON document
feature_metadata_rows = spark.read.text(f"{DATA_PATH}/feature/feature_metadata.jsonl").collect()
feature_metadata = json.loads('\n'.join(row.value for row in feature_metadata_rows))

print("✓ Config loaded")


## Processing Strategy: Sampled Full-Table (Accuracy Prioritized)

### Why This Approach?
This notebook calculates the **median and tail percentiles (p1, p99)**, which **cannot be computed exactly from per-chunk results**: a quantile depends on the rank of each value among all values, so the values must be considered together.

### Memory Efficiency:
- **Memory usage**: Scales with SAMPLING_RATIO
- **Mitigation**: Process one table at a time (10 tables total), free memory between tables
- **Recommendation for memory issue**: Use `SAMPLING_RATIO = 0.01` (1%) for accurate results with manageable memory

### How It Works:
```
For each table (10 total):
  1. Load FULL table via Spark (efficient Parquet reading)
  2. Apply sampling at Spark level: .sample(fraction=SAMPLING_RATIO)
  3. Cache the sample and split it into In-Time / OOT on efectv_dt
  4. Calculate statistics in Spark for each period:
     - p1, median, p99: approxQuantile(col, [0.01, 0.5, 0.99], 0.01)
     - mean, min, max, n_unique, most frequent value, pct_zero
  5. Convert only small pieces to pandas: the result rows and a capped plot sample (.toPandas())
  6. Free memory before next table (unpersist(); gc.collect())
```

### Why Incremental Doesn't Work Here:
- ❌ median(chunk1) + median(chunk2) ≠ median(all_data)
- ❌ p99(chunk1) combined with p99(chunk2) ≠ p99(all_data)
- ✅ Must see all sampled values together to calculate correct percentiles
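- ✅ 1% sampling gives statistics computed over one representative sample (min, max, mean, and counts are exact for the sample; p1/median/p99 come from Spark's `approxQuantile` with 0.01 relative error)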

### Alternative Considered:
Could use approximate streaming algorithms (T-Digest, Q-Digest) to estimate percentiles over the full table without sampling, but:
- ❌ Introduces approximation error
- ❌ Complex to implement and debug
- ✅ 1% sampling keeps memory manageable and confines approximation to the quantile step (`approxQuantile`, 0.01 relative error)
- ✅ Simpler code is easier to maintain

---


## Success Criteria and Expected Results

### ✅ **Profiling Succeeds If**:
- All tables processed successfully
- Statistics calculated for **all features** (numerical + categorical) in metadata
- **Separate statistics** calculated for In-Time vs OOT periods
- No excessive missing values (>99.9%) unless expected
- Reasonable value ranges (no extreme outliers unless business-valid)
- Categorical features have reasonable cardinality
- **Time-period comparison** shows expected differences between In-Time and OOT

### 📊 **Statistics Calculated Per Feature (Per Time Period)**:
| Statistic | Numerical | Categorical | Notes |
|-----------|-----------|-------------|-------|
| time_period | ✓ | ✓ | 'In-Time' or 'OOT' ('All' for tables without `efectv_dt`) |
| feature | ✓ | ✓ | Feature name |
| data_type | ✓ | ✓ | Identifies feature type |
| pct_zero | ✓ | ✗ | Fraction (0–1) of rows equal to 0; empty for categorical features |
| n_unique | ✓ | ✓ | Number of distinct non-null values |
| most_frequent_value | ✓ | ✓ | Mode |
| pct_most_frequent | ✓ | ✓ | Fraction (0–1) of non-null values equal to the mode |
| min | ✓ | ✓ | Minimum value |
| max | ✓ | ✓ | Maximum value |
| p1 | ✓ | ✗ | 1st percentile |
| median (p50) | ✓ | ✗ | 50th percentile |
| p99 | ✓ | ✗ | 99th percentile |
| mean | ✓ | ✗ | Average value |

### 📈 **Time-Period Analysis**:
Each feature has **two rows** in the output CSV:
- **Row 1**: Statistics for 'In-Time' period (before 2024-01-01)
- **Row 2**: Statistics for 'OOT' period (2024-01-01 and after)

**What to Compare**:
- **Distributions**: Compare median, mean, percentiles between periods
- **Sparsity**: Compare pct_zero (may increase/decrease over time)
- **Cardinality**: Compare n_unique for categorical features
- **Ranges**: Compare min/max values (may indicate data quality issues)
- **Mode**: Compare most_frequent_value (distribution shifts)

### ⚠️ **Potential Issues to Flag**:
- **Features with >99% zeros** (may be redundant or sparse)
- **Features with only 1 unique value** (constant features - no information)
- **Features with extreme ranges** (may need normalization or clipping)
- **Categorical features with very high cardinality** (>1000 categories - may need bucketing)
- **Features missing from certain tables** (expected for table-specific features)
- **Large differences between In-Time and OOT**:
  - Significant shifts in median/mean (potential drift)
  - Large changes in pct_zero (sparsity changes)
  - Cardinality changes in categoricals (new categories appear/disappear)

### 📊 **Boxplot Visualizations**:
- **One boxplot per numerical feature** comparing In-Time vs OOT
- Saved individually: `plots/{table_name}/{feature_name}.png`
- **What to look for**:
  - Distribution shifts (boxes at different positions)
  - Spread changes (box width differences)
  - Outlier patterns (fliers in different locations)
  - Median differences (position of the median line inside each box; the boxes are drawn horizontally)

---


In [None]:
# Main driver: for each configured table, profile the sampled data per time
# period, save the statistics as CSV, then save one boxplot per numerical feature.
print("="*80)
print("FEATURE PROFILING (SPARK-OPTIMIZED)")
print("="*80)

# The OOT cut-off is identical for every table, so parse it once
oot_date = pd.to_datetime(OOT_START_DATE)

for fam_name, table, fam in TABLES:
    print(f"\nProcessing: {fam_name}-{table}")

    # Tables with a family id use "<table>_<id>" both as folder name and metadata key
    table_meta_key = table if not fam else f"{table}_{fam}"
    table_path = f"{DATA_PATH}/feature/{table_meta_key}/parquet"

    family_meta = feature_metadata.get(fam_name, {})
    if table_meta_key not in family_meta:
        # Say so explicitly; a silent skip looks like a successful run
        print(f"  Skipped: no metadata entry for {fam_name}/{table_meta_key}")
        continue

    table_meta = family_meta[table_meta_key]
    num_features = table_meta.get("num_features", [])
    cat_features = list(table_meta.get("cat_features", {}).keys())

    # Load the table once and down-sample it at the Spark level
    df_spark = spark.read.format("parquet").load(table_path)
    if SAMPLING_RATIO < 1.0:
        df_spark = df_spark.sample(fraction=SAMPLING_RATIO, withReplacement=False, seed=42)

    # Cache the sample: it is scanned many times (several jobs per feature)
    df_spark.cache()
    try:
        table_columns = set(df_spark.columns)
        all_features = num_features + cat_features
        all_stats = []
        has_efectv_dt = 'efectv_dt' in table_columns

        if has_efectv_dt:
            # Split into time periods using Spark
            df_intime = df_spark.filter(F.col('efectv_dt') < F.lit(oot_date))
            df_oot = df_spark.filter(F.col('efectv_dt') >= F.lit(oot_date))

            print("  Profiling In-Time features using Spark...")
            all_stats.extend(
                profile_features_spark(df_intime, all_features, num_features, cat_features, 'In-Time')
            )

            print("  Profiling OOT features using Spark...")
            all_stats.extend(
                profile_features_spark(df_oot, all_features, num_features, cat_features, 'OOT')
            )
        else:
            # No date column: profile the whole sample as a single period
            print("  Profiling all features using Spark...")
            all_stats.extend(
                profile_features_spark(df_spark, all_features, num_features, cat_features, 'All')
            )

        # Save profiling results with the identifying columns first
        if all_stats:
            results_df = pd.DataFrame(all_stats)
            lead_cols = ['time_period', 'feature', 'data_type']
            col_order = lead_cols + [c for c in results_df.columns if c not in lead_cols]
            results_df = results_df[col_order]
            save_pandas_to_csv_adls(results_df, f"{OUTPUT_PATH}feature_profile_{fam_name}_{table}.csv")

        # Boxplots compare In-Time vs OOT, so they need the date column
        if has_efectv_dt:
            plot_features = [name for name in num_features if name in table_columns]

            if plot_features:
                # Features are plotted one after another (not in parallel)
                print(f"  Creating {len(plot_features)} individual boxplots...")

                table_folder_name = f"{fam_name}_{table}" if not fam else f"{fam_name}_{table}_{fam}"
                table_plot_folder = f"{PLOT_PATH}{table_folder_name}/"
                dbutils.fs.mkdirs(table_plot_folder)

                saved_count, failed_count = create_boxplots_batch_spark(
                    df_spark, plot_features, table_folder_name, table_plot_folder, oot_date
                )

                print(f"  ✓ Boxplots saved: {saved_count} successful, {failed_count} failed")
                print(f"    Location: {table_plot_folder}")
    finally:
        # Release the cached sample even when profiling or plotting raised,
        # so a failed table does not keep its cache pinned.
        df_spark.unpersist()
        gc.collect()

print("\n✓ Feature profiling complete")
