In [1]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType
from sklearn.model_selection import train_test_split


In [2]:
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    os.chdir('..')
    print("Adjusted working directory to project root")

from utils.preprocessing import PreprocessorFactory, DataQualityChecker
import utils.model_training as mt

%load_ext autoreload
%autoreload 2

Adjusted working directory to project root


## Initialize Spark Session

In [3]:
# Build (or reuse, via getOrCreate) a Spark session running in local mode.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("ML_Pipeline_Training")
    .master("local[*]")
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR-level messages from this point on.
spark.sparkContext.setLogLevel("ERROR")

print("Spark session initialized successfully")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/28 08:27:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/28 08:27:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark session initialized successfully


## Build Training Configuration

In [4]:
# Every tunable of this training run, gathered in one mapping so the full
# set of choices is visible at a glance before it is handed to the builder.
training_options = {
    "training_date_str": "2024-09-01",
    "models_to_train": ["logistic_regression", "random_forest", "xgboost"],
    "train_test_period_months": 12,
    "oot_period_months": 2,
    "train_test_ratio": 0.8,
    "cv_folds": 3,
    "hyperparameter_iterations": 10,
    "random_state": 88,
    "feature_store_path": "datamart/gold/feature_store/",
    "label_store_path": "datamart/gold/label_store/",
    "model_bank_directory": "models/",
}

# `config` is the shared settings dict consumed by the later pipeline steps.
config = mt.build_training_config(**training_options)

print("Training configuration built successfully:")
pprint.pprint(config)


Training configuration built successfully:
{'cv_folds': 3,
 'feature_store_path': 'datamart/gold/feature_store/',
 'hyperparameter_iterations': 10,
 'label_store_path': 'datamart/gold/label_store/',
 'model_artifacts_output': {},
 'model_bank_directory': 'models/',
 'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
 'model_train_date_str': '2024-09-01',
 'models_to_train': ['logistic_regression', 'random_forest', 'xgboost'],
 'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
 'oot_period_months': 2,
 'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
 'random_state': 88,
 'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)}


## Load Training Data

In [5]:
print("Loading training data from feature and label stores...")
data_pdf = mt.load_training_data(config, spark)

# The loader signals failure by returning None; abort instead of carrying
# an unusable value into the following steps.
if data_pdf is None:
    raise ValueError("Failed to load training data")

# Identifier, date, target and one example feature for a quick sanity preview.
preview_columns = ['Customer_ID', 'snapshot_date', 'label', 'annual_income']

print(f"Training data loaded successfully: {data_pdf.shape}")
print("\nSample merged data:")
print(data_pdf[preview_columns].head())

Loading training data from feature and label stores...
Loading training data...
Training period: 2023-07-01 to 2024-06-30
OOT period: 2024-07-01 to 2024-08-31
Loaded 6961 label records


                                                                                

Loaded 127189 feature records


                                                                                

Merged dataset shape: (6961, 140)
Training data loaded successfully: (6961, 140)

Sample merged data:
  Customer_ID snapshot_date  label  annual_income
0  CUS_0x1015    2024-02-01      0   46951.019531
1  CUS_0x10eb    2023-09-01      0   28315.949219
2  CUS_0x10ff    2024-04-01      0   16341.075195
3  CUS_0x112f    2024-04-01      1   75506.421875
4  CUS_0x117d    2024-03-01      0   70779.179688


## Validate Data Quality

In [6]:
print("Validating training data quality...")
quality_report = mt.validate_training_data(data_pdf, config)

print("Data Quality Report:")
pprint.pprint(quality_report)

# A missing or empty 'data_quality_flags' entry is treated as a clean result;
# anything else is surfaced as a warning but does not stop the notebook.
quality_flags = quality_report.get('data_quality_flags')
if not quality_flags:
    print("\nData quality validation passed successfully")
else:
    print(f"\nWARNING: Data quality issues detected: {quality_flags}")


Validating training data quality...
Validating training data quality...
Data quality validation completed
Data quality validation passed
Data Quality Report:
{'data_quality_flags': [],
 'data_shape': {'columns': 136, 'rows': 6961},
 'feature_types': {'categorical': ['loan_id',
                                   'credit_mix',
                                   'payment_of_min_amount',
                                   'occupation',
                                   'age_group',
                                   'occupation_category'],
                   'numerical': ['annual_income',
                                 'monthly_salary',
                                 'credit_history_months',
                                 'credit_utilization_ratio',
                                 'delay_from_due_date',
                                 'num_loans',
                                 'num_delayed_payments',
                                 'outstanding_debt',
                         

## Prepare Training Datasets

In [7]:
print("Preparing training datasets...")
datasets = mt.prepare_training_datasets(data_pdf, config)

# A None result means dataset preparation failed; stop here.
if datasets is None:
    raise ValueError("Failed to prepare training datasets")

print("Training datasets prepared successfully:")

# Report size and mean label for each split; `datasets` exposes the splits
# under the keys X_<split> / y_<split>.
for split_label, split_key in (("Train", "train"), ("Test", "test"), ("OOT", "oot")):
    sample_count = datasets[f"X_{split_key}"].shape[0]
    label_rate = datasets[f"y_{split_key}"].mean()
    print(f"  {split_label}: {sample_count} samples, label rate: {label_rate:.3f}")

print(f"  Features: {len(datasets['feature_columns'])}")


Preparing training datasets...
Preparing training datasets...
Dataset preparation completed:
  Train: 4766 samples, label rate: 0.281
  Test: 1192 samples, label rate: 0.293
  OOT: 1003 samples, label rate: 0.290
  Features: 136
Training datasets prepared successfully:
  Train: 4766 samples, label rate: 0.281
  Test: 1192 samples, label rate: 0.293
  OOT: 1003 samples, label rate: 0.290
  Features: 136


## Model-Specific Preprocessing

In [8]:
# Run the model-specific preprocessing for every requested model and keep
# only the models whose preprocessing succeeded for the training step.
print("Preparing model-specific preprocessing...")
preprocessing_results = {}
failed_preprocessing = []

# config["models_to_train"] is narrowed at the end of this cell. Record the
# originally requested models once, before that happens; setdefault leaves the
# record untouched when this cell is executed again, so a re-run retries every
# requested model instead of only the survivors of the previous run.
config.setdefault("models_requested", list(config["models_to_train"]))

for model_type in config["models_requested"]:
    print(f"\nProcessing {model_type}...")

    preprocessing_result = mt.prepare_model_preprocessing(
        datasets, model_type, config["random_state"]
    )

    # A None result marks a failed preprocessing run for this model; remember
    # it for the report below and carry on with the remaining models.
    if preprocessing_result is None:
        print(f"✗ {model_type} preprocessing failed")
        failed_preprocessing.append(model_type)
        continue

    preprocessing_results[model_type] = preprocessing_result
    stats = preprocessing_result['preprocessing_stats']
    print(f"✓ {model_type}: {stats['original_features']} -> {stats['final_features']} features")

successful_models = list(preprocessing_results.keys())

# Make dropped models explicit instead of letting them disappear silently.
if failed_preprocessing:
    print(f"\nModels dropped after failed preprocessing: {failed_preprocessing}")

# Fail fast, in line with the data-loading and dataset-preparation cells,
# rather than continuing with nothing to train. The config is left untouched
# in this case.
if not successful_models:
    raise ValueError("No models passed preprocessing; nothing to train")

# The training cell looks up preprocessing_results for every entry of
# config["models_to_train"], so restrict that list to the models that have one.
config["models_to_train"] = successful_models
print(f"\nModels ready for training: {successful_models}")


Preparing model-specific preprocessing...

Processing logistic_regression...
Preparing preprocessing for logistic_regression...
[LR Preprocessor] Starting preprocessing for 4766 samples with 136 features
[LR Preprocessor] Selected 7 financial features
[LR Preprocessor] Selected 2 demographic features
[LR Preprocessor] Selected 4 categorical features
[LR Preprocessor] Selected 10 clickstream features
[LR Preprocessor] Selected 3 quality features
[LR Preprocessor] Preprocessing complete: 136 -> 36 features
logistic_regression: 136 -> 36 features
✓ logistic_regression: 136 -> 36 features

Processing random_forest...
Preparing preprocessing for random_forest...
[RF Preprocessor] Starting preprocessing for 4766 samples with 136 features
[RF Preprocessor] Selected 20 financial features
[RF Preprocessor] Selected 100 clickstream features
[RF Preprocessor] Selected 6 interaction features
[RF Preprocessor] Selected 9 demographic features
[RF Preprocessor] Selected 5 quality features
[RF Preproc

## Train Multiple Models

In [9]:
print("Starting model training phase...")
model_artifacts = {}
training_results = {}

# Metrics copied verbatim from each artifact's 'results' section, in the
# order they should appear as columns of the comparison table.
reported_metrics = (
    'auc_train', 'auc_test', 'auc_oot',
    'gini_train', 'gini_test', 'gini_oot',
)
banner = '=' * 60

for model_type in config["models_to_train"]:
    print(f"\n{banner}")
    print(f"TRAINING {model_type.upper()}")
    print(banner)

    preprocessing_result = preprocessing_results[model_type]

    model_artifact = mt.train_single_model(
        model_type=model_type,
        preprocessing_result=preprocessing_result,
        datasets=datasets,
        config=config
    )

    if model_artifact is not None:
        # Keep the full artifact for saving/validation and a flat metric
        # summary for the comparison table.
        model_artifacts[model_type] = model_artifact
        model_metrics = model_artifact['results']
        metric_summary = {name: model_metrics[name] for name in reported_metrics}
        metric_summary['training_time'] = model_artifact['training_metadata']['training_time_seconds']
        metric_summary['cv_score'] = model_metrics['cv_score']
        training_results[model_type] = metric_summary

        print(f"✓ {model_type} training completed successfully")
    else:
        # None signals a failed training run; record it and move on.
        print(f"✗ {model_type} training failed")
        training_results[model_type] = {'error': 'Training failed'}

print(f"\nTraining phase completed: {len(model_artifacts)}/{len(config['models_to_train'])} models trained successfully")


Starting model training phase...

TRAINING LOGISTIC_REGRESSION

[LOGISTIC_REGRESSION] Starting model training...
[LOGISTIC_REGRESSION] Train samples: 4766
[LOGISTIC_REGRESSION] Test samples: 1192
[LOGISTIC_REGRESSION] OOT samples: 1003
[LOGISTIC_REGRESSION] Features: 36
[LOGISTIC_REGRESSION] Starting hyperparameter search...
[LOGISTIC_REGRESSION] Search iterations: 10
[LOGISTIC_REGRESSION] CV folds: 3




[LOGISTIC_REGRESSION] Best CV score: 0.8041
[LOGISTIC_REGRESSION] Performance Results:
[LOGISTIC_REGRESSION]   Train AUC: 0.8114 (Gini: 0.623)
[LOGISTIC_REGRESSION]   Test AUC:  0.8060 (Gini: 0.612)
[LOGISTIC_REGRESSION]   OOT AUC:   0.7878 (Gini: 0.576)
[LOGISTIC_REGRESSION] Training completed in 6.06 seconds
✓ logistic_regression training completed successfully

TRAINING RANDOM_FOREST

[RANDOM_FOREST] Starting model training...
[RANDOM_FOREST] Train samples: 4766
[RANDOM_FOREST] Test samples: 1192
[RANDOM_FOREST] OOT samples: 1003
[RANDOM_FOREST] Features: 140
[RANDOM_FOREST] Starting hyperparameter search...
[RANDOM_FOREST] Search iterations: 10
[RANDOM_FOREST] CV folds: 3
[RANDOM_FOREST] Best CV score: 0.8668
[RANDOM_FOREST] Performance Results:
[RANDOM_FOREST]   Train AUC: 0.9868 (Gini: 0.974)
[RANDOM_FOREST]   Test AUC:  0.8802 (Gini: 0.760)
[RANDOM_FOREST]   OOT AUC:   0.8519 (Gini: 0.704)
[RANDOM_FOREST] Training completed in 26.09 seconds
✓ random_forest training completed suc

## Training Results Analysis

In [10]:
section_rule = "=" * 80
print("\n" + section_rule)
print("TRAINING RESULTS ANALYSIS")
print(section_rule)

# One row per model, one column per recorded metric.
results_df = pd.DataFrame(training_results).T
print("\nModel Performance Comparison:")
print(results_df)

# The winner is chosen on the test-set AUC; the OOT figures are only reported.
best_model_info = mt.select_best_model(model_artifacts, selection_metric='auc_test')

if not best_model_info:
    print("\nNo valid models found for selection")
else:
    best_model_type, best_model_artifact = best_model_info
    print(f"\nBest model selected: {best_model_type}")
    print(f"Test AUC: {best_model_artifact['results']['auc_test']:.4f}")
    print(f"Test Gini: {best_model_artifact['results']['gini_test']:.3f}")
    print(f"OOT AUC (final validation): {best_model_artifact['results']['auc_oot']:.4f}")
    print(f"OOT Gini (final validation): {best_model_artifact['results']['gini_oot']:.3f}")

# A ranking is only meaningful with at least two trained models.
if len(model_artifacts) > 1:
    print("\nModel Ranking (by Test AUC):")
    # Rows without a test AUC (failed models) are left out of the ranking.
    ranked_results = results_df.dropna(subset=['auc_test']).sort_values('auc_test', ascending=False)
    for rank, (model_name, metric_row) in enumerate(ranked_results.iterrows(), start=1):
        print(f"{rank}. {model_name}: Test AUC {metric_row['auc_test']:.4f}, OOT AUC {metric_row['auc_oot']:.4f}")



TRAINING RESULTS ANALYSIS

Model Performance Comparison:
                     auc_train  auc_test   auc_oot  gini_train  gini_test  \
logistic_regression   0.811373  0.806048  0.787810    0.622747   0.612096   
random_forest         0.986768  0.880244  0.851949    0.973536   0.760488   
xgboost               0.968645  0.892742  0.884527    0.937290   0.785484   

                     gini_oot  training_time  cv_score  
logistic_regression  0.575621       6.058401  0.804054  
random_forest        0.703898      26.094765  0.866796  
xgboost              0.769055       9.578844  0.888354  

Best model selected: xgboost
Selection metric (auc_test): 0.8927

Best model selected: xgboost
Test AUC: 0.8927
Test Gini: 0.785
OOT AUC (final validation): 0.8845
OOT Gini (final validation): 0.769

Model Ranking (by Test AUC):
1. xgboost: Test AUC 0.8927, OOT AUC 0.8845
2. random_forest: Test AUC 0.8802, OOT AUC 0.8519
3. logistic_regression: Test AUC 0.8060, OOT AUC 0.7878


## Save Model Artifacts

In [11]:
print("Saving model artifacts to model bank...")

# `saved_paths` is iterated below as a mapping of model type -> saved path.
saved_paths = mt.save_model_artifacts(model_artifacts, config)

print("Model artifacts saved successfully:")
for saved_model, artifact_path in saved_paths.items():
    print("  {}: {}".format(saved_model, artifact_path))


Saving model artifacts to model bank...
✓ logistic_regression saved to: models/credit_model_logistic_regression_2024_09_01.pkl
✓ logistic_regression preprocessor saved to: models/credit_model_logistic_regression_2024_09_01_preprocessor.pkl
✓ random_forest saved to: models/credit_model_random_forest_2024_09_01.pkl
✓ random_forest preprocessor saved to: models/credit_model_random_forest_2024_09_01_preprocessor.pkl
✓ xgboost saved to: models/credit_model_xgboost_2024_09_01.pkl
✓ xgboost preprocessor saved to: models/credit_model_xgboost_2024_09_01_preprocessor.pkl
Model artifacts saved successfully:
  logistic_regression: models/credit_model_logistic_regression_2024_09_01.pkl
  random_forest: models/credit_model_random_forest_2024_09_01.pkl
  xgboost: models/credit_model_xgboost_2024_09_01.pkl


## Validate Model Artifacts

In [12]:
print("Validating saved model artifacts...")

# NOTE(review): the in-memory `model_artifacts` are passed here, not the
# files written by the save step — confirm this is the intended check.
validation_results = mt.validate_model_artifacts(model_artifacts, datasets)

print("Model validation results:")
for validated_model, outcome in validation_results.items():
    # Handle the failure case first; a missing 'error' entry is reported
    # as 'Unknown error'.
    if not outcome.get('validation_successful'):
        failure_reason = outcome.get('error', 'Unknown error')
        print(f"✗ {validated_model}: Validation failed - {failure_reason}")
        continue
    print(f"✓ {validated_model}: Validation successful - AUC: {outcome['auc_score']:.4f}")


Validating saved model artifacts...

[VALIDATION] Testing logistic_regression inference...
[VALIDATION] ✓ logistic_regression validation successful - AUC: 0.8060

[VALIDATION] Testing random_forest inference...
[VALIDATION] ✓ random_forest validation successful - AUC: 0.8802

[VALIDATION] Testing xgboost inference...
[VALIDATION] ✓ xgboost validation successful - AUC: 0.8927
Model validation results:
✓ logistic_regression: Validation successful - AUC: 0.8060
✓ random_forest: Validation successful - AUC: 0.8802
✓ xgboost: Validation successful - AUC: 0.8927


## Training Session Summary

In [13]:
# Final report for the whole run: configuration, per-model outcomes, the
# selected model, dataset sizes and model-bank status. Also stops Spark.
print("\n" + "="*80)
print("TRAINING SESSION COMPLETE")
print("="*80)

# config['models_to_train'] is overwritten by the preprocessing cell with the
# models that survived preprocessing, so it is not a reliable record of what
# was requested. Prefer an explicit 'models_requested' entry when the config
# carries one; otherwise fall back to 'models_to_train' as before.
requested_models = list(config.get('models_requested', config['models_to_train']))

print("Training Configuration:")
print(f"  Training Date: {config['model_train_date_str']}")
print(f"  Training Period: {config['train_test_start_date'].date()} to {config['train_test_end_date'].date()}")
print(f"  OOT Period: {config['oot_start_date'].date()} to {config['oot_end_date'].date()}")
print(f"  Models Requested: {', '.join(requested_models)}")

# A model counts as successful only if training produced an artifact; every
# other requested model is reported as failed, whichever stage dropped it.
successful_models = list(model_artifacts.keys())
failed_models = [m for m in requested_models if m not in model_artifacts]

print("\nTraining Outcomes:")
print(f"  Models Trained Successfully: {len(successful_models)}/{len(requested_models)}")
if successful_models:
    print(f"  Successful: {', '.join(successful_models)}")
if failed_models:
    print(f"  Failed: {', '.join(failed_models)}")

# best_model_info is falsy when model selection found no valid model.
if best_model_info:
    best_model_type, best_model_artifact = best_model_info
    print("\nBest Model Summary:")
    print(f"  Model Type: {best_model_type}")
    print(f"  Model Version: {best_model_artifact['model_version']}")
    print(f"  Test AUC: {best_model_artifact['results']['auc_test']:.4f}")
    print(f"  OOT AUC: {best_model_artifact['results']['auc_oot']:.4f}")
    print(f"  Training Time: {best_model_artifact['training_metadata']['training_time_seconds']:.1f}s")
    print(f"  Feature Count: {best_model_artifact['data_stats']['feature_count']}")

print("\nData Summary:")
print(f"  Total Records: {len(data_pdf):,}")
print(f"  Train Samples: {datasets['X_train'].shape[0]:,} ({datasets['y_train'].mean():.3f} default rate)")
print(f"  Test Samples: {datasets['X_test'].shape[0]:,} ({datasets['y_test'].mean():.3f} default rate)")
print(f"  OOT Samples: {datasets['X_oot'].shape[0]:,} ({datasets['y_oot'].mean():.3f} default rate)")

print("\nModel Bank:")
print(f"  Location: {os.path.abspath(config['model_bank_directory'])}")
print(f"  Artifacts Saved: {len(saved_paths)}")

successful_validations = len([r for r in validation_results.values() if r.get('validation_successful')])
print(f"  Validation Success Rate: {successful_validations}/{len(validation_results)}")

# Release the Spark session now that nothing else in the notebook needs it.
# At cell top level locals() is the notebook namespace, so this guards
# against the session never having been created.
if 'spark' in locals():
    spark.stop()
    print("\nSpark session terminated successfully")

print("\n" + "="*80)
print("TRAINING PIPELINE EXECUTION COMPLETE")
print("="*80)


TRAINING SESSION COMPLETE
Training Configuration:
  Training Date: 2024-09-01
  Training Period: 2023-07-01 to 2024-06-30
  OOT Period: 2024-07-01 to 2024-08-31
  Models Requested: logistic_regression, random_forest, xgboost

Training Outcomes:
  Models Trained Successfully: 3/3
  Successful: logistic_regression, random_forest, xgboost

Best Model Summary:
  Model Type: xgboost
  Model Version: credit_model_xgboost_2024_09_01
  Test AUC: 0.8927
  OOT AUC: 0.8845
  Training Time: 9.6s
  Feature Count: 135

Data Summary:
  Total Records: 6,961
  Train Samples: 4,766 (0.281 default rate)
  Test Samples: 1,192 (0.293 default rate)
  OOT Samples: 1,003 (0.290 default rate)

Model Bank:
  Location: /opt/airflow/models
  Artifacts Saved: 3
  Validation Success Rate: 3/3

Spark session terminated successfully

TRAINING PIPELINE EXECUTION COMPLETE
