# Model Training and Validation

## 1. Data Splitting Strategy

We use a three-way, chronological split (rows ordered by transaction time, oldest first) for robust validation:

- **Training (70%)**: Used for model fitting
- **Validation (15%)**: Used for hyperparameter tuning
- **Test (15%)**: Used for final evaluation only

In [None]:


# %% [markdown]
"""
# Credit Risk Model Training & Validation
## Step-by-Step Implementation
"""

# %% [markdown]
"""
## 1. Setup Environment
"""

# %%
# Install required packages
!pip install mlflow scikit-learn xgboost pandas numpy matplotlib seaborn joblib

# Import libraries
import os
import mlflow
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, roc_curve)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Fix NumPy's global RNG so runs are repeatable
np.random.seed(42)

# Point MLflow at the local SQLite store, then select the experiment
# (the tracking URI has to be configured before the experiment is resolved)
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Credit_Risk_Model")

# %% [markdown]
"""
## 2. Load Processed Data
"""

# %%
# Location of the output produced by the processing step
data_path = "../data/processed/processed_data.csv"

# Fail early, with an actionable message, when the processing step was skipped
if not os.path.exists(data_path):
    raise FileNotFoundError("Processed data not found. Run data_processing.py first")

data = pd.read_csv(data_path)

# Quick overview: shape first, then column names and the target balance
print(f"Data shape: {data.shape}")
for heading, payload in (
    ("\nData columns:", data.columns.tolist()),
    ("\nClass distribution:", data["is_high_risk"].value_counts(normalize=True)),
):
    print(heading)
    print(payload)

# %% [markdown]
"""
## 3. Time-Based Data Splitting
"""

# %%
def time_based_split(data, time_col='TransactionStartTime', train_frac=0.7, val_frac=0.15):
    """Split ``data`` chronologically into train / validation / test frames.

    Rows are ordered by ``time_col`` interpreted as timestamps (not as raw
    strings), then cut into three contiguous slices: the oldest rows become the
    training set, the next slice the validation set and the remainder the test
    set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``time_col``. It is not modified.
    time_col : str, default 'TransactionStartTime'
        Column holding the event time. Values only need to be parseable by
        ``pd.to_datetime``; the column itself is returned unchanged.
    train_frac : float, default 0.7
        Fraction of rows assigned to the training slice.
    val_frac : float, default 0.15
        Fraction of rows assigned to the validation slice. Whatever is left
        after train and validation goes to the test slice.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` sharing one fresh 0..n-1 index, so the index of
        ``val`` continues where ``train`` ends, and ``test`` where ``val`` ends.

    Raises
    ------
    ValueError
        If the fractions are out of range or sum to more than 1.
    KeyError
        If ``time_col`` is not a column of ``data``.
    """
    if not (0 < train_frac <= 1) or not (0 <= val_frac < 1) or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac must be in (0, 1], val_frac in [0, 1) and their sum at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # Sort on parsed timestamps: the column is usually plain text after
    # read_csv, and lexicographic order is only chronological for ISO strings.
    # A stable sort keeps rows with identical timestamps in their input order,
    # which makes the split reproducible.
    ordered = data.sort_values(
        time_col, key=pd.to_datetime, kind='mergesort'
    ).reset_index(drop=True)

    # Slice boundaries (truncated towards zero; the test slice absorbs rounding)
    n_rows = len(ordered)
    train_end = int(train_frac * n_rows)
    val_end = train_end + int(val_frac * n_rows)

    train = ordered.iloc[:train_end]
    val = ordered.iloc[train_end:val_end]
    test = ordered.iloc[val_end:]

    return train, val, test

# Chronological train / validation / test partition
train, val, test = time_based_split(data)

# Report absolute and relative size of every partition
total_rows = len(data)
named_splits = (("Train", train), ("Validation", val), ("Test", test))
for split_label, split_frame in named_splits:
    print(f"{split_label} size: {len(split_frame)} ({len(split_frame)/total_rows:.1%})")

# Plot transaction time against row index; the three curves should follow
# one another without overlapping on the time axis
plt.figure(figsize=(12, 6))
for split_label, split_frame in named_splits:
    plt.plot(
        pd.to_datetime(split_frame['TransactionStartTime']),
        label=split_label,
        alpha=0.7,
    )
plt.title("Time-Based Data Split")
plt.xlabel("Index")
plt.ylabel("Transaction Time")
plt.legend()
plt.show()

# %% [markdown]
"""
## 4. Prepare Datasets
"""

# %%
# Columns that must not reach the model:
#  - the target itself;
#  - the raw transaction timestamp, which is only used to order the split.
#    Left in X it is picked up by the dtype-based selection below (as a
#    categorical when it is still text) and one-hot encoded into one dense
#    column per distinct timestamp, none of which can match validation/test
#    rows in a chronological split.
# NOTE(review): other identifier-like columns (ids, batch numbers) would have
# the same problem - confirm against the processed schema.
target_col = "is_high_risk"
non_feature_cols = [target_col, "TransactionStartTime"]

# Separate features and target for every partition
X_train, y_train = train.drop(columns=non_feature_cols), train[target_col]
X_val, y_val = val.drop(columns=non_feature_cols), val[target_col]
X_test, y_test = test.drop(columns=non_feature_cols), test[target_col]

# Identify feature types from the training frame only
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# %% [markdown]
"""
## 5. Create Preprocessing Pipeline
"""

# %%
# Numeric columns: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route every column group through its own sub-pipeline
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# %% [markdown]
"""
## 6. Model Training & Hyperparameter Tuning
"""

# %%
# Candidate estimators with their search grids. Grid keys carry the
# "classifier__" prefix because each estimator is tuned as the "classifier"
# step of a Pipeline.
models = {}

models["LogisticRegression"] = dict(
    model=LogisticRegression(solver='liblinear', random_state=42, max_iter=1000),
    params={
        'classifier__C': [0.01, 0.1, 1],
        'classifier__penalty': ['l1', 'l2'],
    },
)

models["RandomForest"] = dict(
    model=RandomForestClassifier(random_state=42),
    params={
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [10, None],
        'classifier__min_samples_split': [5, 10],
    },
)

models["XGBoost"] = dict(
    model=XGBClassifier(random_state=42, eval_metric='logloss'),
    params={
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.01, 0.1],
        'classifier__max_depth': [3, 5],
    },
)

# Expanding-window folds: every fold validates on rows that come after the
# rows it trains on
tscv = TimeSeriesSplit(n_splits=5)

# Filled by the training loop: model name -> fitted pipeline, score, run id
best_models = {}
mlflow.set_experiment("Credit_Risk_Model")

# %%
# Tune every candidate with time-series CV on the training set, score the
# refitted winner on the held-out validation set and record it in MLflow.
for name, config in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}")
    print(f"{'='*50}")

    # nested=True lets this cell run even if another run is still active
    with mlflow.start_run(run_name=f"{name}_Experiment", nested=True) as active_run:
        # Preprocessing lives inside the pipeline so it is re-fitted on each
        # CV training fold and never sees the fold's validation rows
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', config["model"])
        ])

        # Hyperparameter tuning
        grid_search = GridSearchCV(
            pipeline,
            param_grid=config["params"],
            cv=tscv,
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )

        # Train model (GridSearchCV refits the best setting on all of X_train)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Evaluate on validation set
        y_val_pred = best_model.predict(X_val)
        y_val_proba = best_model.predict_proba(X_val)[:, 1]
        val_auc = roc_auc_score(y_val, y_val_proba)

        # Same metric family as the final test evaluation (including F1), plus
        # the CV score that drove the hyperparameter choice. zero_division=0
        # returns the same 0.0 as the default when no positives are predicted,
        # without the undefined-metric warning.
        val_metrics = {
            "cv_roc_auc": grid_search.best_score_,
            "val_roc_auc": val_auc,
            "val_accuracy": accuracy_score(y_val, y_val_pred),
            "val_precision": precision_score(y_val, y_val_pred, zero_division=0),
            "val_recall": recall_score(y_val, y_val_pred, zero_division=0),
            "val_f1": f1_score(y_val, y_val_pred, zero_division=0),
        }

        # Log parameters and metrics
        mlflow.log_params(best_params)
        mlflow.log_metrics(val_metrics)

        # Track best model per algorithm for the selection step
        best_models[name] = {
            "model": best_model,
            "val_auc": val_auc,
            "run_id": active_run.info.run_id
        }

        print(f"Best parameters: {best_params}")
        print(f"Validation AUC: {val_auc:.4f}")

# %% [markdown]
"""
## 7. Model Selection & Final Evaluation
"""

# %%
# Select best model based on validation AUC
if not best_models:
    raise ValueError("No trained models available - run the training cell first")

best_model_name = max(best_models, key=lambda k: best_models[k]["val_auc"])
best_model = best_models[best_model_name]["model"]
best_run_id = best_models[best_model_name]["run_id"]

print(f"\n{'='*50}")
print(f"Best model: {best_model_name} (AUC: {best_models[best_model_name]['val_auc']:.4f})")
print(f"{'='*50}")

# Final evaluation on the test set, which has not been used for fitting or
# model selection anywhere above
with mlflow.start_run(run_name="Final_Evaluation") as run:
    # Link this run back to the training run that produced the chosen model
    mlflow.log_params({
        "selected_model": best_model_name,
        "source_run_id": best_run_id,
    })

    # Predict on test set
    y_test_pred = best_model.predict(X_test)
    y_test_proba = best_model.predict_proba(X_test)[:, 1]

    # Calculate metrics (zero_division=0 yields the default 0.0 without the
    # undefined-metric warning when a class is never predicted)
    metrics = {
        "test_roc_auc": roc_auc_score(y_test, y_test_proba),
        "test_accuracy": accuracy_score(y_test, y_test_pred),
        "test_precision": precision_score(y_test, y_test_pred, zero_division=0),
        "test_recall": recall_score(y_test, y_test_pred, zero_division=0),
        "test_f1": f1_score(y_test, y_test_pred, zero_division=0)
    }

    # Log metrics exactly once; the print loop below only reports them
    mlflow.log_metrics(metrics)

    # Log model and register it under a stable name
    mlflow.sklearn.log_model(
        best_model,
        "model",
        registered_model_name="CreditRiskModel"
    )

    # Print metrics
    print("\nTest Set Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    # Confusion matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Low Risk', 'High Risk'],
                yticklabels=['Low Risk', 'High Risk'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    # Save before show(): the figure may be released once it has been shown
    plt.savefig("confusion_matrix.png")
    plt.show()
    mlflow.log_artifact("confusion_matrix.png")

    # ROC Curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {metrics["test_roc_auc"]:.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.savefig("roc_curve.png")
    plt.show()
    mlflow.log_artifact("roc_curve.png")

# %% [markdown]
"""
## 8. Feature Importance Analysis
"""

# %%
# Extract feature names from the fitted preprocessor itself, so they always
# line up with the columns the classifier was trained on (this also covers an
# empty categorical group or columns dropped by an imputer). ColumnTransformer
# prefixes names with "<transformer>__"; strip that prefix for readability.
feature_names = [
    full_name.split("__", 1)[-1]
    for full_name in best_model.named_steps['preprocessor'].get_feature_names_out()
]

# Plot feature importance (tree-based models only; linear models expose
# coef_ rather than feature_importances_ and are skipped)
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    importances = best_model.named_steps['classifier'].feature_importances_
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False).head(20)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=importance_df)
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.savefig("feature_importance.png")
    plt.show()

    # Save to CSV
    importance_df.to_csv("feature_importances.csv", index=False)

    # The Final_Evaluation run is already closed here. Logging through the
    # fluent API would silently open a new run that is never ended, so attach
    # the files to the final run explicitly via the client API.
    tracking_client = mlflow.tracking.MlflowClient()
    tracking_client.log_artifact(run.info.run_id, "feature_importance.png")
    tracking_client.log_artifact(run.info.run_id, "feature_importances.csv")

# %% [markdown]
"""
## 9. Save Model
"""

# %%
# Save model locally
model_path = "../models/credit_risk_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
print(f"Model saved to {model_path}")

# Record the model path on the Final_Evaluation run. No run is active at this
# point, and a fluent mlflow.log_param call would implicitly start a new run
# that is never ended, so write to the finished run through the client API.
mlflow.tracking.MlflowClient().log_param(run.info.run_id, "model_path", model_path)

# %% [markdown]
"""
## 10. MLflow Tracking
"""

# %%
# Tell the reader how to open the MLflow UI against the same SQLite store
ui_instructions = (
    "\nMLflow Experiment Tracking:",
    "Run the following command to view results:",
    "mlflow ui --backend-store-uri sqlite:///mlflow.db",
    "Then visit: http://localhost:5000",
)
print("\n".join(ui_instructions))