# Python for Data Science in Credit Risk Modeling - Core Framework

This notebook outlines the key components for implementing credit risk models in commercial banking using Python.

## 1. Data Preparation

### Data Acquisition
- **Key Libraries**: pandas, sqlalchemy, requests
- **Banking Applications**: Loan portfolios, credit bureau data, economic indicators
- **Key Skills**: Database connectivity, API integration, file processing

In [None]:
# Data Acquisition Example
import pandas as pd
import sqlalchemy as sa
import requests

# Database connection example
# engine = sa.create_engine('your_database_connection_string')
# loan_data = pd.read_sql('SELECT * FROM loan_portfolio', engine)

# File processing example
# loan_data = pd.read_csv('loan_portfolio.csv')

print("Data acquisition libraries loaded")

### Data Cleaning
- **Key Libraries**: pandas, numpy, scipy
- **Banking Applications**: Missing values, outliers, data quality
- **Key Skills**: Imputation, outlier detection, data validation

In [None]:
# Data Cleaning Example
import numpy as np
from scipy import stats

# Missing value handling
# loan_data['income'] = loan_data['income'].fillna(loan_data['income'].median())

# Outlier detection using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """
    Flag outliers in one column using the interquartile-range (IQR) rule.

    A value is flagged when it lies strictly below ``Q1 - multiplier * IQR``
    or strictly above ``Q3 + multiplier * IQR``, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``data[column]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the column to inspect.
    column : label
        Name of the numeric column to test.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule; larger values flag fewer points.

    Returns
    -------
    pandas.Series of bool
        Mask aligned with ``data``; True marks an outlier. Missing values
        compare False against both fences and are therefore never flagged.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence = multiplier * (q3 - q1)

    return (values < q1 - fence) | (values > q3 + fence)

print("Data cleaning functions defined")

### Feature Engineering
- **Key Libraries**: pandas, sklearn.preprocessing, category_encoders
- **Banking Applications**: Payment patterns, behavioral variables, WOE/IV
- **Key Skills**: Time-series features, encoding, transformation

In [None]:
# Feature Engineering Example
from sklearn.preprocessing import StandardScaler, LabelEncoder
import category_encoders as ce

# Weight of Evidence calculation
def calculate_woe_iv(data, feature, target, zero_adjust=0.5):
    """
    Calculate Weight of Evidence (WoE) and Information Value (IV).

    Rows are grouped by ``feature``. Within each group the sum of ``target``
    is taken as the number of "bad" observations and the remainder of the
    non-null count as "good" (this assumes ``target`` is coded 0/1 with
    1 = bad). For each group::

        woe = ln(good_rate / bad_rate)
        iv  = (good_rate - bad_rate) * woe

    Parameters
    ----------
    data : pandas.DataFrame
        Input table containing ``feature`` and ``target``.
    feature : label
        Column whose distinct values define the bins.
    target : label
        Column holding the outcome flag.
    zero_adjust : float, default 0.5
        Pseudo-count substituted for a good or bad count of exactly zero
        before rates are computed, so that WoE stays finite for bins
        containing only one outcome. Bins with both outcomes present are
        not adjusted. Pass 0 to disable the correction.

    Returns
    -------
    tuple of (pandas.DataFrame, float)
        The per-bin table with columns ``feature``, 'total', 'bad', 'good',
        'good_rate', 'bad_rate', 'woe', 'iv', and the total IV (sum of the
        'iv' column). For adjusted bins, 'good_rate' / 'bad_rate' hold the
        adjusted rates so that the 'iv' column stays consistent with them.

    Raises
    ------
    ValueError
        If the data contains no good or no bad observations at all, in
        which case the rates (and therefore WoE) are undefined.
    """
    df = data.groupby(feature).agg({target: ['count', 'sum']}).reset_index()
    df.columns = [feature, 'total', 'bad']
    df['good'] = df['total'] - df['bad']

    total_good = df['good'].sum()
    total_bad = df['bad'].sum()
    if total_good <= 0 or total_bad <= 0:
        raise ValueError(
            "WoE/IV is undefined: target must contain both good and bad observations"
        )

    # Substitute the pseudo-count only where a cell is empty; casting to float
    # first keeps the substitution from triggering an integer dtype upcast.
    good_counts = df['good'].astype(float)
    bad_counts = df['bad'].astype(float)
    good_counts = good_counts.mask(good_counts == 0, zero_adjust)
    bad_counts = bad_counts.mask(bad_counts == 0, zero_adjust)

    df['good_rate'] = good_counts / total_good
    df['bad_rate'] = bad_counts / total_bad
    df['woe'] = np.log(df['good_rate'] / df['bad_rate'])
    df['iv'] = (df['good_rate'] - df['bad_rate']) * df['woe']

    return df, df['iv'].sum()

print("Feature engineering functions defined")

## 2. Analysis

### Exploratory Analysis
- **Key Libraries**: pandas, matplotlib, seaborn
- **Banking Applications**: Portfolio segmentation, risk patterns, correlation analysis
- **Key Skills**: Statistical analysis, trend identification, data storytelling

In [None]:
# Exploratory Analysis Example
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Example EDA functions
def plot_default_rates_by_segment(data, segment_col, target_col):
    """
    Draw a bar chart of the mean of ``target_col`` for each ``segment_col`` value.

    The chart is rendered on a new 10x6 figure and shown immediately;
    nothing is returned.
    """
    # Mean of the target per segment (the default rate if target is a 0/1 flag)
    rates_by_segment = data.groupby(segment_col)[target_col].mean()

    _, ax = plt.subplots(figsize=(10, 6))
    rates_by_segment.plot(kind='bar', ax=ax)

    ax.set_title(f'Default Rates by {segment_col}')
    ax.set_ylabel('Default Rate')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

def correlation_heatmap(data, features):
    """
    Show an annotated heatmap of the pairwise correlations of ``features``.

    The correlation matrix comes from ``data[features].corr()`` and is drawn
    on a new 12x8 figure with a diverging palette centred at zero. The figure
    is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(12, 8))

    corr_matrix = data[features].corr()
    heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'center': 0}
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title('Risk Factor Correlation Matrix')
    plt.tight_layout()
    plt.show()

print("EDA functions defined")

### Credit Modeling
- **Key Libraries**: sklearn, statsmodels, xgboost
- **Banking Applications**: PD models, credit scoring, risk classification
- **Key Skills**: Logistic regression, ensemble methods, hyperparameter tuning

In [None]:
# Credit Modeling Example
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Logistic Regression for PD modeling
def build_logistic_regression_model(X, y, test_size=0.3, random_state=42, stratify=False):
    """
    Build and train a scaled logistic regression model for PD.

    The data is split into train and test partitions, then a pipeline of
    ``StandardScaler`` followed by ``LogisticRegression`` is fitted on the
    training partition only, so the scaler never sees test rows.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target labels.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; size of the hold-out partition.
    random_state : int or None, default 42
        Seed used for both the split and the logistic regression.
    stratify : bool, default False
        When True, the split is stratified on ``y`` so both partitions keep
        the same class proportions. This is useful when defaults are rare.
        The default keeps the original unstratified split.

    Returns
    -------
    tuple
        ``(pipeline, X_test, y_test)``: the fitted pipeline and the hold-out
        partition for later evaluation.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Create pipeline with scaling and logistic regression
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic', LogisticRegression(random_state=random_state))
    ])

    # Fit model
    pipeline.fit(X_train, y_train)

    return pipeline, X_test, y_test

# XGBoost model
def build_xgboost_model(X, y, test_size=0.3, random_state=42, stratify=False):
    """
    Build and train an XGBoost classifier for credit scoring.

    The data is split into train and test partitions and an
    ``xgb.XGBClassifier`` with a binary-logistic objective and AUC as its
    evaluation metric is fitted on the training partition.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target labels.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; size of the hold-out partition.
    random_state : int or None, default 42
        Seed used for both the split and the classifier.
    stratify : bool, default False
        When True, the split is stratified on ``y`` so both partitions keep
        the same class proportions. The default keeps the original
        unstratified split.

    Returns
    -------
    tuple
        ``(model, X_test, y_test)``: the fitted classifier and the hold-out
        partition for later evaluation.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        random_state=random_state,
        eval_metric='auc'
    )

    model.fit(X_train, y_train)

    return model, X_test, y_test

print("Credit modeling functions defined")

### Validation & Reporting
- **Key Libraries**: sklearn.metrics, plotly, dash
- **Banking Applications**: Model performance, regulatory reports, executive dashboards
- **Key Skills**: ROC/AUC, backtesting, interactive visualization

In [None]:
# Model Validation Example
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.metrics import confusion_matrix, precision_recall_curve
import plotly.graph_objects as go
import plotly.express as px

def evaluate_credit_model(model, X_test, y_test):
    """
    Score a fitted classifier on hold-out data and report AUC and Gini.

    ``model`` must expose ``predict`` and ``predict_proba``. Column 1 of the
    ``predict_proba`` output is used as the score (presumably the
    positive/default class; confirm against the model's class ordering).
    Both metrics are printed to four decimals.

    Returns a dict with keys 'auc', 'gini', 'predictions' and
    'probabilities'.
    """
    hard_labels = model.predict(X_test)
    scores = model.predict_proba(X_test)[:, 1]

    # Gini is a linear rescaling of AUC: 2 * AUC - 1
    auc_score = roc_auc_score(y_test, scores)
    gini = 2 * auc_score - 1

    print(f"AUC Score: {auc_score:.4f}")
    print(f"Gini Coefficient: {gini:.4f}")

    return dict(
        auc=auc_score,
        gini=gini,
        predictions=hard_labels,
        probabilities=scores,
    )

def plot_roc_curve(y_test, y_pred_proba):
    """
    Build a Plotly figure of the ROC curve with a diagonal reference line.

    The model trace is labelled with the AUC to three decimals. The figure
    is returned, not shown.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred_proba)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    model_trace = go.Scatter(
        x=false_pos_rate,
        y=true_pos_rate,
        name=f'ROC Curve (AUC = {auc_score:.3f})',
    )
    # Diagonal from (0, 0) to (1, 1) as the reference line
    reference_trace = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random Classifier',
    )

    figure = go.Figure(data=[model_trace, reference_trace])
    figure.update_layout(
        title='ROC Curve - Credit Risk Model',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
    )
    return figure

def ks_statistic(y_test, y_pred_proba):
    """
    Return the Kolmogorov-Smirnov statistic for a set of scores.

    Computed as the largest gap between the true-positive rate and the
    false-positive rate over the thresholds returned by ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_pred_proba)
    separation = true_pos_rate - false_pos_rate
    return separation.max()

print("Model validation functions defined")

## Example Workflow

In [None]:
# Example workflow implementation
def credit_risk_modeling_workflow():
    """
    Print the seven-step outline of the credit risk modeling workflow.

    Writes a heading followed by one line per step to standard output and
    returns nothing.
    """
    outline = (
        "Credit Risk Modeling Workflow:",
        "1. Data Acquisition - Load loan portfolio data",
        "2. Data Cleaning - Handle missing values and outliers",
        "3. Feature Engineering - Create behavioral variables and WOE",
        "4. Exploratory Analysis - Understand risk patterns",
        "5. Model Building - Train logistic regression/XGBoost",
        "6. Model Validation - Evaluate performance with AUC, Gini, KS",
        "7. Reporting - Create dashboards and regulatory reports",
    )
    for line in outline:
        print(line)

credit_risk_modeling_workflow()

## Next Steps

1. **Load your actual credit data** into this framework
2. **Customize the functions** based on your specific banking requirements
3. **Implement regulatory requirements** (Basel III, IFRS 9)
4. **Set up model monitoring** and challenger model processes
5. **Create production pipelines** for real-time scoring

This framework provides the foundation for implementing comprehensive credit risk models in a commercial banking environment.