# Credit Risk Modeling
This notebook explores credit risk data and trains a machine learning model for credit risk assessment.

## Configuration and Imports
Setting up necessary libraries and configurations for the analysis.

In [None]:
# Standard data manipulation and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning components
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_curve

# Custom modules
from src.model_training import (
    load_data,
    build_full_pipeline,
    train_model,
    evaluate_model,
    save_pipeline
)

# Configure display options: show every DataFrame column instead of
# truncating wide frames.
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the plain 'seaborn' alias, so use whichever
# name the installed Matplotlib actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

## Data Loading
Loading the credit risk dataset from the data directory.

In [None]:
# Read the generated credit data through the project loader and report its shape
data = load_data('data/generated_credit_data.csv')
print(f"Dataset shape: {data.shape!s}")

## Exploratory Data Analysis (EDA)
Analyzing the structure and characteristics of our dataset.

In [None]:
# Basic data exploration
print("Data Info:")
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\nDescriptive Statistics:")
print(data.describe())

# Check missing values: the boolean isnull() mask is drawn as a heatmap, so
# every missing entry shows up as a contrasting cell in its column.
plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull(), yticklabels=False, cbar=True, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

In [None]:
# Target variable distribution: relative frequency of each value in the
# 'default' column, drawn as a bar chart.
plt.figure(figsize=(8, 6))
data['default'].value_counts(normalize=True).plot(kind='bar')
plt.title('Target Variable Distribution')
plt.xlabel('Default Status')
plt.ylabel('Proportion')
plt.show()

# Correlation matrix
# Select numeric/boolean columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises if any are present.
# On a purely numeric frame this selection is a no-op.
numeric_data = data.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

## Preprocessing and Feature Engineering
Preparing the data for model training by splitting into train/test sets and applying the preprocessing pipeline.

In [None]:
# Split the data into features and the 'default' target column
X = data.drop('default', axis=1)
y = data['default']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits, so test-set precision/recall are
# not skewed by an unlucky draw; random_state pins the shuffle for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build and fit the pipeline on the training rows only, then apply the fitted
# transformation to the test rows so no test-set information leaks into the fit.
# NOTE(review): the transformed matrices are only used for the shape report
# below; training and evaluation later receive the raw X_train / X_test --
# confirm that train_model applies its own preprocessing.
pipeline = build_full_pipeline()
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print("Transformed training data shape:", X_train_transformed.shape)
print("Transformed test data shape:", X_test_transformed.shape)

## Model Training
Training the model using GridSearchCV for hyperparameter tuning.

In [None]:
# Fit the model on the training split; train_model hands back the fitted model
# together with the selected parameters and their validation score.
# NOTE(review): the raw X_train is passed here, not X_train_transformed --
# presumably train_model preprocesses internally; confirm.
model, best_params, best_score = train_model(X_train, y_train)

print(f"Best Parameters: {best_params!s}")
print(f"Best Validation ROC AUC Score: {best_score!s}")

## Model Evaluation
Evaluating the model's performance on the test set.

In [None]:
# Score the trained model on the held-out split. The returned mapping is read
# below under the keys 'roc_auc', 'precision', 'recall', 'confusion_matrix',
# 'fpr' and 'tpr'.
metrics = evaluate_model(model, X_test, y_test)

print("Test Set Metrics:")
for metric_label, metric_key in (
    ("ROC AUC", 'roc_auc'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
):
    print(f"{metric_label}: {metrics[metric_key]:.3f}")

# Confusion matrix, annotated with integer counts (fmt='d')
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(metrics['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# ROC curve, with a dashed black diagonal from (0, 0) to (1, 1) as a visual
# reference line
plt.figure(figsize=(8, 6))
roc_ax = plt.gca()
roc_ax.plot(metrics['fpr'], metrics['tpr'])
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
plt.show()

## Pipeline Saving
Saving the trained pipeline for future use.

In [None]:
# Hand the fitted preprocessing pipeline to save_pipeline with its target path.
# NOTE(review): this stores `pipeline`, not the `model` returned by
# train_model -- confirm the saved artifact holds everything needed to score
# new data.
pipeline_path = 'models/credit_risk_pipeline.pkl'
save_pipeline(pipeline, pipeline_path)
print("Pipeline saved successfully!")