In [4]:
import os
from pathlib import Path
print('CWD:', Path.cwd())
print('Parents:', [str(p) for p in list(Path.cwd().parents)[:4]])
print('Has src in any parent?', any((p/'src').exists() for p in [Path.cwd(), *Path.cwd().parents]))

CWD: d:\10  Academy\week4\assigniment\Credit-Risk-Probability-Model\notebooks
Parents: ['d:\\10  Academy\\week4\\assigniment\\Credit-Risk-Probability-Model', 'd:\\10  Academy\\week4\\assigniment', 'd:\\10  Academy\\week4', 'd:\\10  Academy']
Has src in any parent? True


# Credit Risk Model Training (Xente Transactions)
This notebook loads the raw Xente transaction data, applies the feature-engineering pipeline from `src.data_processing`, and trains/evaluates baseline classifiers (Logistic Regression, Random Forest). It includes ROC curves and optional hyperparameter tuning.

In [5]:
# Imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from src.data_processing import build_preprocessing_pipeline, TARGET_COLUMN

sns.set(style="whitegrid", context="notebook")
np.random.seed(42)

ModuleNotFoundError: No module named 'src'

In [None]:
# Load data
data_path = "../data/raw/data.csv"
df = pd.read_csv(data_path)
print(df.shape)
df.head()

In [None]:
# Quick EDA
print("Missing values (top 20):")
display(df.isnull().sum().sort_values(ascending=False).head(20))

if TARGET_COLUMN in df.columns:
    print("\nClass balance (FraudResult):")
    print(df[TARGET_COLUMN].value_counts(normalize=True))

In [None]:
# Build preprocessing pipeline (Task 3)
preprocessing = build_preprocessing_pipeline(woe_columns=None)  # WoE for all object columns
preprocessing

In [None]:
# Train-test split
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
 )
len(X_train), len(X_test)

In [None]:
# Model 1: Logistic Regression
pipe_lr = Pipeline([
    ("preprocess", preprocessing),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])
pipe_lr.fit(X_train, y_train)
pipe_lr

In [None]:
# Evaluate Logistic Regression
y_pred_lr = pipe_lr.predict(X_test)
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]

print("Logistic Regression:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_lr), 4))
print("Precision:", round(precision_score(y_test, y_pred_lr, zero_division=0), 4))
print("Recall:", round(recall_score(y_test, y_pred_lr, zero_division=0), 4))
print("F1:", round(f1_score(y_test, y_pred_lr), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_proba_lr), 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr, zero_division=0))

RocCurveDisplay.from_predictions(y_test, y_proba_lr)
plt.title("ROC Curve - Logistic Regression")
plt.show()

In [None]:
# Model 2: Random Forest
preprocessing_rf = build_preprocessing_pipeline(woe_columns=None)
pipe_rf = Pipeline([
    ("preprocess", preprocessing_rf),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")),
])
pipe_rf.fit(X_train, y_train)
pipe_rf

In [None]:
# Evaluate Random Forest
y_pred_rf = pipe_rf.predict(X_test)
y_proba_rf = pipe_rf.predict_proba(X_test)[:, 1]

print("Random Forest:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf), 4))
print("Precision:", round(precision_score(y_test, y_pred_rf, zero_division=0), 4))
print("Recall:", round(recall_score(y_test, y_pred_rf, zero_division=0), 4))
print("F1:", round(f1_score(y_test, y_pred_rf), 4))
print("ROC-AUC:", round(roc_auc_score(y_test, y_proba_rf), 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf, zero_division=0))

RocCurveDisplay.from_predictions(y_test, y_proba_rf)
plt.title("ROC Curve - Random Forest")
plt.show()

In [None]:
# Optional: Hyperparameter Tuning (may take time)
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5, 10],
}

base_rf_pipe = Pipeline([
    ("preprocess", build_preprocessing_pipeline(woe_columns=None)),
    ("model", RandomForestClassifier(random_state=42, class_weight="balanced")),
])

grid = GridSearchCV(
    estimator=base_rf_pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=1,
 )

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)

best_rf = grid.best_estimator_
y_pred_gs = best_rf.predict(X_test)
y_proba_gs = best_rf.predict_proba(X_test)[:, 1]
print("Tuned RF ROC-AUC:", round(roc_auc_score(y_test, y_proba_gs), 4))

In [None]:
# Feature Importance (Random Forest)
def plot_feature_importance(fitted_pipeline, top_n=20):
    try:
        model = fitted_pipeline.named_steps["model"]
        preprocess = fitted_pipeline.named_steps["preprocess"]
        encode = preprocess.named_steps.get("encode")
        feat_names = list(encode.get_feature_names_out()) if encode is not None else []
        if hasattr(model, "feature_importances_") and feat_names:
            importances = model.feature_importances_
            imp_df = (
                pd.DataFrame({"feature": feat_names, "importance": importances})
                .sort_values("importance", ascending=False)
                .head(top_n)
            )
            plt.figure(figsize=(8, max(4, int(top_n * 0.4))))
            sns.barplot(data=imp_df, x="importance", y="feature")
            plt.title("Top Feature Importances (Random Forest)")
            plt.tight_layout()
            plt.show()
        else:
            print("Feature names or importances unavailable for this configuration.")
    except Exception as e:
        print("Could not plot feature importance:", e)

# Use tuned model if present; otherwise the baseline RF
plot_feature_importance(grid.best_estimator_ if 'grid' in globals() else pipe_rf, top_n=20)