In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [2]:
import os
import sys

def find_project_root(start, marker="Dataset"):
    """Return the project root for a session whose working directory is ``start``.

    The original cell always moved one directory up, so running it a second
    time (or running it from the project root) walked out of the project.
    This helper makes the step idempotent.

    Parameters
    ----------
    start : str
        Directory to resolve from (normally ``os.getcwd()``).
    marker : str, default "Dataset"
        Name of a sub-directory that identifies the project root. The later
        cells read ``Dataset/dataset.csv`` relative to the working directory.

    Returns
    -------
    str
        ``start`` itself (absolute) when it already contains ``marker``;
        otherwise the absolute path of the parent of ``start``, which is what
        the cell did unconditionally before.
    """
    start = os.path.abspath(start)
    if os.path.isdir(os.path.join(start, marker)):
        # Already at the root (e.g. the cell was re-run): do not climb again.
        return start
    return os.path.abspath(os.path.join(start, ".."))


# Absolute path to project root
PROJECT_ROOT = find_project_root(os.getcwd())

# Change working directory to project root
os.chdir(PROJECT_ROOT)

# Add project root to Python path
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print("Current working directory:", os.getcwd())

Current working directory: c:\Users\venut\OneDrive\Desktop\credit_risk_project


In [3]:
# nrows=0 parses only the header row, so this lists the CSV's column names
# without loading any data rows.
header_only = pd.read_csv("Dataset/dataset.csv", nrows=0)
cols = header_only.columns.tolist()

In [4]:
# Columns to load from the CSV. "loan_status" is the raw outcome label that is
# later mapped to the binary target; the remaining entries are model inputs.
required_cols = [
    "loan_status",
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "purpose",
    "dti",
    "delinq_2yrs",
    "fico_range_low",
    "fico_range_high",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "application_type",
]

In [6]:
# Loan statuses with a clear final outcome; rows with any other status are dropped.
resolved_statuses = ["Fully Paid", "Charged Off", "Default"]

chunk_size = 100_000

# Stream the CSV in pieces of `chunk_size` rows, reading only the required columns.
reader = pd.read_csv(
    "Dataset/dataset.csv",
    usecols=required_cols,
    chunksize=chunk_size,
    low_memory=False,
)

# Filter each piece as it is read so unwanted rows are never accumulated.
chunks = [
    part[part["loan_status"].isin(resolved_statuses)]
    for part in reader
]

df = pd.concat(chunks, ignore_index=True)
df.shape


(1345350, 22)

In [7]:
# Binary label: 0 = loan fully repaid, 1 = loan charged off or in default.
status_to_target = {"Fully Paid": 0, "Charged Off": 1, "Default": 1}
df["target"] = df["loan_status"].map(status_to_target)

In [8]:
# Labels, and features with both the label and its raw source column removed
# so neither can leak into the model inputs.
y = df["target"]
X = df.drop(columns=["target", "loan_status"])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Split the feature columns by dtype: int64/float64 columns go to the numeric
# branch of the preprocessor, object (string) columns to the categorical branch.
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object"]

num_cols = X_train.select_dtypes(include=numeric_dtypes).columns
cat_cols = X_train.select_dtypes(include=categorical_dtypes).columns

In [10]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored at transform time,
# and the encoded output is kept sparse.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)


In [11]:
# Baseline: shared preprocessing followed by a logistic regression.
# max_iter is set to 1000 rather than left at the library default.
baseline_steps = [
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
model = Pipeline(steps=baseline_steps)

In [12]:
# Fit the baseline pipeline on the training split.
model.fit(X_train, y_train)

# Positive-class probabilities drive ROC-AUC; hard labels drive the report.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

baseline_auc = roc_auc_score(y_test, y_proba)

print(classification_report(y_test, y_pred))
print("ROC-AUC:", baseline_auc)


              precision    recall  f1-score   support

           0       0.81      0.99      0.89    215350
           1       0.54      0.05      0.09     53720

    accuracy                           0.80    269070
   macro avg       0.68      0.52      0.49    269070
weighted avg       0.75      0.80      0.73    269070

ROC-AUC: 0.7090534861515678


In [13]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Candidate classifiers, all using balanced class weights.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight="balanced", n_jobs=-1
    ),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=6, class_weight="balanced", random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
}

# The same shuffled, seeded 3-fold stratified splitter is used for every candidate.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def mean_cv_roc_auc(classifier):
    """Return the mean cross-validated ROC-AUC of ``classifier`` on the training split.

    The classifier is placed behind the shared ``preprocessor`` in a pipeline
    and scored with ``cv`` on ``X_train`` / ``y_train``.
    """
    candidate = Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("classifier", classifier),
        ]
    )
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
    )
    return fold_scores.mean()


# Model name -> mean ROC-AUC across the folds.
results = {}

for label, estimator in models.items():
    results[label] = mean_cv_roc_auc(estimator)
    print(f"{label} ROC-AUC: {results[label]:.4f}")


Logistic Regression ROC-AUC: 0.7094
Decision Tree ROC-AUC: 0.6996
Random Forest ROC-AUC: 0.7056


In [14]:
# Pipeline handed to the hyper-parameter search: shared preprocessing plus a
# random forest whose size and depth are left for the search to choose.
# NOTE(review): the cross-validation printout recorded in this notebook ranks
# Logistic Regression (0.7094) slightly above Random Forest (0.7056) - confirm
# that tuning the forest is the intended choice.
forest = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

best_model = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", forest),
    ]
)

In [15]:
# Search space for the pipeline's "classifier" step: 2 x 2 x 2 = 8 combinations.
param_dist = dict(
    classifier__n_estimators=[100, 150],
    classifier__max_depth=[6, 8],
    classifier__min_samples_split=[2, 5],
)

random_search = RandomizedSearchCV(
    best_model,
    param_dist,
    # Only 4 of the 8 combinations are sampled, which keeps the run to
    # 4 candidates x 3 folds = 12 fits.
    n_iter=4,
    scoring="roc_auc",
    cv=3,
    random_state=42,
    # The search itself runs sequentially. The forest inside `best_model` is
    # already built with n_jobs=-1 - presumably this avoids nested
    # parallelism; confirm before raising it.
    n_jobs=1,
    verbose=2,
)

random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
print("Best CV ROC-AUC:", random_search.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END classifier__max_depth=6, classifier__min_samples_split=2, classifier__n_estimators=150; total time=  57.4s
[CV] END classifier__max_depth=6, classifier__min_samples_split=2, classifier__n_estimators=150; total time= 1.1min
[CV] END classifier__max_depth=6, classifier__min_samples_split=2, classifier__n_estimators=150; total time= 1.2min
[CV] END classifier__max_depth=8, classifier__min_samples_split=2, classifier__n_estimators=150; total time= 1.8min
[CV] END classifier__max_depth=8, classifier__min_samples_split=2, classifier__n_estimators=150; total time= 1.7min
[CV] END classifier__max_depth=8, classifier__min_samples_split=2, classifier__n_estimators=150; total time= 1.6min
[CV] END classifier__max_depth=6, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  46.1s
[CV] END classifier__max_depth=6, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  46.9s
[CV] END cla

In [16]:
# Best pipeline found by the randomized search.
final_model = random_search.best_estimator_

# Score the held-out test split: probabilities for ROC-AUC, labels for the report.
y_proba = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)

test_auc = roc_auc_score(y_test, y_proba)

print(classification_report(y_test, y_pred))
print("Test ROC-AUC:", test_auc)


              precision    recall  f1-score   support

           0       0.89      0.60      0.71    215350
           1       0.30      0.70      0.42     53720

    accuracy                           0.62    269070
   macro avg       0.59      0.65      0.57    269070
weighted avg       0.77      0.62      0.66    269070

Test ROC-AUC: 0.7048568700003682


In [17]:
# Persist the whole fitted pipeline (preprocessing + classifier) as a single
# artifact; the relative path resolves against the current working directory.
model_artifact = "credit_risk_model.pkl"
joblib.dump(final_model, model_artifact)

['credit_risk_model.pkl']

In [4]:
import pandas as pd
from pathlib import Path

# Project root is taken as the parent of the current working directory
# (the notebook is expected to run from Notebooks/).
PROJECT_ROOT = Path.cwd().parent
DATASET_PATH = PROJECT_ROOT.joinpath("Dataset", "dataset.csv")

# Report where the dataset is expected and whether it is there.
print("Looking for dataset at:", DATASET_PATH)
print("Exists?", DATASET_PATH.exists())

# Load the full CSV and strip the label columns; errors="ignore" tolerates
# either column being absent from the file.
df = pd.read_csv(DATASET_PATH, low_memory=False).drop(
    columns=["loan_status", "target"],
    errors="ignore",
)

df.head()


Looking for dataset at: c:\Users\venut\OneDrive\Desktop\credit_risk_project\Dataset\dataset.csv
Exists? True


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [5]:
import joblib
from pathlib import Path

# Project root is taken as the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Paths
DATASET_PATH = PROJECT_ROOT / "Dataset" / "dataset.csv"
FEATURES_PATH = PROJECT_ROOT / "Models" / "expected_features.joblib"
OUTPUT_PATH = PROJECT_ROOT / "sample_input.csv"

# Load the expected feature names first so the CSV reader can be limited to them.
expected_features = joblib.load(FEATURES_PATH)

# Load data: parse only the columns that are kept. The previous version parsed
# every column of the full CSV and then discarded all but these. The number of
# rows is unchanged, so the seeded sample below selects the same rows.
df = pd.read_csv(
    DATASET_PATH,
    usecols=list(expected_features),
    low_memory=False,
)

print("Number of expected features:", len(expected_features))

# Select the features in their expected order (read_csv returns columns in
# file order, not in `usecols` order) and draw a reproducible 50-row sample.
sample_df = df[expected_features].sample(50, random_state=42)

# Save sample CSV
sample_df.to_csv(OUTPUT_PATH, index=False)

print("✅ Sample input saved at:", OUTPUT_PATH)


Number of expected features: 21
✅ Sample input saved at: c:\Users\venut\OneDrive\Desktop\credit_risk_project\sample_input.csv
