In [8]:
import os
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint


from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
classification_report, confusion_matrix, roc_curve, auc
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


# Optional boosters — the code falls back gracefully if not installed
try:
    import xgboost as xgb
    has_xgb = True
except Exception:
    has_xgb = False



try:
    import lightgbm as lgb
    has_lgb = True
except Exception:
    has_lgb = False



import joblib
import matplotlib.pyplot as plt


RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [23]:
import os

matches = []
for root, dirs, files in os.walk("C:\\", topdown=True):
    for name in files:
        if name.lower() == "loan.csv":  # exact name
            matches.append(os.path.join(root, name))

matches[:10]  # show first 10


['C:\\Credit-risk-scoring\\data\\raw\\loan.csv']

In [24]:
# --------------------------
# Cell 2: Load dataset
# --------------------------

DATA_PATH = Path(r"C:\Credit-risk-scoring\data\raw\loan.csv")

if not DATA_PATH.exists():
    raise FileNotFoundError("CSV not found — check path")

df = pd.read_csv(DATA_PATH)
print("Loaded dataset with shape:", df.shape)

display(df.head())

print("\nColumns:", df.columns.tolist())


Loaded dataset with shape: (2260668, 145)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,



Columns: ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m

In [25]:
# Map loan_status to binary:
bad_status = [
    'Charged Off', 
    'Default',
    'Late (31-120 days)', 
    'Late (16-30 days)',
    'In Grace Period'
]

df['target'] = df['loan_status'].apply(lambda x: 1 if x in bad_status else 0)
df['target'].value_counts()


target
0    1964396
1     296272
Name: count, dtype: int64

In [26]:
df = df.drop(columns=['loan_status'])


In [27]:
TARGET_COL = 'target'


In [29]:
for col in df.columns:
    print(col)


id
member_id
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
verification_status
issue_d
pymnt_plan
url
desc
purpose
title
zip_code
addr_state
dti
delinq_2yrs
earliest_cr_line
inq_last_6mths
mths_since_last_delinq
mths_since_last_record
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
total_rec_int
total_rec_late_fee
recoveries
collection_recovery_fee
last_pymnt_d
last_pymnt_amnt
next_pymnt_d
last_credit_pull_d
collections_12_mths_ex_med
mths_since_last_major_derog
policy_code
application_type
annual_inc_joint
dti_joint
verification_status_joint
acc_now_delinq
tot_coll_amt
tot_cur_bal
open_acc_6m
open_act_il
open_il_12m
open_il_24m
mths_since_rcnt_il
total_bal_il
il_util
open_rv_12m
open_rv_24m
max_bal_bc
all_util
total_rev_hi_lim
inq_fi
total_cu_tl
inq_last_12m
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
bc_util
charg

In [31]:
# --------------------------
# Cell 3: Train / Test split & feature typing
# --------------------------

# Make sure 'df' is already loaded from Cell 2
TARGET_COL = 'target'

# Sanity checks
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found in dataframe. Available columns: {df.columns.tolist()}")

# Separate X / y
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Print target distribution
print("Target distribution (counts):")
print(y.value_counts(dropna=False))
print("\nTarget distribution (percent):")
print(y.value_counts(normalize=True) * 100)

# Train / Test split (stratified)
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")

# Feature typing: numeric vs categorical (basic automatic detection)
import numpy as np
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

print(f"\nDetected numeric features: {len(numeric_features)}")
print(f"Detected categorical features: {len(categorical_features)}")

# Show top 20 of each (for quick sanity)
print("\nSample numeric features (up to 20):")
print(numeric_features[:20])

print("\nSample categorical features (up to 20):")
print(categorical_features[:20])

# Quick check: missing value counts for train set (top 20)
print("\nTop 20 columns by missing % in X_train:")
miss_pct = X_train.isna().mean().sort_values(ascending=False) * 100
print(miss_pct.head(20))

# Save a small sample of the training set for quick manual checks (optional)
sample_path = Path("train_sample_head.csv")
pd.concat([X_train.reset_index(drop=True).head(200), y_train.reset_index(drop=True).head(200)], axis=1).to_csv(sample_path, index=False)
print(f"\nSaved a small head sample to: {sample_path} (open to inspect)")

# Basic guidance printed for next cell
print("\nNext: we'll build a ColumnTransformer to impute & scale numeric features and impute/encode categorical features.")


Target distribution (counts):
target
0    1964396
1     296272
Name: count, dtype: int64

Target distribution (percent):
target
0    86.894493
1    13.105507
Name: proportion, dtype: float64

Train shape: (1808534, 144), Test shape: (452134, 144)

Detected numeric features: 109
Detected categorical features: 35

Sample numeric features (up to 20):
['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'url', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp']

Sample categorical features (up to 20):
['term', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'application_type']

Top 20 columns by missing % in

In [32]:
# --------------------------
# Cell 4: Preprocessing Pipeline
# --------------------------

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# -------------------------------------
# 1. Drop unusable / 100% missing columns
# -------------------------------------

missing_pct = X_train.isna().mean() * 100

# Drop columns with >95% missing values or that are obviously non-features
cols_to_drop = list(missing_pct[missing_pct > 95].index)

# Add non-predictive columns
cols_to_drop += [
    "id", "member_id", "url", "desc", "title"  # text/ID fields
]

# Keep unique
cols_to_drop = list(set(cols_to_drop))

print(f"Columns being dropped ({len(cols_to_drop)}):")
print(cols_to_drop[:30])

X_train_clean = X_train.drop(columns=cols_to_drop, errors="ignore")
X_test_clean = X_test.drop(columns=cols_to_drop, errors="ignore")

print("\nCleaned train shape:", X_train_clean.shape)
print("Cleaned test shape:", X_test_clean.shape)

# -------------------------------------
# 2. Identify numeric & categorical columns
# -------------------------------------

numeric_features = X_train_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X_train_clean.columns if c not in numeric_features]

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# -------------------------------------
# 3. Define transformers
# -------------------------------------

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

print("\nPreprocessing pipeline created successfully.")
print("Ready for model training in Cell 5.")


Columns being dropped (36):
['hardship_last_payment_amount', 'hardship_type', 'sec_app_collections_12_mths_ex_med', 'orig_projected_additional_accrued_interest', 'hardship_start_date', 'sec_app_inq_last_6mths', 'hardship_dpd', 'sec_app_mort_acc', 'title', 'hardship_end_date', 'settlement_percentage', 'desc', 'hardship_status', 'settlement_amount', 'member_id', 'hardship_reason', 'debt_settlement_flag_date', 'sec_app_revol_util', 'settlement_term', 'hardship_length', 'sec_app_chargeoff_within_12_mths', 'revol_bal_joint', 'sec_app_num_rev_accts', 'url', 'hardship_loan_status', 'sec_app_open_act_il', 'sec_app_mths_since_last_major_derog', 'hardship_payoff_balance_amount', 'sec_app_earliest_cr_line', 'settlement_date']

Cleaned train shape: (1808534, 108)
Cleaned test shape: (452134, 108)

Numeric features: 86
Categorical features: 22

Preprocessing pipeline created successfully.
Ready for model training in Cell 5.
