In [7]:
# ==============================
# Model Training - Modified
# ==============================

import pandas as pd
import numpy as np
import re
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

try:
    from lightgbm import LGBMClassifier
except ImportError:
    raise SystemExit("Please install lightgbm: pip install lightgbm")

# --- CONFIG ---
CSV_PATH = "../data/Lending_Club.csv"   # <- change if needed
N_PER_CLASS = 25_000                    # 50k total (25k good + 25k bad)
CHUNK_SIZE = 50_000                     # small chunks to be memory-safe
RANDOM_STATE = 42
TOP_K_FEATURES = 30

BASE_COLS = {
    "loan_status","loan_amnt","term","int_rate","installment",
    "grade","sub_grade",
    "emp_title","emp_length","home_ownership","annual_inc","verification_status",
    "purpose","zip_code","addr_state",
    "dti","delinq_2yrs","earliest_cr_line","fico_range_low","fico_range_high",
    "inq_last_6mths","mths_since_last_delinq","mths_since_last_record",
    "open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status",
    "out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv",
    "total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries",
    "collection_recovery_fee","last_pymnt_d","last_pymnt_amnt",
    "next_pymnt_d","last_credit_pull_d",
    "last_fico_range_high","last_fico_range_low",
    "application_type"
}


In [8]:
# ==============================
# 1) Helper functions
# ==============================
def status_to_binary(s):
    mapping = {
        "Fully Paid": 1, "Current": 1,
        "Charged Off": 0, "Default": 0,
        "Late (31-120 days)": 0, "Late (16-30 days)": 0,
        "In Grace Period": 0
    }
    return mapping.get(s, np.nan)

def parse_emp_length(val):
    if pd.isna(val):
        return np.nan
    s = str(val).strip().lower()
    if s in {"n/a","na","none"}:
        return np.nan
    if "10+" in s:
        return 10
    if "<" in s:
        return 0
    m = re.search(r"(\d+)", s)
    return float(m.group(1)) if m else np.nan

def parse_term_months(val):
    if pd.isna(val):
        return np.nan
    m = re.search(r"(\d+)", str(val))
    return float(m.group(1)) if m else np.nan

def to_float_pct_ok(val):
    if pd.isna(val):
        return np.nan
    s = str(val).replace("%","").strip()
    try:
        return float(s)
    except:
        return np.nan

def parse_mmmyyyy(val):
    if pd.isna(val):
        return pd.NaT
    try:
        return pd.to_datetime(val, format="%b-%Y", errors="coerce")
    except:
        return pd.NaT

def months_between_vec(later, earlier_series):
    mask = earlier_series.notna()
    months = pd.Series(np.nan, index=earlier_series.index, dtype="float32")
    months[mask] = (later.year - earlier_series[mask].dt.year)*12 + (later.month - earlier_series[mask].dt.month)
    return months

def downcast_numeric(df):
    for c in df.select_dtypes(include=["float64","float32","int64","int32"]).columns:
        if pd.api.types.is_float_dtype(df[c]):
            df[c] = pd.to_numeric(df[c], downcast="float")
        else:
            df[c] = pd.to_numeric(df[c], downcast="integer")
    return df

In [9]:
# ==============================
# 2) Stream-read & build balanced sample
# ==============================
good_rows = []
bad_rows  = []
good_needed = N_PER_CLASS
bad_needed  = N_PER_CLASS

usecols_fn = lambda c: c in BASE_COLS

reader = pd.read_csv(
    CSV_PATH,
    usecols=usecols_fn,
    chunksize=CHUNK_SIZE,
    low_memory=True
)

for i, chunk in enumerate(reader, start=1):
    chunk["loan_status_binary"] = chunk["loan_status"].map(status_to_binary)
    chunk = chunk.dropna(subset=["loan_status_binary"])

    good_chunk = chunk[chunk["loan_status_binary"]==1]
    bad_chunk  = chunk[chunk["loan_status_binary"]==0]

    if good_needed > 0 and len(good_chunk)>0:
        take = min(good_needed, len(good_chunk))
        good_rows.append(good_chunk.sample(n=take, random_state=RANDOM_STATE))
        good_needed -= take

    if bad_needed > 0 and len(bad_chunk)>0:
        take = min(bad_needed, len(bad_chunk))
        bad_rows.append(bad_chunk.sample(n=take, random_state=RANDOM_STATE))
        bad_needed -= take

    print(f"[Chunk {i}] need good:{good_needed} bad:{bad_needed}")
    if good_needed <=0 and bad_needed <=0:
        break

good_df = pd.concat(good_rows, ignore_index=True) if good_rows else pd.DataFrame()
bad_df  = pd.concat(bad_rows, ignore_index=True) if bad_rows else pd.DataFrame()

if good_df.empty or bad_df.empty:
    raise SystemExit("Not enough rows collected. Increase CHUNK_SIZE or reduce N_PER_CLASS.")

data = shuffle(pd.concat([good_df, bad_df], ignore_index=True), random_state=RANDOM_STATE)
print("Balanced sample shape:", data.shape)
print(data["loan_status_binary"].value_counts())

[Chunk 1] need good:0 bad:15588
[Chunk 2] need good:0 bad:6690
[Chunk 3] need good:0 bad:0
Balanced sample shape: (50000, 46)
loan_status_binary
0    25000
1    25000
Name: count, dtype: int64


In [10]:
# ==============================
# 3) Light preprocessing / feature engineering
# ==============================
def have(col): return col in data.columns

if have("emp_length"):
    data["emp_length_years"] = data["emp_length"].map(parse_emp_length)
if have("term"):
    data["term_months"] = data["term"].map(parse_term_months)
if have("int_rate"):
    data["int_rate_num"] = data["int_rate"].map(to_float_pct_ok)
if have("revol_util"):
    data["revol_util_num"] = data["revol_util"].map(to_float_pct_ok)

issue_dt = parse_mmmyyyy("Jan-2016")
if have("issue_d"):
    data["issue_d_dt"] = data["issue_d"].map(parse_mmmyyyy).fillna(issue_dt)
else:
    data["issue_d_dt"] = issue_dt

if have("earliest_cr_line"):
    data["earliest_cr_line_dt"] = data["earliest_cr_line"].map(parse_mmmyyyy)
    data["credit_history_months"] = months_between_vec(issue_dt, data["earliest_cr_line_dt"])

to_drop = [c for c in [
    "loan_status","emp_title","emp_length","term","int_rate","revol_util",
    "issue_d","earliest_cr_line","issue_d_dt","earliest_cr_line_dt"
] if have(c)]
data.drop(columns=to_drop, inplace=True, errors="ignore")

for c in data.columns:
    if c=="loan_status_binary":
        continue
    if data[c].dtype.kind in "biufc":
        data[c] = data[c].astype("float32").fillna(data[c].median())
    else:
        mode_val = data[c].mode(dropna=True)
        mode_val = mode_val.iloc[0] if len(mode_val) else "Unknown"
        data[c] = data[c].fillna(mode_val)

# Encode remaining categorical columns
for c in data.select_dtypes(include=["object","category"]).columns:
    if c=="loan_status_binary": continue
    data[c] = pd.factorize(data[c])[0].astype("int32")

data = downcast_numeric(data)

In [11]:
# ==============================
# 4) Train/Val split
# ==============================
target = "loan_status_binary"
X = data.drop(columns=[target])
y = data[target].astype(int)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)
print("Train shape:", X_train.shape, "| Val shape:", X_val.shape)

# Ensure all columns numeric for LGBM
for c in X_train.columns:
    if X_train[c].dtype == "object" or str(X_train[c].dtype)=="category":
        X_train[c] = pd.factorize(X_train[c])[0].astype("int32")
        X_val[c]   = pd.factorize(X_val[c])[0].astype("int32")

# Drop any remaining datetime columns (already converted to numeric)
for dt_col in ["last_pymnt_d","next_pymnt_d","last_credit_pull_d"]:
    if dt_col in X_train.columns:
        X_train = X_train.drop(columns=[dt_col])
        X_val   = X_val.drop(columns=[dt_col])

print("All columns numeric, ready for LightGBM.")

Train shape: (40000, 43) | Val shape: (10000, 43)
All columns numeric, ready for LightGBM.


In [12]:
# ==============================
# 5) Feature selection via LightGBM importance (pre-loan features only)
# ==============================
# Using all numeric columns in X_train (already ensured)
fs_model = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

fs_model.fit(X_train, y_train)

# Get feature importances and pick top-K
importances = pd.Series(fs_model.feature_importances_, index=X_train.columns)
top_feats = importances.sort_values(ascending=False).head(TOP_K_FEATURES).index.tolist()
print(f"Top {TOP_K_FEATURES} features:")
print(top_feats)

# Reduce to top-K
X_train_k = X_train[top_feats]
X_val_k   = X_val[top_feats]

[LightGBM] [Info] Number of positive: 20000, number of negative: 20000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5414
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Top 30 features:
['total_rec_prncp', 'loan_amnt', 'installment', 'last_pymnt_amnt', 'out_prncp', 'last_fico_range_high', 'total_pymnt', 'total_pymnt_inv', 'credit_history_months', 'total_rec_late_fee', 'total_rec_int', 'zip_code', 'int_rate_num', 'revol_util_num', 'total_acc', 'revol_bal', 'dti', 'out_prncp_inv', 'recoveries', 'annual_inc', 'mths_since_last_delinq', 'addr_state', 'open_acc', 'fico_range_low', 'sub_grade', 'emp_length_years', 'last_fico_range_low', 'inq_last_6mths', 'purpose', 'term_months']


In [13]:
# ==============================
# 6) Final LightGBM model on top-K features
# ==============================
from lightgbm.callback import log_evaluation

model = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model.fit(
    X_train_k, y_train,
    eval_set=[(X_val_k, y_val)],
    eval_metric="auc",
    callbacks=[log_evaluation(50)]   # logs every 50 rounds
)

[LightGBM] [Info] Number of positive: 20000, number of negative: 20000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4952
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[50]	valid_0's auc: 0.996932	valid_0's binary_logloss: 0.174144
[100]	valid_0's auc: 0.997711	valid_0's binary_logloss: 0.0826861
[150]	valid_0's auc: 0.998111	valid_0's binary_logloss: 0.0576212
[200]	valid_0's auc: 0.998343	valid_0's binary_logloss: 0.0494571
[250]	valid_0's auc: 0.998482	valid_0's binary_logloss: 0.0458138
[300]	valid_0's auc: 0.998543	valid_0's binary_logloss: 0.0441616
[350]	valid_0's auc: 0.998571	valid_0's binary_logloss: 0.0433284
[400]	valid_0's auc: 0.998577	valid_0's binary_logloss: 0.042809
[450]	valid_0's auc: 0.99859	valid_0

In [14]:
# ==============================
# 7) Model evaluation
# ==============================
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_pred = model.predict(X_val_k)
y_proba = model.predict_proba(X_val_k)[:,1]

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

print("\nClassification Report:")
print(classification_report(y_val, y_pred, digits=4))

print("ROC AUC:", round(roc_auc_score(y_val, y_proba), 4))


Confusion Matrix:
[[4894  106]
 [  39 4961]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9921    0.9788    0.9854      5000
           1     0.9791    0.9922    0.9856      5000

    accuracy                         0.9855     10000
   macro avg     0.9856    0.9855    0.9855     10000
weighted avg     0.9856    0.9855    0.9855     10000

ROC AUC: 0.9987


In [15]:
import joblib

# Save the final model
model_path = r"C:\Users\jites\Desktop\EcoCred\backend\models\lending_club_model_1.pkl"
joblib.dump(model, model_path)
print("Model saved to", model_path)


Model saved to C:\Users\jites\Desktop\EcoCred\backend\models\lending_club_model_1.pkl
