In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# set up directory and load the data
ROOT = Path.cwd()
DATA = ROOT/ "data/cleaned.csv"

dat_path = Path(DATA)
df = pd.read_csv(dat_path)
df.head()

Unnamed: 0,LoanNr_ChkDgt,Name,State,BankState,ApprovalFY,Term,NoEmp,CreateJob,RetainedJob,UrbanRural,...,IsFranchise,NewBusiness,Default,DaysToDisbursement,DisbursementFY,StateSame,GuarantyRate,AppvDisbursed,RealEstate,GreatRecession
0,1000014003,ABC HOBBYCRAFT,IN,OH,1997,84,4,0,0,0,...,0,1,0,730,1999,0,0.8,1,0,0
1,1000024006,LANDMARK BAR & GRILLE (THE),IN,IN,1997,60,2,0,0,0,...,0,1,0,92,1997,1,0.8,1,0,0
2,1000034009,"WHITLOCK DDS, TODD M.",IN,IN,1997,180,7,0,0,0,...,0,0,0,306,1997,1,0.75,1,0,1
3,1000084002,"B&T SCREW MACHINE COMPANY, INC",CT,DE,1997,120,19,0,0,0,...,0,0,0,122,1997,0,0.75,1,0,1
4,1000094005,WEAVER PRODUCTS,FL,AL,1997,84,1,0,0,0,...,0,1,0,487,1998,0,0.8,1,0,0


In [3]:
# drop columns not necessary for modeling 
df_alz = df.drop(columns= ["LoanNr_ChkDgt", "Name", "State", "BankState"]) # as they do not fit the scenario that I try to simulate

In [4]:
def schema_summary(df_alz):
    s = []
    for c in df_alz.columns:
        ser = df_alz[c]
        s.append({
            "column": c,
            "dtype": str(ser.dtype),
            "non_null": int(ser.notna().sum()),
            "nulls": int(ser.isna().sum()),
            "null_pct": round(100 * ser.isna().mean(), 2),
            "n_unique": int(ser.nunique(dropna=True)),
            "sample_values": ", ".join(map(str, ser.dropna().unique()[:5]))
        })
    return pd.DataFrame(s).sort_values(["null_pct", "n_unique"], ascending=[False, False])

ss = schema_summary(df_alz)
display(ss)

Unnamed: 0,column,dtype,non_null,nulls,null_pct,n_unique,sample_values
8,DisbursementGross,float64,456531,0,0.0,100829,"60000.0, 40000.0, 287000.0, 517000.0, 45000.0"
9,GrAppv,float64,456531,0,0.0,12771,"60000.0, 40000.0, 287000.0, 517000.0, 45000.0"
17,GuarantyRate,float64,456531,0,0.0,5714,"0.8, 0.75, 0.85, 0.5, 1.0"
14,DaysToDisbursement,int64,456531,0,0.0,2179,"730, 92, 306, 122, 487"
2,NoEmp,int64,456531,0,0.0,432,"4, 2, 7, 19, 1"
1,Term,int64,456531,0,0.0,373,"84, 60, 180, 120, 297"
4,RetainedJob,int64,456531,0,0.0,276,"0, 23, 4, 6, 2"
3,CreateJob,int64,456531,0,0.0,192,"0, 5, 4, 1, 8"
15,DisbursementFY,int64,456531,0,0.0,31,"1999, 1997, 1998, 2006, 2001"
0,ApprovalFY,int64,456531,0,0.0,29,"1997, 2006, 1998, 1999, 2000"


In [5]:
df_alz['Default'].value_counts()

Default
0    358187
1     98344
Name: count, dtype: int64

## Baseline model - logistic regression

In [6]:
# preparation
# the outcome variable 
target_col = "Default"

# separate column types
# numeric
num_cols = [col for col in df_alz.select_dtypes(include=[np.number]).columns if col != target_col]
# categorical
cat_cols = [col for col in df_alz.columns if col not in num_cols and col!= target_col]

# handle different data type and preprocess respectively
preprocessor = ColumnTransformer(transformers=[
    ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
    ("cat", Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]), cat_cols)
], remainder="drop")

# define outcome and features 
y = df_alz[target_col].copy()
x = df_alz.drop(columns=[target_col]).copy()
# x_pre = preprocessor.fit_transform(x)
x_pre = preprocessor.fit_transform(x)

# train and test split 
x_train, x_test, y_train, y_test = train_test_split(x_pre, y, test_size=0.2, stratify= y)

# model fitting 
lr = LogisticRegression(max_iter= 1000)
lr.fit(x_train, y_train)

# predictions
y_pred_lr = lr.predict(x_test)

In [7]:
print(classification_report(y_test, y_pred_lr, digits=3))

              precision    recall  f1-score   support

           0      0.882     0.949     0.914     71638
           1      0.742     0.539     0.625     19669

    accuracy                          0.860     91307
   macro avg      0.812     0.744     0.770     91307
weighted avg      0.852     0.860     0.852     91307



## Enhanced model - random forest

In [8]:
# initialization 
rf = RandomForestClassifier(n_estimators= 200, random_state= 42, n_jobs= -1)
rf.fit(x_train, y_train)

# predictions
y_pred_rf = rf.predict(x_test)

# evaluations
print(classification_report(y_test, y_pred_rf, digits= 3))

              precision    recall  f1-score   support

           0      0.956     0.975     0.965     71638
           1      0.901     0.835     0.867     19669

    accuracy                          0.945     91307
   macro avg      0.928     0.905     0.916     91307
weighted avg      0.944     0.945     0.944     91307



In [9]:
# examine the importance of variables
for name, importance in sorted(zip(x.columns, rf.feature_importances_) , key=lambda x: x[1], reverse=True):
    print(f"{name} = {round(importance, 3)}")

Term = 0.443
ApprovalFY = 0.07
NewBusiness = 0.062
DaysToDisbursement = 0.062
DisbursementGross = 0.057
GrAppv = 0.054
NoEmp = 0.032
RetainedJob = 0.03
DisbursementFY = 0.029
StateSame = 0.025
RealEstate = 0.018
UrbanRural = 0.018
CreateJob = 0.016
GuarantyRate = 0.011
RevLineCr = 0.011
IsFranchise = 0.009
AppvDisbursed = 0.003
NACE = 0.002
GreatRecession = 0.001
LowDoc = 0.001


In [10]:
# predicted probability of default 
y_pred_prob_rf = rf.predict_proba(x_pre)[: ,1]
# print(len(y_pred_prob_rf))
# add the result to the original data frame
df["PredictedDefaultProb"] = y_pred_prob_rf 
# classification of risks level
bins = [0, 0.1, 0.3, 0.5, 0.7, 1.0]
labels = ["Very Low", "Low", "Medium", "High", "Very High"]
df["RiskLevel"] = pd.cut(df["PredictedDefaultProb"], bins = bins, labels = labels, include_lowest= True)


In [12]:
# store the result into a csv file 
ANALYZED_DATA = ROOT / "data/analyzed.csv"

# remove the columns State and BankState
df.drop(columns= ["State", "BankState"])

if not ANALYZED_DATA.exists():
    df.to_csv(ANALYZED_DATA, index= False)
    print("File created!")
else:
    print("File already existed.")

File created!
