In [8]:
import pandas as pd 
df=pd.read_csv('loan_dataset.csv')
X=df.drop('default',axis=1)
y=df['default']
X.columns

Index(['age', 'income', 'loan_amount', 'employment_type', 'education',
       'marital_status', 'loan_purpose'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)


In [12]:
num_cols=['age','income','loan_amount']
cat_cols=X.select_dtypes(include=object).columns.tolist()

In [13]:
num_pipeline =Pipeline([
    ('impute',SimpleImputer(strategy='median')),
     ('scaler',StandardScaler())
])

cat_pipeline=Pipeline([
    ('impute',SimpleImputer(strategy="most_frequent")),
    ('encoding',OneHotEncoder(drop="first",handle_unknown="ignore"))
])

preprocessing=ColumnTransformer([
    ('num',num_pipeline,num_cols),
    ('cat',cat_pipeline,cat_cols)
])

     

* BASELINE — SINGLE DEEP TREE

In [14]:
tree = Pipeline([
    ("preprocessing", preprocessing),
    ("model", DecisionTreeClassifier(
        random_state=42
    ))
])

tree.fit(X_train, y_train)

tree_train_auc = roc_auc_score(
    y_train, tree.predict_proba(X_train)[:, 1]
)

tree_test_auc = roc_auc_score(
    y_test, tree.predict_proba(X_test)[:, 1]
)

tree_cv = cross_val_score(
    tree, X, y, cv=5, scoring="roc_auc"
)


* RANDOM FOREST — VARIANCE REDUCTION

In [15]:
rf = Pipeline([
    ("preprocessing", preprocessing),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_features="sqrt",
        random_state=42,
        n_jobs=-1
    ))
])

rf.fit(X_train, y_train)

rf_train_auc = roc_auc_score(
    y_train, rf.predict_proba(X_train)[:, 1]
)

rf_test_auc = roc_auc_score(
    y_test, rf.predict_proba(X_test)[:, 1]
)

rf_cv = cross_val_score(
    rf, X, y, cv=5, scoring="roc_auc"
)


* Compare Results

In [16]:
print("DECISION TREE")
print("Train AUC:", tree_train_auc)
print("Test  AUC:", tree_test_auc)
print("CV Mean :", tree_cv.mean())
print("CV Std  :", tree_cv.std())

print("\nRANDOM FOREST")
print("Train AUC:", rf_train_auc)
print("Test  AUC:", rf_test_auc)
print("CV Mean :", rf_cv.mean())
print("CV Std  :", rf_cv.std())


DECISION TREE
Train AUC: 1.0
Test  AUC: 0.5435624304564322
CV Mean : 0.5373304659604397
CV Std  : 0.003150418902986174

RANDOM FOREST
Train AUC: 1.0
Test  AUC: 0.558163608353265
CV Mean : 0.5535205593962191
CV Std  : 0.009554907245805527
