In [148]:
import pandas as pd

df = pd.read_csv('loan_data.csv')
df = pd.get_dummies(df, columns=['purpose'], drop_first=True)
X = df.drop('not.fully.paid', axis=1)
y = df['not.fully.paid']

In [149]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def result(y_test, y_pred, model):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    return { 
        "model_name": model if type(model) == str else type(model).__name__,
        "accuracy": accuracy, 
        "precision": precision, 
        "recall": recall, 
        "f1": f1, 
    }


In [150]:
from sklearn.model_selection import train_test_split


def create_model(model, X, y, **params):
    default_params = { "test_size": .3 } if not params else params
    X_train, X_test, y_train, y_test = train_test_split(X, y, **default_params)

    y_pred = model.fit(X_train, y_train).predict(X_test)

    return { "y_test": y_test, "y_pred": y_pred, "model": model }


In [157]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

results = []

logistic_regression = create_model(LogisticRegression(max_iter=1000), X, y, random_state=42, test_size=.25)
decision_tree = create_model(DecisionTreeClassifier(), X, y, random_state=42, test_size=.25)
random_forest = create_model(RandomForestClassifier(max_depth=5), X, y, random_state=42, test_size=.25)
xgboost = create_model(XGBClassifier(n_estimators=2, max_depth=6, learning_rate=1, objective='binary:logistic'), X, y, random_state=42, test_size=.25)

estimators = ([
    ('LogisticRegression', logistic_regression["model"]),
    ('DecisionTreeClassifier', decision_tree["model"]),
    ('RandomForestClassifier', random_forest["model"]),
    ('XGBClassifier', xgboost["model"])
])

soft_voting = create_model(VotingClassifier(estimators=estimators, voting="soft"), X, y)
hard_voting = create_model(VotingClassifier(estimators=estimators, voting="hard"), X, y)

results.append(result(**logistic_regression))
results.append(result(**decision_tree))
results.append(result(**random_forest))
results.append(result(**xgboost))

soft_voting["model"] = "Soft Voting"
results.append(result(**soft_voting))

hard_voting["model"] = "Hard Voting"
results.append(result(**hard_voting))


In [None]:
import dtreeviz

viz_model = dtreeviz.model(decision_tree,
                           X_train=X, y_train=y,
                           feature_names=X.columns,
                           target_name='loans')

v = viz_model.view()     # render as SVG into internal object 
v.show()                 # pop up window
v.save("./loans_decision_tree.svg")  # optionally save as svg

In [158]:
models_results = pd.DataFrame(results).sort_values(by=['f1'], ascending=False)

models_results

Unnamed: 0,model_name,accuracy,precision,recall,f1
4,Soft Voting,0.84064,0.786968,0.84064,0.780123
3,XGBClassifier,0.82881,0.760223,0.82881,0.776743
0,LogisticRegression,0.840084,0.787378,0.840084,0.772481
5,Hard Voting,0.840292,0.707168,0.840292,0.768004
2,RandomForestClassifier,0.840084,0.70574,0.840084,0.767074
1,DecisionTreeClassifier,0.74405,0.752729,0.74405,0.748278
