## Load data

In [None]:
from utils import *

import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sb
from copy import deepcopy

%matplotlib inline

# Read every raw table with the shared `read_to_df` helper; the names on the left
# line up one-to-one, in order, with the file names on the right.
(
    account_df,
    card_test_df,
    card_train_df,
    client_df,
    disp_df,
    district_df,
    loan_test_df,
    loan_train_df,
    trans_test_df,
    trans_train_df,
) = map(read_to_df, (
    "account.csv",
    "card_test.csv",
    "card_train.csv",
    "client.csv",
    "disp.csv",
    "district.csv",
    "loan_test.csv",
    "loan_train.csv",
    "trans_test.csv",
    "trans_train.csv",
))

## Process data

### Process account data

In [None]:
avg_transactions_per_week = 3
avg_weeks_per_month = (365.25 / 7 / 12)


def _issuances_per_month(frequency):
    """Map a raw `frequency` label to a number of statement issuances per month.

    'monthly issuance' -> 1, 'weekly issuance' -> average weeks per month,
    any other label -> average weeks per month times `avg_transactions_per_week`.
    """
    if frequency == 'monthly issuance':
        return 1
    if frequency == 'weekly issuance':
        return avg_weeks_per_month
    return (365.25 / 7 / 12) * avg_transactions_per_week


account_df['frequency'] = account_df['frequency'].apply(_issuances_per_month)
account_df["date"] = account_df["date"].apply(read_date)
account_df = account_df.rename(
    columns={"date": "creation_date", "frequency": "issuance_frequency_per_month"}
)

account_df.head()

### Process client data

In [None]:
def _sex_from_birth_number(birth_number):
    """Return 0 when characters 2-3 of the birth number exceed 50, otherwise 1.

    NOTE(review): presumably those two characters are the month field, shifted by 50
    for one of the sexes - confirm against the dataset description.
    """
    month_field = int(str(birth_number)[2:4])
    return 0 if month_field > 50 else 1


def _age_from_birth_number(birth_number):
    """Return the age computed by `calculate_age` from the parsed birth number."""
    return calculate_age(read_date(birth_number))


client_df["sex"] = client_df["birth_number"].apply(_sex_from_birth_number)
client_df["age"] = client_df["birth_number"].apply(_age_from_birth_number)

# The raw birth number is fully replaced by the two derived columns
client_df = client_df.drop(columns="birth_number")

client_df.head()

### Process disposition data

In [None]:
disp_df.rename(columns={"type": "is_owner"}, inplace=True)

# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: with pandas Copy-on-Write the chained in-place call modifies a
# temporary Series and leaves `disp_df` untouched, so the owner filter below
# would silently match nothing.
disp_df["is_owner"] = disp_df["is_owner"].replace({"OWNER": True, "DISPONENT": False})

# Count the number of clients (dispositions) per account
client_count_df = disp_df.groupby("account_id", as_index=False, group_keys=False).agg(client_count=("is_owner", "count"))

disp_df = disp_df.merge(client_count_df, on="account_id")

# Keep only the owner's disposition; the flag is constant afterwards and is dropped
disp_df = disp_df[disp_df["is_owner"] == True]
disp_df = disp_df.drop(columns="is_owner")

disp_df.head()

### Process transaction data

In [None]:
dataframes = [trans_train_df, trans_test_df]

# Integer code assigned to each transaction operation label
operation_codes = {
    "credit in cash": 1,
    "collection from another bank": 2,
    "withdrawal in cash": 3,
    "remittance to another bank": 4,
    "credit card withdrawal": 5,
    "interest credited": 6
}

for i in range(len(dataframes)):
    trans_df = dataframes[i]

    # Assign back instead of `trans_df["operation"].replace(..., inplace=True)`:
    # the chained in-place form acts on a temporary Series under pandas
    # Copy-on-Write and would leave the labels unencoded.
    trans_df["operation"] = trans_df["operation"].replace(operation_codes)

    # Convert "withdrawal in cash" to "withdrawal" in type column
    trans_df.loc[trans_df["type"] == "withdrawal in cash", "type"] = "withdrawal"

    # Withdrawal amounts should be negative
    trans_df.loc[trans_df["type"] == "withdrawal", "amount"] *= -1

    trans_df["date"] = trans_df["date"].apply(read_date)

    # Rename the date column and discard the columns that are not used later
    dataframes[i] = (
        trans_df
        .rename(columns={"date": "transaction_date"})
        .drop(columns=["k_symbol", "bank", "account"])
    )

trans_train_df, trans_test_df = dataframes

trans_train_df.head()

### Process card data

In [None]:
dataframes = [card_train_df, card_test_df]

# Integer code assigned to each card type label
card_type_codes = {"classic": 1, "junior": 2, "gold": 3}

for card_df in dataframes:
    # Assign back rather than using a chained `replace(..., inplace=True)`, which
    # does not reach the parent frame under pandas Copy-on-Write.
    card_df["type"] = card_df["type"].replace(card_type_codes)
    card_df["issued"] = card_df["issued"].apply(read_date)

card_train_df, card_test_df = dataframes

card_train_df.head()

### Process demographic data

In [None]:

# "?" is replaced by NaN so it is treated as a missing value.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.
district_df.replace("?", np.nan, inplace=True)

# Convert to numeric BEFORE imputing: the two '95 columns still hold strings at this
# point, and recent pandas versions refuse to compute a median on them. The
# result is assigned back because a chained `fillna(..., inplace=True)` does not
# reach the parent frame under pandas Copy-on-Write.
for col_95 in ("unemploymant rate '95", "no. of commited crimes '95"):
    district_df[col_95] = pd.to_numeric(district_df[col_95])
    district_df[col_95] = district_df[col_95].fillna(district_df[col_95].median())

# Change in committed crimes between '95 and '96, relative to the population
district_df["criminality_growth"] = (district_df["no. of commited crimes '96"] - district_df["no. of commited crimes '95"]) / district_df["no. of inhabitants"]
# Change in the unemployment rate between '95 and '96
district_df["unemployment_growth"] = (district_df["unemploymant rate '96"] - district_df["unemploymant rate '95"])
# Entrepreneurs per inhabitant (the raw column is expressed per 1000 inhabitants)
district_df["ratio_entrepeneurs"] = district_df["no. of enterpreneurs per 1000 inhabitants"] / 1000

# The raw columns are fully represented by the derived features above
district_df.drop([
    "unemploymant rate '95",
    "unemploymant rate '96",
    "no. of commited crimes '95",
    "no. of commited crimes '96",
    "no. of enterpreneurs per 1000 inhabitants"
], axis=1, inplace=True)

### Process loan data

In [None]:
loan_dfs = [loan_train_df, loan_test_df]

# Both frames are modified in place: `loan_train_df` / `loan_test_df` are not
# rebound after this loop, so later cells see the changes through the same objects.
for loan_df in loan_dfs:
    loan_df["date"] = loan_df["date"].apply(read_date)
    loan_df.rename(
        columns={"date": "loan_date", "amount": "loan_amount"},
        inplace=True,
    )

## Generate features from transaction data

In [None]:
# transactions = (trans_train_df, trans_test_df)
# account_features = [1, 2]

# for i in range(len(transactions)):
#     # Sorting transactions by date to figure out the most recent balance
#     account_features[i] = transactions[i].sort_values(by="date", axis=0, ascending=False)
#     account_features[i].drop_duplicates(subset='account_id', keep='first', inplace=True)

#     account_features[i].drop(account_features[i].columns.difference(['account_id', 'balance']), axis=1, inplace=True)
#     account_features[i].rename(columns={'balance': 'final_amount'}, inplace=True)

# account_features[0]

## Merge data

In [None]:
loan_dfs = [loan_train_df, loan_test_df]
trans_dfs = (trans_train_df, trans_test_df)
cards_dfs = (card_train_df, card_test_df)

# Enrich each loan frame step by step. Train and test share the account, client,
# disposition and district tables but use their own card and transaction tables.
for idx, (cards_df, trans_df) in enumerate(zip(cards_dfs, trans_dfs)):
    loan_dfs[idx] = (
        loan_dfs[idx]
        # dispositions (left join)
        .merge(disp_df, on="account_id", how="left")
        # accounts
        .merge(account_df, on="account_id")
        # clients; columns present on both sides get the _account / _client suffix
        .merge(client_df, on="client_id", suffixes=["_account", "_client"])
        # districts, matched on the client's district
        .merge(district_df, left_on="district_id_client", right_on="code")
        # cards (left join)
        .merge(cards_df, on="disp_id", how="left")
        # transactions: the result has one row per (loan, transaction) pair
        .merge(trans_df, on="account_id", suffixes=["_card", "_transaction"])
    )

loan_train_df, loan_test_df = loan_dfs

## Encode district region

In [None]:
columns = ["region"]

# --- Train: the encoder is fitted against a boolean target (True when status == 1)
loan_train_df["status"] = loan_train_df["status"] == 1
woe_encoder = ce.WOEEncoder(cols=columns)
woe_encoder = woe_encoder.fit(loan_train_df[columns], loan_train_df["status"])
woe_encoded_train = woe_encoder.transform(loan_train_df[columns]).add_suffix('_woe')
loan_train_df = loan_train_df.join(woe_encoded_train)
# Back to the 1 / -1 encoding
loan_train_df["status"] = loan_train_df["status"].map({True: 1, False: -1})

# --- Test: reuse the encoder fitted on the training data
loan_test_df["status"] = loan_test_df["status"] == 1
woe_encoded_test = woe_encoder.transform(loan_test_df[columns]).add_suffix('_woe')
loan_test_df = loan_test_df.join(woe_encoded_test)
loan_test_df["status"] = loan_test_df["status"].map({True: 1, False: -1})


## Dropping features

- All card columns can be dropped: only 11 of the 328 loans have an associated card, which makes it very hard or impossible to fill in the missing values.
- IDs are no longer needed

In [None]:
loan_dfs = [loan_train_df, loan_test_df]

unused_columns = [
    # card data
    "card_id", "type_card", "issued",
    # identifiers that are no longer needed
    "disp_id", "account_id", "client_id",
    "district_id_account", "district_id_client",
    "trans_id",
]

# Dropped in place: the frames are not rebound after the loop
for loan_df in loan_dfs:
    loan_df.drop(columns=unused_columns, inplace=True)

loan_train_df

## Aggregate data 

In [None]:
# Collect the train/test frames so the aggregation loop below can replace each one by index
loan_dfs = [loan_train_df, loan_test_df]

def count_withdrawal(x):
    """Return the number of entries of `x` equal to the string "withdrawal"."""
    is_withdrawal = (x == "withdrawal")
    return is_withdrawal.sum()

def count_credit(x):
    """Return the number of entries of `x` equal to the string "credit"."""
    is_credit = (x == "credit")
    return is_credit.sum()

# Collapse the per-transaction rows of each frame into one row per group, then add
# derived features and keep only numeric / boolean columns.
for i in range(len(loan_dfs)):
    # Columns that vary per transaction and are summarised below; every other column
    # is used as a grouping key (presumably one group per loan - TODO confirm).
    aggregated_columns = ("transaction_date", "operation", "amount", "balance", "type_transaction", "client_count")
    columns = [x for x in loan_dfs[i].columns.to_list() if x not in aggregated_columns]

    # dropna=False keeps groups whose key columns contain NaN
    df = loan_dfs[i].groupby(columns, as_index=False, group_keys=False, dropna=False)

    # Per group: True when the balance is lower than `payments` in more than two rows.
    # NOTE(review): the names say "under zero" but the comparison is against
    # `payments`, not 0 - confirm which one is intended.
    # NOTE(review): the lambda reads the grouping column `payments` from the group
    # frame; check this against the pandas version in use.
    num_times_under_zero = df.apply(lambda x: pd.Series(dict(
        num_times_under_zero = (x.balance < x.payments).sum() > 2
    )))["num_times_under_zero"]

    # Summary statistics per group; the result has two-level column labels
    df = df.agg({
        "balance": ["mean", "min", "max"],
        "transaction_date": ["max"],
        "client_count": ["mean"],
        "operation": ["count"],
        "amount": ["mean", "min", "max", "std"],
        "type_transaction": [count_withdrawal, count_credit]
    })

    # Assigned by index, so this relies on `apply` and `agg` returning the groups in the same order
    df["balance_dropped_below_zero"] = num_times_under_zero

    # Flatten the two-level labels to "<column>_<statistic>" (no suffix when the second level is empty)
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    # Account age at time of loan in days
    df["account_age"] = (df['loan_date'] - df['creation_date']).dt.days

    # Number of days since last transaction
    df["days_since_last_transaction"] = (df["loan_date"] - df["transaction_date_max"]).dt.days

    # Whether an account has reached a negative balance
    df["reached_negative_balance"] = df["balance_min"] < 0

    # Drop non-numeric columns
    loan_dfs[i] = df.select_dtypes(["number", "bool"])

    # loan_dfs[i].drop(["balance_min", "balance_max"], axis=1, inplace=True)

    # Move the target to the last column
    status = loan_dfs[i].pop("status")
    loan_dfs[i]["status"] = status

# The aggregated frames get new names; loan_train_df / loan_test_df keep the per-transaction rows
train_df, test_df = loan_dfs

pd.set_option('display.max_columns', 500)

train_df


## Clustering

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

RANDOM_STATE=42

# Cluster on the features only. Feeding the target (`status`) and the arbitrary
# identifier (`loan_id`) into the scaler/PCA would leak the label into the
# projection and make any later comparison between clusters and status circular.
X = train_df.drop(columns=["loan_id", "status"])
# X = train_df[["reached_negative_balance", "balance_min", "balance_mean"]]
y = train_df["status"]

X = StandardScaler().fit_transform(X)
X = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(X)

# Candidate numbers of clusters, shared by the fitting loop and both plots
k_values = range(2, 10)

distorsions = []
silhouette = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE)
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)
    silhouette.append(metrics.silhouette_score(X, kmeans.labels_))

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].plot(k_values, silhouette)
ax[1].plot(k_values, distorsions)

ax[0].set_title('Silhouette method')
ax[1].set_title('Elbow curve')

# `plt.grid` only affects the current (last) axes, so enable the grid on each one
for axis in ax:
    axis.set_xlabel('Number of clusters')
    axis.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

# Project the features only: including `status` (the target) or `loan_id` (an
# identifier) would let the label shape the projection that is then compared
# against the label in the right-hand plot.
X = train_df.drop(columns=["loan_id", "status"])
# X = train_df[["reached_negative_balance", "balance_min", "balance_mean"]]
y = train_df["status"]

X = StandardScaler().fit_transform(X)
X = PCA(n_components=2, random_state=RANDOM_STATE).fit_transform(X)
# Cluster label of every loan for k=3
db = KMeans(n_clusters=3, random_state=RANDOM_STATE).fit_predict(X)

# Plot result: K-means clusters (left) next to the true status (right)
fig, axs = plt.subplots(1, 2, figsize=(18, 9))

axs[0].scatter(X[:, 0], X[:, 1], c=db)

scatter = axs[1].scatter(X[:, 0], X[:, 1], c=y)

axs[0].set_title("Kmeans")
axs[1].set_title("Status")

for axis in axs:
    axis.set_xlabel("Principal component 1")
    axis.set_ylabel("Principal component 2")

axs[1].legend(handles=scatter.legend_elements()[0], labels=[-1, 1]);

### Age

In [None]:
# `loan_train_df` holds one row per transaction after the merge, so keep a single
# row per loan; otherwise the bars count transactions, not loans.
loans_unique = loan_train_df.drop_duplicates(subset="loan_id")

ax = sb.histplot(data=loans_unique, x="age", hue="status", bins=20)

# Set the title and labels on the returned axes. The previous `.set(title="")`
# erased the title that had just been set with `plt.title`.
ax.set(title="Age distribution by loan request", xlabel="Age", ylabel="Number of loans");

### Loan amount

In [None]:
# One row per loan: `loan_train_df` repeats each loan once per transaction,
# which would otherwise weight every loan by its number of transactions.
loans_unique = loan_train_df.drop_duplicates(subset="loan_id")

ax = sb.histplot(data=loans_unique, x="loan_amount", hue="status", bins=30)
ax.set(title="Loan amount distribution by status", xlabel="Loan amount", ylabel="Number of loans");

### Salary


In [None]:
plt.figure(figsize=(18, 8))

# One marker per loan instead of one per transaction (`loan_train_df` repeats every
# loan for each of its transactions, stacking identical markers on top of each other).
loans_unique = loan_train_df.drop_duplicates(subset="loan_id")

ax = sb.scatterplot(x=loans_unique["average salary"], y=loans_unique["loan_amount"], marker="x")
ax.set(title="Loan amount vs. average salary", xlabel="Average salary", ylabel="Loan amount");

### Correlation matrix

In [None]:
# Restrict to numeric/boolean columns explicitly: `loan_train_df` still contains
# string and date columns, and recent pandas versions raise on them in `corr`
# instead of silently skipping them.
corr_matrix = (
    loan_train_df
    .drop("loan_id", axis=1)
    .select_dtypes(["number", "bool"])
    .corr(method='spearman')
)

# Minimum absolute Spearman correlation with `status` for a column to be kept
threshold = 0.05

correlation_status = corr_matrix.loc[['status'], :]
selected_cols = set(correlation_status.loc[:, (abs(correlation_status) > threshold).any()].columns.to_list())
dropped_cols = set.difference(set(correlation_status.columns.to_list()), selected_cols)

# `drop` expects a list-like of labels; a sorted list also makes the order deterministic.
# NOTE(review): this only changes loan_train_df / loan_test_df; the models further
# down are tuned on train_df, which is not affected by this selection.
loan_train_df.drop(sorted(dropped_cols), axis=1, inplace=True)
loan_test_df.drop(sorted(dropped_cols), axis=1, inplace=True)

corr_matrix = (
    loan_train_df
    .drop("loan_id", axis=1)
    .select_dtypes(["number", "bool"])
    .corr(method='spearman')
)

# Hide the upper triangle (diagonal included): the matrix is symmetric
mask = np.zeros(corr_matrix.shape, dtype=bool)

plt.figure(figsize=(15, 15))

mask[np.triu_indices(len(mask))] = True

plt.title('Correlation Heatmap of client Dataset')

sb.heatmap(corr_matrix, square=True, annot=True, fmt='.2f', linecolor='black', mask=mask, cbar=False)

plt.show()

# Prediction

## Set random state

In [None]:
# Seed passed as `random_state` to the estimators below so that runs are repeatable
RANDOM_STATE=42

## Columns to drop and feature target

In [None]:
# Target column, and the columns that are never used as model inputs
target_column = "status"
columns_to_drop = ["loan_id", "status"]

# Preview of the feature matrix
train_df.drop(columns=columns_to_drop).head()

## Tune Models

For each model, and for each combination of oversampling and feature selection (applied or not), we plot the ROC curves and the confusion matrix (in which 0 represents `status` equal to -1, that is, rejected loans).

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Hyper-parameter values explored for the decision tree
parameter_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=range(1, 7),
)

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both
dt, dt_fs, dt_os, dt_fs_os = [
    tune_model(
        train_df,
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four decision-tree variants; no scaler, matching how they were tuned
plotROC([dt, dt_fs, dt_os, dt_fs_os], train_df, columns_to_drop, target_column, scaler=None)

In [None]:
# Confusion matrices for the four decision-tree variants (no scaler argument, as in tuning)
confMatrix([dt, dt_fs, dt_os, dt_fs_os], columns_to_drop, target_column, train_df)

### SVM
Needs to be scaled (StandardScaler)

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Hyper-parameter values explored for the support vector classifier
parameter_grid = dict(
    C=[1, 10, 50],
    gamma=[0.001, 0.0001],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both.
# `probability=True` is what gives the SVC a `predict_proba` method.
svc, svc_fs, svc_os, svc_fs_os = [
    tune_model(
        train_df,
        SVC(probability=True, random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]


In [None]:
# ROC curves for the four SVC variants, with a StandardScaler as in tuning
plotROC([svc, svc_fs, svc_os, svc_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler());

In [None]:
# Confusion matrices for the four SVC variants, with a StandardScaler as in tuning
confMatrix([svc, svc_fs, svc_os, svc_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler());

###  K-nearest neighbours (KNN)
Just like the SVM model, the KNN model also requires the data to be scaled.

In [None]:
from sklearn import neighbors

# Hyper-parameter values explored for k-nearest neighbours
parameter_grid = dict(
    n_neighbors=[4, 5, 6, 7, 10, 15],
    leaf_size=[5, 10, 15, 20, 50, 100],
    n_jobs=[-1],
    algorithm=['auto'],
)

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both
knn, knn_fs, knn_os, knn_fs_os = [
    tune_model(
        train_df,
        neighbors.KNeighborsClassifier(),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four KNN variants, with a StandardScaler as in tuning
plotROC([knn, knn_fs, knn_os, knn_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# Confusion matrices for the four KNN variants, with a StandardScaler as in tuning
confMatrix([knn, knn_fs, knn_os, knn_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())

### Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# No hyper-parameters are searched for Gaussian Naive Bayes
parameter_grid = dict()

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both
nb, nb_fs, nb_os, nb_fs_os = [
    tune_model(
        train_df,
        GaussianNB(),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# The Naive Bayes models were tuned with a StandardScaler, so pass one here as well,
# as is done for the other scaled models (SVC, KNN, Logistic Regression).
plotROC([nb, nb_fs, nb_os, nb_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# The Naive Bayes models were tuned with a StandardScaler, so pass one here as well,
# as is done for the other scaled models (SVC, KNN, Logistic Regression).
confMatrix([nb, nb_fs, nb_os, nb_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())


In [None]:
# print(pd.DataFrame(nb.best_estimator_.steps[-1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns, columns=["Importance"]))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter values explored for the random forest
parameter_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[5, 10, 15],
    n_jobs=[-1],  # Use all cores
    criterion=['gini', 'entropy'],
)

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both
rfc, rfc_fs, rfc_os, rfc_fs_os = [
    tune_model(
        train_df,
        RandomForestClassifier(random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four random-forest variants; no scaler, matching how they were tuned
plotROC([rfc, rfc_fs, rfc_os, rfc_fs_os], train_df, columns_to_drop, target_column, scaler=None)

In [None]:
# The random forests were tuned WITHOUT a scaler (and their ROC plot uses scaler=None),
# so no scaler is passed here either, matching the decision-tree confusion-matrix call.
confMatrix([rfc, rfc_fs, rfc_os, rfc_fs_os], columns_to_drop, target_column, train_df)


In [None]:
# Feature importances of the last step of the best estimator found for `rfc`,
# labelled with the names of the model input columns
rfc_final_step = rfc.best_estimator_.steps[-1][1]
feature_names = train_df.drop(columns_to_drop, axis=1).columns
importance_table = pd.DataFrame(
    rfc_final_step.feature_importances_,
    index=feature_names,
    columns=["Importance"],
)
print(importance_table)
# print(pd.DataFrame(rfc_fs.best_estimator_.steps[1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns, columns=["Importance"]))
# print(pd.DataFrame(rfc_fs.best_estimator_.steps[1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns[rfc_fs.best_estimator_.steps[0][1].get_feature_names_out()], columns=["Importance"]))
# rfc_fs.best_estimator_.steps[0][1].get_feature_names_out()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter values explored for logistic regression
parameter_grid = dict(
    max_iter=[1000, 5000, 10000],
    solver=["lbfgs"],
)

# One `tune_model` result per (oversample, feature_selection) pair, in the order:
# neither, feature selection only, oversampling only, both
lr, lr_fs, lr_os, lr_fs_os = [
    tune_model(
        train_df,
        LogisticRegression(class_weight="balanced", random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four logistic-regression variants, with a StandardScaler as in tuning
plotROC([lr, lr_fs, lr_os, lr_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# Confusion matrices for the four logistic-regression variants, with a StandardScaler as in tuning
confMatrix([lr, lr_fs, lr_os, lr_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())

## Comparing Models

In [None]:
import seaborn as sb

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = ["No Feature selection/No oversampling",
                 "Feature Selection", "Oversampling", "Feature Selection/Oversampling"]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# `best_score_` of every tuned model.
# NOTE(review): the title says ROC-AUC; confirm that is the scoring `tune_model` uses.
scores_array = np.array([[model.best_score_ for model in models]
                         for models in scores.values()])

sb.set(font_scale=1.3)
sb.heatmap(scores_array, annot=True, linewidths=0.5, vmax=1,
           square=True, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cbar=False)
plt.title("ROC-AUC")

# The output-suppressing `;` must end the statement: alone on its own line it is a syntax error
plt.xticks(rotation=45, horizontalalignment='right');

In [None]:

from sklearn.metrics import f1_score, accuracy_score

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = ["No Feature selection/No oversampling",
                 "Feature Selection", "Oversampling", "Feature Selection/Oversampling"]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# One scaler per ALGORITHM, in the same order as the rows of `scores`
# (None for the two algorithms tuned without scaling: Decision Tree and Random Forest).
scale = [None, StandardScaler(), StandardScaler(), StandardScaler(), None, StandardScaler()]

# Pair each row with its own scaler. Indexing `scale` with the variant position
# (0..3) gave every unscaled-variant column no scaler and every other column a
# StandardScaler, regardless of the algorithm.
# Note: the scores are computed on `train_df`, the frame the models were tuned on.
score_rows = []
for models, scaler in zip(scores.values(), scale):
    # Build the inputs once per algorithm instead of twice per model
    X_y = get_X_y(train_df, columns_to_drop, target_column, scaler=scaler)
    score_rows.append([accuracy_score(X_y[1], model.best_estimator_.predict(X_y[0]))
                       for model in models])

scores_array = np.array(score_rows)

sb.set(font_scale=1.3)
sb.heatmap(scores_array, annot=True, linewidths=0.5, vmax=1,
           square=True, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cbar=False)
plt.title("Accuracy")

plt.xticks(rotation=45, horizontalalignment='right');


In [None]:

from sklearn.metrics import f1_score

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = ["No Feature selection/No oversampling",
                 "Feature Selection", "Oversampling", "Feature Selection/Oversampling"]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# One scaler per ALGORITHM, in the same order as the rows of `scores`
# (None for the two algorithms tuned without scaling: Decision Tree and Random Forest).
scale = [None, StandardScaler(), StandardScaler(), StandardScaler(), None, StandardScaler()]

# Pair each row with its own scaler. Indexing `scale` with the variant position
# (0..3) paired the models with the wrong scaler.
# Note: the scores are computed on `train_df`, the frame the models were tuned on.
score_rows = []
for models, scaler in zip(scores.values(), scale):
    # Build the inputs once per algorithm instead of twice per model
    X_y = get_X_y(train_df, columns_to_drop, target_column, scaler=scaler)
    score_rows.append([f1_score(X_y[1], model.best_estimator_.predict(X_y[0]))
                       for model in models])

scores_array = np.array(score_rows)

sb.set(font_scale=1.3)
sb.heatmap(scores_array, annot=True, linewidths=0.5, vmax=1,
           square=True, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cbar=False)
plt.title("F1-Score")

plt.xticks(rotation=45, horizontalalignment='right');

## ROC curves

In [None]:
# One flag per algorithm, in the same order as the model list below. True marks the
# algorithms that were tuned with a StandardScaler (SVC, KNN, Naive Bayes, Logistic Regression).
scalers = [False, True, True, True, False, True]

# Variants tuned without oversampling and without feature selection
plotAlgorithmROC([dt, svc, knn, nb, rfc, lr],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)


### Feature Selection

In [None]:
# Feature-selection variants; `scalers` is the per-algorithm flag list defined in the "ROC curves" cell
plotAlgorithmROC([dt_fs, svc_fs, knn_fs, nb_fs, rfc_fs, lr_fs],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

### Oversampling

In [None]:
# Oversampling variants; `scalers` is the per-algorithm flag list defined in the "ROC curves" cell
plotAlgorithmROC([dt_os, svc_os, knn_os, nb_os, rfc_os, lr_os],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

### Feature Selection / Oversampling

In [None]:
# Variants with both feature selection and oversampling; `scalers` comes from the "ROC curves" cell
plotAlgorithmROC([dt_fs_os, svc_fs_os, knn_fs_os, nb_fs_os, rfc_fs_os, lr_fs_os],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

In [None]:
# Same call as the earlier decision-tree confusion-matrix cell
confMatrix([dt, dt_fs, dt_os, dt_fs_os], columns_to_drop, target_column, train_df)

### SVM
Needs to be scaled (StandardScaler)

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Hyper-parameter values explored for the support vector classifier
parameter_grid = dict(
    C=[1, 10, 50],
    gamma=[0.001, 0.0001],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Same search as the earlier SVM cell: it rebinds svc, svc_fs, svc_os and svc_fs_os.
# Order of the pairs: neither, feature selection only, oversampling only, both.
svc, svc_fs, svc_os, svc_fs_os = [
    tune_model(
        train_df,
        SVC(probability=True, random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]


In [None]:
# ROC curves for the four SVC variants, with a StandardScaler as in tuning
plotROC([svc, svc_fs, svc_os, svc_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# Confusion matrices for the four SVC variants, with a StandardScaler as in tuning
confMatrix([svc, svc_fs, svc_os, svc_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())

###  K-nearest neighbours (KNN)
Just like the SVM model, the KNN model also requires the data to be scaled.

In [None]:
from sklearn import neighbors

# Hyper-parameter values explored for k-nearest neighbours
parameter_grid = dict(
    n_neighbors=[4, 5, 6, 7, 10, 15],
    leaf_size=[5, 10, 15, 20, 50, 100],
    n_jobs=[-1],
    algorithm=['auto'],
)

# Same search as the earlier KNN cell: it rebinds knn, knn_fs, knn_os and knn_fs_os.
# Order of the pairs: neither, feature selection only, oversampling only, both.
knn, knn_fs, knn_os, knn_fs_os = [
    tune_model(
        train_df,
        neighbors.KNeighborsClassifier(),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four KNN variants, with a StandardScaler as in tuning
plotROC([knn, knn_fs, knn_os, knn_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# Confusion matrices for the four KNN variants, with a StandardScaler as in tuning
confMatrix([knn, knn_fs, knn_os, knn_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())

### Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# No hyper-parameters are searched for Gaussian Naive Bayes
parameter_grid = dict()

# Same search as the earlier Naive Bayes cell: it rebinds nb, nb_fs, nb_os and nb_fs_os.
# Order of the pairs: neither, feature selection only, oversampling only, both.
nb, nb_fs, nb_os, nb_fs_os = [
    tune_model(
        train_df,
        GaussianNB(),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# The Naive Bayes models were tuned with a StandardScaler, so pass one here as well,
# as is done for the other scaled models (SVC, KNN, Logistic Regression).
plotROC([nb, nb_fs, nb_os, nb_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# The Naive Bayes models were tuned with a StandardScaler, so pass one here as well,
# as is done for the other scaled models (SVC, KNN, Logistic Regression).
confMatrix([nb, nb_fs, nb_os, nb_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())


In [None]:
# print(pd.DataFrame(nb.best_estimator_.steps[-1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns, columns=["Importance"]))


### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter values explored for the random forest
parameter_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[5, 10, 15],
    n_jobs=[-1],  # Use all cores
    criterion=['gini', 'entropy'],
)

# Same search as the earlier Random Forest cell: it rebinds rfc, rfc_fs, rfc_os and rfc_fs_os.
# Order of the pairs: neither, feature selection only, oversampling only, both.
rfc, rfc_fs, rfc_os, rfc_fs_os = [
    tune_model(
        train_df,
        RandomForestClassifier(random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four random-forest variants; no scaler, matching how they were tuned
plotROC([rfc, rfc_fs, rfc_os, rfc_fs_os], train_df, columns_to_drop, target_column, scaler=None)

In [None]:
# The random forests were tuned WITHOUT a scaler (and their ROC plot uses scaler=None),
# so no scaler is passed here either, matching the decision-tree confusion-matrix call.
confMatrix([rfc, rfc_fs, rfc_os, rfc_fs_os], columns_to_drop, target_column, train_df)


In [None]:
# Feature importances of the last step of the best estimator found for `rfc`,
# labelled with the names of the model input columns
rfc_final_step = rfc.best_estimator_.steps[-1][1]
feature_names = train_df.drop(columns_to_drop, axis=1).columns
importance_table = pd.DataFrame(
    rfc_final_step.feature_importances_,
    index=feature_names,
    columns=["Importance"],
)
print(importance_table)
# print(pd.DataFrame(rfc_fs.best_estimator_.steps[1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns, columns=["Importance"]))
# print(pd.DataFrame(rfc_fs.best_estimator_.steps[1][1].feature_importances_, index=train_df.drop(columns_to_drop, axis=1).columns[rfc_fs.best_estimator_.steps[0][1].get_feature_names_out()], columns=["Importance"]))
# rfc_fs.best_estimator_.steps[0][1].get_feature_names_out()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameter values explored for logistic regression
parameter_grid = dict(
    max_iter=[1000, 5000, 10000],
    solver=["lbfgs"],
)

# Same search as the earlier Logistic Regression cell: it rebinds lr, lr_fs, lr_os and lr_fs_os.
# Order of the pairs: neither, feature selection only, oversampling only, both.
lr, lr_fs, lr_os, lr_fs_os = [
    tune_model(
        train_df,
        LogisticRegression(class_weight="balanced", random_state=RANDOM_STATE),
        parameter_grid,
        columns_to_drop,
        target_column,
        scaler=StandardScaler(),
        oversample=use_oversampling,
        feature_selection=use_feature_selection,
    )
    for use_oversampling, use_feature_selection
    in [(False, False), (False, True), (True, False), (True, True)]
]

In [None]:
# ROC curves for the four logistic-regression variants, with a StandardScaler as in tuning
plotROC([lr, lr_fs, lr_os, lr_fs_os], train_df, columns_to_drop, target_column, scaler=StandardScaler())

In [None]:
# Confusion matrices for the four logistic-regression variants, with a StandardScaler as in tuning
confMatrix([lr, lr_fs, lr_os, lr_fs_os], columns_to_drop, target_column, train_df, scaler=StandardScaler())

## Comparing Models

In [None]:
import seaborn as sb

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = [
    "No Feature selection/No oversampling",
    "Feature Selection",
    "Oversampling",
    "Feature Selection/Oversampling",
]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# Collect `best_score_` of every tuned model, one row per algorithm
best_score_rows = []
for models in scores.values():
    best_score_rows.append([model.best_score_ for model in models])
scores_array = np.array(best_score_rows)

sb.set(font_scale=1.3)
sb.heatmap(
    scores_array,
    annot=True,
    linewidths=0.5,
    vmax=1,
    square=True,
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
    cbar=False,
)
plt.title("ROC-AUC")

plt.xticks(rotation=45, horizontalalignment='right');

In [None]:

from sklearn.metrics import f1_score, accuracy_score

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = ["No Feature selection/No oversampling",
                 "Feature Selection", "Oversampling", "Feature Selection/Oversampling"]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# One scaler per ALGORITHM, in the same order as the rows of `scores`
# (None for the two algorithms tuned without scaling: Decision Tree and Random Forest).
scale = [None, StandardScaler(), StandardScaler(), StandardScaler(), None, StandardScaler()]

# Pair each row with its own scaler. Indexing `scale` with the variant position
# (0..3) paired the models with the wrong scaler.
# Note: the scores are computed on `train_df`, the frame the models were tuned on.
score_rows = []
for models, scaler in zip(scores.values(), scale):
    # Build the inputs once per algorithm instead of twice per model
    X_y = get_X_y(train_df, columns_to_drop, target_column, scaler=scaler)
    score_rows.append([accuracy_score(X_y[1], model.best_estimator_.predict(X_y[0]))
                       for model in models])

scores_array = np.array(score_rows)

sb.set(font_scale=1.3)
sb.heatmap(scores_array, annot=True, linewidths=0.5, vmax=1,
           square=True, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cbar=False)
plt.title("Accuracy")

plt.xticks(rotation=45, horizontalalignment='right');


In [None]:

from sklearn.metrics import f1_score

# Rows of the heatmap: one algorithm each; columns: its four tuning variants
scores = {
    "Decision Tree": [dt, dt_fs, dt_os, dt_fs_os],
    "SVC": [svc, svc_fs, svc_os, svc_fs_os],
    "K-nearest Neighbours": [knn, knn_fs, knn_os, knn_fs_os],
    "Naive Bayes": [nb, nb_fs, nb_os, nb_fs_os],
    "Random Forest": [rfc, rfc_fs, rfc_os, rfc_fs_os],
    "Logistic Regression": [lr, lr_fs, lr_os, lr_fs_os]
}

x_axis_labels = ["No Feature selection/No oversampling",
                 "Feature Selection", "Oversampling", "Feature Selection/Oversampling"]
y_axis_labels = scores.keys()

plt.figure(figsize=(10, 8))

# One scaler per ALGORITHM, in the same order as the rows of `scores`
# (None for the two algorithms tuned without scaling: Decision Tree and Random Forest).
scale = [None, StandardScaler(), StandardScaler(), StandardScaler(), None, StandardScaler()]

# Pair each row with its own scaler. Indexing `scale` with the variant position
# (0..3) paired the models with the wrong scaler.
# Note: the scores are computed on `train_df`, the frame the models were tuned on.
score_rows = []
for models, scaler in zip(scores.values(), scale):
    # Build the inputs once per algorithm instead of twice per model
    X_y = get_X_y(train_df, columns_to_drop, target_column, scaler=scaler)
    score_rows.append([f1_score(X_y[1], model.best_estimator_.predict(X_y[0]))
                       for model in models])

scores_array = np.array(score_rows)

sb.set(font_scale=1.3)
sb.heatmap(scores_array, annot=True, linewidths=0.5, vmax=1,
           square=True, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cbar=False)
plt.title("F1-Score")

plt.xticks(rotation=45, horizontalalignment='right');

## ROC curves

In [None]:
# One flag per algorithm, in the same order as the model list below. True marks the
# algorithms that were tuned with a StandardScaler (SVC, KNN, Naive Bayes, Logistic Regression).
scalers = [False, True, True, True, False, True]

# Variants tuned without oversampling and without feature selection
plotAlgorithmROC([dt, svc, knn, nb, rfc, lr],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)


### Feature Selection

In [None]:
# Feature-selection variants; `scalers` is the per-algorithm flag list defined in the "ROC curves" cell
plotAlgorithmROC([dt_fs, svc_fs, knn_fs, nb_fs, rfc_fs, lr_fs],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

### Oversampling

In [None]:
# Oversampling variants; `scalers` is the per-algorithm flag list defined in the "ROC curves" cell
plotAlgorithmROC([dt_os, svc_os, knn_os, nb_os, rfc_os, lr_os],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

### Feature Selection / Oversampling

In [None]:
# Variants with both feature selection and oversampling; `scalers` comes from the "ROC curves" cell
plotAlgorithmROC([dt_fs_os, svc_fs_os, knn_fs_os, nb_fs_os, rfc_fs_os, lr_fs_os],
                 ["Decision Tree", "SVC", "KNN", "Naive Bayes", "Random Forest", "Logistic Regression"],
                 train_df,
                 columns_to_drop,
                 target_column,
                 scalers=scalers)

## Save results

In [None]:
from pathlib import Path

# Model inputs for the test loans (same columns removed as for training)
X_result = test_df.drop(columns_to_drop, axis=1)

# Column 0 of `predict_proba` is the probability of the first class label.
# NOTE(review): with `status` encoded as -1 / 1 that is presumably the -1 class;
# confirm this is the probability the submission expects.
y_result = dt_fs_os.predict_proba(X_result)[:, 0]

result = pd.DataFrame({"Id": test_df["loan_id"], "Predicted": y_result})
result.drop_duplicates(inplace=True)

# Create the output directory first: `to_csv` fails when it does not exist
# (for example on a fresh checkout).
output_path = Path("../out/result.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(output_path, index=False)