# Models Evaluation

1. Create models
2. Compare models
3. Export .csv file with results

In [1]:
import pandas as pd
from datetime import date

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")

train_data

Unnamed: 0,loan_id,amount_loan,duration,payments,status,age_clt,gender_clt,frequency,age_acc,region,...,amount_trans_max,amount_trans_std,amount_trans_last,amount_trans_abs_min,balance_mean,balance_min,balance_max,balance_std,balance_last,balance_abs_min
0,4959,80952,24,3373,1,76,1,1,28,0,...,30354.0,8793.271132,138.3,13.5,32590.624074,1100.0,67529.6,12061.705682,27855.2,1100.0
1,4961,30276,12,2523,-1,82,0,1,26,5,...,22708.0,7074.892542,15139.0,14.6,25197.092500,715.0,58157.5,15039.248405,15854.0,715.0
2,4973,165960,24,6915,1,77,0,1,27,5,...,62235.0,16442.527059,114.1,14.6,52523.244800,700.0,107069.6,20955.646998,23703.8,700.0
3,4996,88440,12,7370,1,76,0,0,25,3,...,47976.0,14855.181613,282.6,100.0,62778.090323,200.0,103239.0,21638.258870,79007.6,200.0
4,5002,104808,12,8734,1,81,1,1,28,6,...,25970.0,7504.299350,3900.0,14.6,38709.830000,500.0,57865.3,11517.175248,28015.4,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,7271,392460,60,6541,1,41,0,1,27,1,...,58870.0,15051.099854,216.2,14.6,57131.740559,900.0,115913.4,20293.363480,53259.7,900.0
324,7284,52788,12,4399,1,48,1,1,28,5,...,17920.0,3862.234636,1900.0,14.6,22198.179070,1000.0,41469.1,6652.642956,21029.0,1000.0
325,7304,419880,60,6998,1,76,0,2,26,0,...,64800.0,22749.020565,300.0,39.0,59352.833333,200.0,104039.9,27879.396857,24704.4,200.0
326,7305,54024,12,4502,1,53,1,1,27,6,...,40521.0,10938.351222,109.6,14.6,36480.185034,1000.0,81705.8,15469.988113,25697.2,1000.0


## Train Test Split

In [3]:
features = list(train_data.columns)
features.remove('status')
features.remove('loan_id')

x = train_data[features]
y = train_data['status']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

## Resampling

Our data analysis stage showed that our working dataset was heavily umbalanced.

Early exploratory analysis of classification methods proved that this was having a negative effect on the accuracy of the classifier, especially for the minority classes. To solve this problem we implemented resampling techniques that would generate a more balanced training set.

For oversampling we used the SMOTE (Synthetic Minority Over-sampling Technique) algorithm. This generates new samples interpolated from the existing ones.

In [4]:
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

Some of the algorithms we plan on using (KNN and SVM) require the data to be standardized. To do so, we used a StandardScaler from SciKit Learn's preprocessing library.

In [5]:
scaler = StandardScaler()

scaler.fit(x_train)
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

## Decision Tree Classifier

In [6]:
# Create Decision Tree classifer object
dtc = DecisionTreeClassifier(min_samples_leaf=10, random_state=0)

# Train Decision Tree Classifer
dtc.fit(x_train, y_train)

#Predict the response for test dataset
dtc_pred = dtc.predict_proba(x_test)[:, -1]

# Metrics
print("AUC Score: ", metrics.roc_auc_score(y_test, dtc_pred)) # Area Under the Curve
print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, dtc.predict(x_test))}\n")
print(f"Classification report:\n{metrics.classification_report(y_test, dtc.predict(x_test))}\n")

AUC Score:  0.5869047619047619
Confusion matrix:
[[12  3]
 [49 35]]

Classification report:
              precision    recall  f1-score   support

          -1       0.20      0.80      0.32        15
           1       0.92      0.42      0.57        84

    accuracy                           0.47        99
   macro avg       0.56      0.61      0.44        99
weighted avg       0.81      0.47      0.53        99




### Parameter Tuning

In [7]:
parameter_grid = {'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random']}

grid_search = GridSearchCV(DecisionTreeClassifier(),
                           param_grid=parameter_grid,
                           scoring='roc_auc',
                           refit="AUC",
                           cv=5,
                           verbose=4,
                           n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.8792307692307691
Best parameters: {'criterion': 'entropy', 'splitter': 'best'}
Best estimator: DecisionTreeClassifier(criterion='entropy')


In [8]:
dtc_grid = grid_search.best_estimator_
dtc_grid_pred = dtc_grid.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, dtc_grid_pred)
print("AUC Score: ", auc)

AUC Score:  0.6083333333333334


## Random Forest

In [9]:
rfc = RandomForestClassifier(random_state=0)

rfc.fit(x_train, y_train)

rfc_pred = rfc.predict_proba(x_test)[:, -1]

print("AUC Score: ", metrics.roc_auc_score(y_test, rfc_pred))
print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, rfc.predict(x_test))}\n")
print(f"Classification report:\n{metrics.classification_report(y_test, rfc.predict(x_test))}\n")

AUC Score:  0.709126984126984
Confusion matrix:
[[11  4]
 [37 47]]

Classification report:
              precision    recall  f1-score   support

          -1       0.23      0.73      0.35        15
           1       0.92      0.56      0.70        84

    accuracy                           0.59        99
   macro avg       0.58      0.65      0.52        99
weighted avg       0.82      0.59      0.64        99




### Parameter Tuning

In [10]:
# parameter_grid = {'bootstrap': [True, False],
#                     'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
#                     'max_features': ['auto', 'sqrt'],
#                     'min_samples_leaf': [1, 2, 4],
#                     'min_samples_split': [2, 5, 10],
#                     'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

# grid_search = GridSearchCV(RandomForestClassifier(),
#                            param_grid=parameter_grid,
#                            scoring='roc_auc',
#                            refit="AUC",
#                            cv=5,
#                            verbose=4,
#                            n_jobs=-1)

# grid_search.fit(x_train, y_train)

# print(f"Best score: {grid_search.best_score_}")
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best estimator: {grid_search.best_estimator_}")

In [11]:
# dtc_grid = grid_search.best_estimator_
# dtc_grid_pred = dtc_grid.predict_proba(x_test)[:, -1]

# auc = metrics.roc_auc_score(y_test, dtc_grid_pred)
# print("AUC Score: ", auc)

## KNN

In [12]:
knn = KNeighborsClassifier()

knn.fit(x_train, y_train)

knn_pred = knn.predict_proba(x_test)[:, -1]

print("AUC Score: ", metrics.roc_auc_score(y_test, knn_pred))
print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, knn.predict(x_test))}\n")
print(f"Classification report:\n{metrics.classification_report(y_test, knn.predict(x_test))}\n")

AUC Score:  0.6936507936507936
Confusion matrix:
[[12  3]
 [42 42]]

Classification report:
              precision    recall  f1-score   support

          -1       0.22      0.80      0.35        15
           1       0.93      0.50      0.65        84

    accuracy                           0.55        99
   macro avg       0.58      0.65      0.50        99
weighted avg       0.83      0.55      0.61        99




### Parameter Tuning

In [13]:
parameter_grid = {'n_neighbors': [5, 10, 15],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['ball_tree', 'kd_tree', 'brute']}

grid_search = GridSearchCV(KNeighborsClassifier(),
                           param_grid=parameter_grid,
                           scoring='roc_auc',
                           refit="AUC",
                           cv=5,
                           verbose=4,
                           n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score: 0.9716442307692308
Best parameters: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'distance'}
Best estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=10, weights='distance')


In [14]:
knn_grid = grid_search.best_estimator_
knn_grid_pred = knn_grid.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, knn_grid_pred)
print("AUC Score: ", auc)

AUC Score:  0.6833333333333333


## SVM

## AdaBoost

In [15]:
ada = AdaBoostClassifier(random_state=0)

ada.fit(x_train, y_train)

ada_pred = ada.predict_proba(x_test)[:, -1]

print("AUC Score: ", metrics.roc_auc_score(y_test, ada_pred))
print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, ada.predict(x_test))}\n")
print(f"Classification report:\n{metrics.classification_report(y_test, ada.predict(x_test))}\n")

AUC Score:  0.6849206349206349
Confusion matrix:
[[15  0]
 [53 31]]

Classification report:
              precision    recall  f1-score   support

          -1       0.22      1.00      0.36        15
           1       1.00      0.37      0.54        84

    accuracy                           0.46        99
   macro avg       0.61      0.68      0.45        99
weighted avg       0.88      0.46      0.51        99




In [16]:
parameter_grid={}

grid_search = GridSearchCV(AdaBoostClassifier(),
                              param_grid=parameter_grid,
                              scoring='roc_auc',
                              cv=5,
                              verbose=4,
                              n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best score: 0.9432980769230769
Best parameters: {}
Best estimator: AdaBoostClassifier()


In [17]:
ada_grid = grid_search.best_estimator_
ada_grid_pred = ada_grid.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, ada_grid_pred)
print("AUC Score: ", auc)

AUC Score:  0.6849206349206349


## Submission

In [18]:
x_real = test_data[features]

predictions = ada.predict_proba(x_real)[:, -1]

In [19]:
submission = pd.DataFrame()
submission['Id'] = test_data['loan_id']
submission['Predicted'] = predictions

submission.to_csv(date.today().strftime("%d-%m-%Y") + '.csv', index=False)

In [20]:
submission

Unnamed: 0,Id,Predicted
0,4962,0.811913
1,4967,0.654030
2,4968,0.811913
3,4986,0.811913
4,4988,0.811913
...,...,...
349,7279,0.811913
350,7286,0.811913
351,7292,0.811913
352,7294,0.811913
