# Models Evaluation

1. Create models
2. Compare models
3. Export .csv file with results

In [25]:
# Load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from imblearn.over_sampling import SMOTE

In [2]:
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")

train_data

Unnamed: 0,loan_id,amount_loan,duration,payments,status,age_clt,gender_clt,frequency,age_acc,type_trans_count_withdrawal,...,amount_trans_last,amount_trans_abs_min,amount_trans_rangev,balance_mean,balance_min,balance_max,balance_std,balance_last,balance_abs_min,balance_rangev
0,4959,80952,24,3373,1,76,1,1,28,32,...,138.3,13.5,30340.5,32590.624074,1100.0,67529.6,12061.705682,27855.2,1100.0,66429.6
1,4961,30276,12,2523,-1,82,0,1,26,34,...,15139.0,14.6,22693.4,25197.092500,715.0,58157.5,15039.248405,15854.0,715.0,57442.5
2,4973,165960,24,6915,1,77,0,1,27,88,...,114.1,14.6,62220.4,52523.244800,700.0,107069.6,20955.646998,23703.8,700.0,106369.6
3,4996,88440,12,7370,1,76,0,0,25,15,...,282.6,100.0,47876.0,62778.090323,200.0,103239.0,21638.258870,79007.6,200.0,103039.0
4,5002,104808,12,8734,1,81,1,1,27,18,...,3900.0,14.6,25955.4,38709.830000,500.0,57865.3,11517.175248,28015.4,500.0,57365.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,7271,392460,60,6541,1,41,0,1,26,97,...,216.2,14.6,58855.4,57131.740559,900.0,115913.4,20293.363480,53259.7,900.0,115013.4
324,7284,52788,12,4399,1,48,1,1,28,22,...,1900.0,14.6,17905.4,22198.179070,1000.0,41469.1,6652.642956,21029.0,1000.0,40469.1
325,7304,419880,60,6998,1,76,0,2,26,7,...,300.0,39.0,64761.0,59352.833333,200.0,104039.9,27879.396857,24704.4,200.0,103839.9
326,7305,54024,12,4502,1,53,1,1,27,102,...,109.6,14.6,40506.4,36480.185034,1000.0,81705.8,15469.988113,25697.2,1000.0,80705.8


In [3]:
features = list(train_data.columns)
features.remove('status')
features.remove('loan_id')

x = train_data[features]
y = train_data['status']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) 

In [27]:
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

396

## Decision Tree Classifier

In [5]:
# Create Decision Tree classifer object
dtc = DecisionTreeClassifier(min_samples_leaf=10, random_state=0)

# Train Decision Tree Classifer
dtc.fit(x_train, y_train)

#Predict the response for test dataset
dtc_pred = dtc.predict_proba(x_test)[:, -1]

# Area Under the Curve
auc = metrics.roc_auc_score(y_test, dtc_pred)
print("AUC Score: ", auc)

AUC Score:  0.7365079365079366


### Parameter Tuning

In [6]:
parameter_grid = {'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random']}

grid_search = GridSearchCV(DecisionTreeClassifier(),
                           param_grid=parameter_grid,
                           scoring='roc_auc',
                           refit="AUC",
                           cv=10,
                           verbose=4,
                           n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best score: 0.7064035087719299
Best parameters: {'criterion': 'gini', 'splitter': 'best'}
Best estimator: DecisionTreeClassifier()


In [7]:
dtc_grid = grid_search.best_estimator_
dtc_grid_pred = dtc_grid.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, dtc_grid_pred)
print("AUC Score: ", auc)

AUC Score:  0.6821428571428572


## KNN

In [8]:
knn = KNeighborsClassifier()

knn.fit(x_train, y_train)

knn_pred = knn.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, knn_pred)
print("AUC Score: ", auc)

AUC Score:  0.6095238095238096


### Parameter Tuning

In [9]:
parameter_grid = {'n_neighbors': [5, 10, 15],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['ball_tree', 'kd_tree', 'brute']}

grid_search = GridSearchCV(KNeighborsClassifier(),
                           param_grid=parameter_grid,
                           scoring='roc_auc',
                           refit="AUC",
                           cv=10,
                           verbose=4,
                           n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best estimator: {grid_search.best_estimator_}")

Fitting 10 folds for each of 18 candidates, totalling 180 fits
Best score: 0.6998245614035088
Best parameters: {'algorithm': 'ball_tree', 'n_neighbors': 5, 'weights': 'distance'}
Best estimator: KNeighborsClassifier(algorithm='ball_tree', weights='distance')


In [10]:
knn_grid = grid_search.best_estimator_
knn_grid_pred = knn_grid.predict_proba(x_test)[:, -1]

auc = metrics.roc_auc_score(y_test, knn_grid_pred)
print("AUC Score: ", auc)

AUC Score:  0.6269841269841271


## SVM

## Submission

In [11]:
x_real = test_data[features]

predictions = dtc.predict_proba(x_real)[:, -1]

In [12]:
submission = pd.DataFrame()
submission['Id'] = test_data['loan_id']
submission['Predicted'] = predictions

submission.to_csv("submission.csv", index=False)

In [13]:
submission

Unnamed: 0,Id,Predicted
0,4962,1.000000
1,4967,0.100000
2,4968,1.000000
3,4986,1.000000
4,4988,0.333333
...,...,...
349,7279,1.000000
350,7286,0.727273
351,7292,0.857143
352,7294,1.000000


[CV 6/10] END ....criterion=gini, splitter=best;, score=0.592 total time=   0.1s
[CV 9/10] END ....criterion=gini, splitter=best;, score=0.849 total time=   0.1s
[CV 7/10] END ..criterion=gini, splitter=random;, score=0.542 total time=   0.0s
[CV 4/10] END .criterion=entropy, splitter=best;, score=0.500 total time=   0.0s
[CV 2/10] END criterion=entropy, splitter=random;, score=0.642 total time=   0.0s
[CV 10/10] END criterion=entropy, splitter=random;, score=0.588 total time=   0.0s
[CV 8/10] END algorithm=ball_tree, n_neighbors=5, weights=uniform;, score=0.958 total time=   0.0s
[CV 6/10] END algorithm=ball_tree, n_neighbors=5, weights=distance;, score=0.517 total time=   0.0s
[CV 1/10] END algorithm=ball_tree, n_neighbors=10, weights=distance;, score=0.700 total time=   0.0s
[CV 2/10] END algorithm=ball_tree, n_neighbors=10, weights=distance;, score=0.633 total time=   0.0s
[CV 7/10] END algorithm=ball_tree, n_neighbors=15, weights=uniform;, score=0.642 total time=   0.0s
[CV 8/10] 