In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Prediction Algorithms

* Decision Tree

In [2]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [3]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset)

Unnamed: 0,loan_id,amount,duration,payments,account_age,frequency_monthly issuance,frequency_weekly issuance,status
0,5314,96396,12,8033,105,0,1,-1
1,5316,165960,36,4610,148,1,0,1
2,6863,127080,60,2118,170,1,0,1
3,5325,105804,36,2939,185,1,0,1
4,7240,274740,60,4579,204,0,1,1
...,...,...,...,...,...,...,...,...
323,6818,155616,48,3242,691,1,0,1
324,5625,222180,60,3703,382,1,0,-1
325,6805,45024,48,938,214,1,0,1
326,7233,115812,36,3217,585,1,0,1


In [4]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [5]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [6]:
# Cross validation settings
f1_scores = []
accuracies = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    f1_scores.append(metrics.f1_score(y_test, y_pred))
    accuracies.append(metrics.accuracy_score(y_test, y_pred))

In [7]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('F1 score:', f1_scores)
print('\t > Average: ', sum(f1_scores)/len(f1_scores))
print('Accuracies: ', accuracies)
print('\t > Average:', sum(accuracies)/len(accuracies))

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

F1 score: [0.9075630252100839, 0.8113207547169811, 0.8869565217391304, 0.8256880733944955, 0.9152542372881356]
	 > Average:  0.8693565224697654
Accuracies:  [0.8333333333333334, 0.696969696969697, 0.803030303030303, 0.7076923076923077, 0.8461538461538461]
	 > Average: 0.7774358974358975


### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [8]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
display(test_dataset)

Unnamed: 0,loan_id,amount,duration,payments,account_age,frequency_monthly issuance,frequency_weekly issuance,status
0,5895,93960,60,1566,452,1,0,
1,7122,260640,36,7240,490,1,0,
2,6173,232560,48,4845,630,0,0,
3,6142,221880,60,3698,311,1,0,
4,5358,38520,12,3210,597,1,0,
...,...,...,...,...,...,...,...,...
349,4989,352704,48,7348,513,1,0,
350,5221,52512,12,4376,521,1,0,
351,6402,139488,24,5812,573,0,1,
352,5346,55632,24,2318,644,1,0,


In [9]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset)

Unnamed: 0,loan_id,amount,duration,payments,account_age,frequency_monthly issuance,frequency_weekly issuance
0,5895,93960,60,1566,452,1,0
1,7122,260640,36,7240,490,1,0
2,6173,232560,48,4845,630,0,0
3,6142,221880,60,3698,311,1,0
4,5358,38520,12,3210,597,1,0
...,...,...,...,...,...,...,...
349,4989,352704,48,7348,513,1,0
350,5221,52512,12,4376,521,1,0
351,6402,139488,24,5812,573,0,1
352,5346,55632,24,2318,644,1,0


In [10]:
# Using the model to get the 'status' predictions
predictions = classifier.predict(test_dataset)

# We must now associate each prediciton to the respective loan_id
predictions_df = test_dataset.loc[:, ['loan_id']]\
                             .rename(columns={'loan_id': 'Id'})
predictions_df['Predicted'] = predictions
display(predictions_df)

Unnamed: 0,Id,Predicted
0,5895,1
1,7122,1
2,6173,1
3,6142,-1
4,5358,1
...,...,...
349,4989,1
350,5221,1
351,6402,1
352,5346,1


In [11]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')