In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Prediction Algorithms

* Decision Tree

In [2]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [3]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1
...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1
324,5625,3189,961215,222180,60,3703,-1
325,6805,8972,961221,45024,48,938,1
326,7233,10963,961225,115812,36,3217,1


In [4]:
# Useful MACROS
STATUS_COL = 6
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [5]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [6]:
# Cross validation settings
f1_scores = []
accuracies = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    f1_scores.append(metrics.f1_score(y_test, y_pred))
    accuracies.append(metrics.accuracy_score(y_test, y_pred))

In [7]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('F1 score average: ', sum(f1_scores)/len(f1_scores))
print('Accuracy average: ', sum(accuracies)/len(accuracies))

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

F1 score average:  0.8479994806594744
Accuracy average:  0.747039627039627


### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [8]:
test_dataset =  utils.read_csv_to_df('competition_dataset/loan_test.csv', delimiter=';')
display(test_dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5895,4473,970103,93960,60,1566,
1,7122,10365,970104,260640,36,7240,
2,6173,5724,970108,232560,48,4845,
3,6142,5591,970121,221880,60,3698,
4,5358,2018,970121,38520,12,3210,
...,...,...,...,...,...,...,...
349,4989,105,981205,352704,48,7348,
350,5221,1284,981205,52512,12,4376,
351,6402,6922,981206,139488,24,5812,
352,5346,1928,981206,55632,24,2318,


In [9]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments
0,5895,4473,970103,93960,60,1566
1,7122,10365,970104,260640,36,7240
2,6173,5724,970108,232560,48,4845
3,6142,5591,970121,221880,60,3698
4,5358,2018,970121,38520,12,3210
...,...,...,...,...,...,...
349,4989,105,981205,352704,48,7348
350,5221,1284,981205,52512,12,4376
351,6402,6922,981206,139488,24,5812
352,5346,1928,981206,55632,24,2318


In [10]:
# Using the model to get the 'status' predictions
predictions = classifier.predict(test_dataset)

# We must now associate each prediciton to the respective loan_id
predictions_df = test_dataset.loc[:, ['loan_id']]\
                    .rename(columns={'loan_id': 'Id'})
predictions_df['Predicted'] = predictions
display(predictions_df)

Unnamed: 0,Id,Predicted
0,5895,1
1,7122,-1
2,6173,1
3,6142,1
4,5358,1
...,...,...
349,4989,1
350,5221,1
351,6402,1
352,5346,1


In [11]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')