In [1]:
# Necessary libraries for prediction
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Prediction Algorithms

* Decision Tree

In [2]:
def create_DT(**tree_params):
    '''Create a new, unfitted Decision Tree classifier.

    Parameters
    ----------
    **tree_params
        Optional keyword arguments forwarded unchanged to
        ``DecisionTreeClassifier`` (for example ``random_state`` to make
        the tree reproducible, or ``max_depth`` to limit its size).
        With no arguments the library defaults are used, exactly as before.

    Returns
    -------
    DecisionTreeClassifier
        A fresh estimator that still has to be fitted by the caller.
    '''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier(**tree_params)

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how several algorithms perform against one another.

In [3]:
# Load the preprocessed training data and show it
train_csv_path = 'dataset/preprocessed_data.csv'
dataset = utils.read_csv_to_df(train_csv_path)
display(dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date,trans_id,trans_date,type,operation,trans_amount,balance,k_symbol
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,523621,930322,credit,credit in cash,1100.0,1100.0,Undefined
1,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524054,930421,credit,credit in cash,9900.0,11000.0,Undefined
2,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524055,930521,credit,credit in cash,5800.0,16800.0,Undefined
3,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524056,930620,credit,credit in cash,3300.0,20100.0,Undefined
4,5316,1801,930711,165960,36,4610,1,46,monthly issuance,930213,527445,930213,credit,credit in cash,700.0,700.0,Undefined
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24489,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424164,961206,withdrawal,remittance to another bank,129.0,39765.5,
24490,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424358,961207,withdrawal,withdrawal in cash,10400.0,29365.5,Undefined
24491,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424116,961207,withdrawal,remittance to another bank,330.0,29035.5,insurrance payment
24492,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424068,961208,withdrawal,remittance to another bank,56.0,28979.5,


In [4]:
# Useful constants shared by the cells below
STATUS_COL = 6  # positional index of the 'status' (label) column; columns before it are the features
K_FOLD_NUM_SPLITS = 5  # number of folds used for cross validation
SEED = 42  # fixed random seed, so that seeded steps stay reproducible

In [5]:
# Split the frame by column position:
#   X -> every column located before 'status' (the features)
#   y -> the 'status' column alone, kept as a one-column DataFrame
X = dataset.iloc[:, :STATUS_COL]
y = dataset.iloc[:, STATUS_COL:STATUS_COL + 1]

In [6]:
# Cross validation settings
f1_scores = []
accuracies = []
# Folds are kept contiguous on purpose (no shuffling): every loan spans several
# consecutive transaction rows that share the same loan-level feature values,
# so shuffling would scatter a single loan over both the train and test split.
# `random_state` only has a meaning together with shuffle=True, and recent
# scikit-learn releases raise a ValueError when it is combined with
# shuffle=False, so it is deliberately not passed here.
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold (fit() discards whatever was learnt previously)
    classifier.fit(X_train, y_train)

    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    f1_scores.append(metrics.f1_score(y_test, y_pred))
    accuracies.append(metrics.accuracy_score(y_test, y_pred))

# The loop leaves `classifier` fitted on the last fold's training split only.
# Refit on the complete labelled dataset so that the model used afterwards for
# the final predictions has seen every available training row.
classifier.fit(X, y)

In [7]:
# Report which estimator was evaluated, followed by the mean of each
# per-fold metric collected during cross validation
print('Classification Method used:', classifier, '\n')
for label, fold_scores in (('F1 score average: ', f1_scores),
                           ('Accuracy average: ', accuracies)):
    print(label, sum(fold_scores)/len(fold_scores))

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

F1 score average:  0.894728462723263
Accuracy average:  0.8180784638593004


### Now that the model is trained, we use it on the data to be submitted to Kaggle

In [8]:
# Load the competition data (its 'status' column is empty) and show it
test_csv_path = 'dataset/test_dataset.csv'
test_dataset = utils.read_csv_to_df(test_csv_path)
display(test_dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date,trans_id,trans_date,type,operation,trans_amount,balance,k_symbol,account
0,5895,4473,970103,93960,60,1566,,45,monthly issuance,951009,1309867,951009,credit,credit in cash,800.0,800.0,Undefined,
1,5895,4473,970103,93960,60,1566,,45,monthly issuance,951009,1309878,951107,credit,credit in cash,24425.0,25225.0,Undefined,
2,5895,4473,970103,93960,60,1566,,45,monthly issuance,951009,1310161,951108,credit,credit in cash,500.0,25725.0,Undefined,
3,5895,4473,970103,93960,60,1566,,45,monthly issuance,951009,3664951,951130,credit,,82.2,25807.2,interest credited,
4,5895,4473,970103,93960,60,1566,,45,monthly issuance,951009,1309879,951207,credit,credit in cash,36637.5,62444.7,Undefined,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30195,6748,8645,981208,240900,60,4015,,60,monthly issuance,970623,2616397,981115,withdrawal,withdrawal in cash,1800.0,28535.7,Undefined,
30196,6748,8645,981208,240900,60,4015,,60,monthly issuance,970623,3502674,981130,credit,,105.8,28747.4,interest credited,
30197,6748,8645,981208,240900,60,4015,,60,monthly issuance,970623,2616380,981130,withdrawal,withdrawal in cash,14.6,28732.8,payment for statement,
30198,6748,8645,981208,240900,60,4015,,60,monthly issuance,970623,3476398,981130,credit,,105.8,28641.6,interest credited,


In [9]:
# Keep only the leading feature columns: this drops the 'status' column
# (all NaN in this file) together with every column located after it
test_dataset = test_dataset.iloc[:, :STATUS_COL]
display(test_dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments
0,5895,4473,970103,93960,60,1566
1,5895,4473,970103,93960,60,1566
2,5895,4473,970103,93960,60,1566
3,5895,4473,970103,93960,60,1566
4,5895,4473,970103,93960,60,1566
...,...,...,...,...,...,...
30195,6748,8645,981208,240900,60,4015
30196,6748,8645,981208,240900,60,4015
30197,6748,8645,981208,240900,60,4015
30198,6748,8645,981208,240900,60,4015


In [10]:
# Using the model to get the 'status' predictions (one per row of the test frame)
predictions = classifier.predict(test_dataset)

# We must now associate each prediction to the respective loan_id:
# keep the loan_id column, expose it as 'Id' and attach the predicted labels.
# NOTE(review): there is one row per transaction, so the same Id appears
# several times in the result - confirm the submission format accepts that.
predictions_df = (
    test_dataset[['loan_id']]
    .rename(columns={'loan_id': 'Id'})
    .assign(Predicted=predictions)
)
display(predictions_df)

Unnamed: 0,Id,Predicted
0,5895,1
1,5895,1
2,5895,1
3,5895,1
4,5895,1
...,...,...
30195,6748,1
30196,6748,1
30197,6748,1
30198,6748,1


In [11]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
# NOTE(review): presumably the second argument is the output directory and the
# third one the file name (i.e. predictions/prediction.csv) - confirm in utils.
# An existing file with the same name is likely overwritten, hence the reminder above.
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')