In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

# Prediction Algorithms

* Decision Tree
* Random Forest

In [3]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [4]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)


# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [5]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,loan_id,date,amount,duration,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,G,num_payments,monthly_loan,months_of_receiving,monthly_loan-to-monthly_receiving,monthly_only_receiving,status
0,5314,741830400000000000,96396,0.387097,8033,3.387097,0,1,1,0,0,12.0,249023.0,11.612903,31.0,93286.451613,-1
1,5316,742348800000000000,165960,1.16129,4610,4.774194,1,0,1,0,0,36.0,142910.0,34.83871,31.0,160606.451613,1
2,6863,743817600000000000,127080,1.935484,2118,5.483871,1,0,1,0,0,60.0,65658.0,58.064516,31.0,122980.645161,1
3,5325,744336000000000000,105804,1.16129,2939,5.967742,1,0,1,0,0,36.0,91109.0,34.83871,31.0,102390.967742,1
4,7240,747273600000000000,274740,1.935484,4579,6.580645,0,1,1,0,0,60.0,141949.0,58.064516,31.0,265877.419355,1


In [6]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [7]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [8]:
# Cross validation settings
f1_scores = []
accuracies = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()
# classifier = create_RF()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    f1_scores.append(metrics.f1_score(y_test, y_pred))
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

In [9]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('F1 score:', f1_scores)
print('\t > Average: ', sum(f1_scores)/len(f1_scores))
print('Accuracies: ', accuracies)
print('\t > Average:', sum(accuracies)/len(accuracies))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

F1 score: [0.8148148148148149, 0.7920792079207921, 0.8869565217391304, 0.9043478260869565, 0.9137931034482758]
	 > Average:  0.8623982948019939
Accuracies:  [0.696969696969697, 0.6818181818181818, 0.803030303030303, 0.8307692307692308, 0.8461538461538461]
	 > Average: 0.7717482517482518
          	Predic NO	Predic YES
Actual NO 	2        	7         
Actual YES	13       	44        
          	Predic NO	Predic YES
Actual NO 	5        	7         
Actual YES	14       	40        
          	Predic NO	Predic YES
Actual NO 	2        	6         
Actual YES	7        	

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [10]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
display(test_dataset.head())

Unnamed: 0,loan_id,date,amount,duration,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,G,num_payments,monthly_loan,months_of_receiving,monthly_loan-to-monthly_receiving,monthly_only_receiving,status
0,5895,852249600000000000,93960,1.935484,1566,14.580645,1,0,1,0,0,60.0,48546.0,58.064516,31.0,90929.032258,
1,7122,852336000000000000,260640,1.16129,7240,15.806452,1,0,1,0,0,36.0,224440.0,34.83871,31.0,252232.258065,
2,6173,852681600000000000,232560,1.548387,4845,20.322581,0,0,1,0,0,48.0,150195.0,46.451613,31.0,225058.064516,
3,6142,853804800000000000,221880,1.935484,3698,10.032258,1,0,2,0,0,60.0,114638.0,58.064516,31.0,214722.580645,
4,5358,853804800000000000,38520,0.387097,3210,19.258065,1,0,1,0,0,12.0,99510.0,11.612903,31.0,37277.419355,


In [11]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,loan_id,date,amount,duration,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,G,num_payments,monthly_loan,months_of_receiving,monthly_loan-to-monthly_receiving,monthly_only_receiving
0,5895,852249600000000000,93960,1.935484,1566,14.580645,1,0,1,0,0,60.0,48546.0,58.064516,31.0,90929.032258
1,7122,852336000000000000,260640,1.16129,7240,15.806452,1,0,1,0,0,36.0,224440.0,34.83871,31.0,252232.258065
2,6173,852681600000000000,232560,1.548387,4845,20.322581,0,0,1,0,0,48.0,150195.0,46.451613,31.0,225058.064516
3,6142,853804800000000000,221880,1.935484,3698,10.032258,1,0,2,0,0,60.0,114638.0,58.064516,31.0,214722.580645
4,5358,853804800000000000,38520,0.387097,3210,19.258065,1,0,1,0,0,12.0,99510.0,11.612903,31.0,37277.419355


In [12]:
# Using the model to get the 'status' predictions
predictions = classifier.predict(test_dataset)

# We must now associate each prediciton to the respective loan_id
predictions_df = test_dataset.loc[:, ['loan_id']]\
                             .rename(columns={'loan_id': 'Id'})
predictions_df['Predicted'] = predictions
display(predictions_df)

Unnamed: 0,Id,Predicted
0,5895,1
1,7122,1
2,6173,1
3,6142,1
4,5358,-1
...,...,...
349,4989,1
350,5221,-1
351,6402,1
352,5346,-1


In [13]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')