In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

# Prediction Algorithms

* Decision Tree
* Random Forest

In [3]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [4]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)


# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [5]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,date,amount,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,monthly_loan,monthly_loan-to-monthly_receiving,monthly_only_receiving,status
0,0.0,0.171345,0.823266,3.387097,0,1,1,0,0.823266,1.0,0.171345,-1
1,0.004721,0.301732,0.457951,4.774194,1,0,1,0,0.457951,1.0,0.301732,1
2,0.018096,0.228857,0.191996,5.483871,1,0,1,0,0.191996,1.0,0.228857,1
3,0.022817,0.188979,0.279616,5.967742,1,0,1,0,0.279616,1.0,0.188979,1
4,0.049567,0.505623,0.454642,6.580645,0,1,1,0,0.454642,1.0,0.505623,1


In [6]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [7]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [8]:
# Cross validation settings
f1_scores = []
accuracies = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()
# classifier = create_RF()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    f1_scores.append(metrics.f1_score(y_test, y_pred))
    accuracies.append(metrics.accuracy_score(y_test, y_pred))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

In [9]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('F1 score:', f1_scores)
print('\t > Average: ', sum(f1_scores)/len(f1_scores))
print('Accuracies: ', accuracies)
print('\t > Average:', sum(accuracies)/len(accuracies))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

F1 score: [0.8440366972477065, 0.830188679245283, 0.9166666666666667, 0.9043478260869565, 0.8928571428571429]
	 > Average:  0.8776194024207513
Accuracies:  [0.7424242424242424, 0.7272727272727273, 0.8484848484848485, 0.8307692307692308, 0.8153846153846154]
	 > Average: 0.792867132867133
          	Predic NO	Predic YES
Actual NO 	3        	6         
Actual YES	11       	46        
          	Predic NO	Predic YES
Actual NO 	4        	8         
Actual YES	10       	44        
          	Predic NO	Predic YES
Actual NO 	1        	7         
Actual YES	3        	

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [10]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('competition_dataset/loan_test.csv', delimiter=';')
display(test_dataset.head())

Unnamed: 0,date,amount,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,monthly_loan,monthly_loan-to-monthly_receiving,monthly_only_receiving,status
0,0.0,0.151641,0.131376,14.580645,1,0,1,0,0.131376,1.0,0.151641,
1,0.00142,0.436237,0.722049,15.806452,1,0,1,0,0.722049,0.0,0.436237,
2,0.007102,0.388292,0.472725,20.322581,0,0,1,0,0.472725,1.0,0.388292,
3,0.025568,0.370057,0.353321,10.032258,1,0,2,0,0.353321,1.0,0.370057,
4,0.025568,0.056981,0.302519,19.258065,1,0,1,0,0.302519,1.0,0.056981,


In [11]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,date,amount,payments,account_loan_age,frequency_MI,frequency_WI,owners_count,C,monthly_loan,monthly_loan-to-monthly_receiving,monthly_only_receiving
0,0.0,0.151641,0.131376,14.580645,1,0,1,0,0.131376,1.0,0.151641
1,0.00142,0.436237,0.722049,15.806452,1,0,1,0,0.722049,0.0,0.436237
2,0.007102,0.388292,0.472725,20.322581,0,0,1,0,0.472725,1.0,0.388292
3,0.025568,0.370057,0.353321,10.032258,1,0,2,0,0.353321,1.0,0.370057
4,0.025568,0.056981,0.302519,19.258065,1,0,1,0,0.302519,1.0,0.056981


In [12]:
# Using the model to get the 'status' predictions
predictions_df = pd.DataFrame({'Id': ids['loan_id'].array,
                            'Predicted': classifier.predict(test_dataset)})

display(predictions_df)

Unnamed: 0,Id,Predicted
0,5895,1
1,7122,1
2,6173,1
3,6142,1
4,5358,1
...,...,...
349,4989,1
350,5221,1
351,6402,1
352,5346,1


In [13]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')