In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

# Prediction Algorithms

* Decision Tree
* Random Forest

In [3]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [4]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [5]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,salary_over_payments,status
0,0.0,96396,12,8033,0.054011,0,1,12250.0,20100.0,1100.0,...,100,2985,2804,1,1,1,0.272076,105,1617,-1
1,0.262785,52128,24,2172,0.19301,1,0,33459.680282,59944.2,144.2,...,100,2985,2804,1,1,1,0.533465,264,7478,1
2,0.004721,165960,36,4610,0.024623,1,0,52083.859459,120512.8,700.0,...,117,2854,2618,1,1,0,0.730073,148,3759,1
3,0.018096,127080,60,2118,0.020651,1,0,30060.954167,49590.4,800.0,...,132,2080,2122,1,1,0,0.029255,170,6272,1
4,0.306845,74736,36,2076,0.130262,1,0,37912.998507,62084.0,700.0,...,132,2080,2122,1,1,0,0.670547,399,6314,1


In [6]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [7]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [8]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()
# classifier = create_RF()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

In [9]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

AUC scores: [0.509090909090909, 0.5732758620689655, 0.8836065573770492, 0.8621031746031745, 0.6923076923076923]
> Average:  0.7040768390895581
          	Predic NO	Predic YES
Actual NO 	2        	9         
Actual YES	9        	46        
          	Predic NO	Predic YES
Actual NO 	2        	6         
Actual YES	6        	52        
          	Predic NO	Predic YES
Actual NO 	4        	1         
Actual YES	2        	59        
          	Predic NO	Predic YES
Actual NO 	7        	2         
Actual YES	3        	53        
          	Predic NO	Predic YES
Actual

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [10]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('dataset/ids.csv')
display(test_dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,salary_over_payments,status
0,0.0,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,132,2080,2122,1,1,0,0.591135,452,6824,
1,0.664773,50976,36,1416,0.610429,1,0,28050.299187,48735.9,400.0,...,132,2080,2122,1,1,0,0.980347,496,6974,
2,0.761364,184620,60,3077,0.567485,1,0,34785.534513,65517.7,1100.0,...,132,2080,2122,1,1,1,0.620033,606,5313,
3,0.00142,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,100,1089,1117,1,1,1,0.292414,490,1380,
4,0.52983,136368,24,5682,0.754601,1,0,60593.840678,119446.2,600.0,...,100,1089,1117,1,1,0,0.749709,260,2938,


In [11]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,salary_over_payments
0,0.0,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,2.89,132,2080,2122,1,1,0,0.591135,452,6824
1,0.664773,50976,36,1416,0.610429,1,0,28050.299187,48735.9,400.0,...,2.89,132,2080,2122,1,1,0,0.980347,496,6974
2,0.761364,184620,60,3077,0.567485,1,0,34785.534513,65517.7,1100.0,...,2.89,132,2080,2122,1,1,1,0.620033,606,5313
3,0.00142,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,1.25,100,1089,1117,1,1,1,0.292414,490,1380
4,0.52983,136368,24,5682,0.754601,1,0,60593.840678,119446.2,600.0,...,1.25,100,1089,1117,1,1,0,0.749709,260,2938


In [12]:
# Using the model to get the 'status' predictions
display(test_dataset)
predictions_df = test_dataset.copy()
predictions_df['Predicted'] = classifier.predict(test_dataset)
predictions_df = ids.merge(predictions_df, on=['date', 'amount'])
predictions_df = predictions_df[['loan_id', 'Predicted']]\
                    .rename(columns={
                        'loan_id': 'Id'
                    })\
                    .drop_duplicates()

display(predictions_df)

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,salary_over_payments
0,0.000000,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,2.89,132,2080,2122,1,1,0,0.591135,452,6824
1,0.664773,50976,36,1416,0.610429,1,0,28050.299187,48735.9,400.0,...,2.89,132,2080,2122,1,1,0,0.980347,496,6974
2,0.761364,184620,60,3077,0.567485,1,0,34785.534513,65517.7,1100.0,...,2.89,132,2080,2122,1,1,1,0.620033,606,5313
3,0.001420,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,1.25,100,1089,1117,1,1,1,0.292414,490,1380
4,0.529830,136368,24,5682,0.754601,1,0,60593.840678,119446.2,600.0,...,1.25,100,1089,1117,1,1,0,0.749709,260,2938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,0.897727,39168,24,1632,0.930470,1,0,55438.851852,81495.6,300.0,...,2.87,130,4289,4846,1,1,1,0.965224,347,8288
350,0.913352,38496,12,3208,0.897751,1,0,42732.336111,92238.0,800.0,...,2.87,130,4289,4846,1,1,1,0.628360,390,6712
351,0.970170,99744,24,4156,0.802658,0,1,40126.968627,79967.9,700.0,...,2.87,130,4289,4846,1,1,1,0.797771,523,5764
352,0.889205,253560,60,4226,0.619632,1,0,39822.711111,76509.7,900.0,...,9.40,90,4947,4743,1,1,1,0.346293,645,6220


Unnamed: 0,Id,Predicted
0,5895,1
1,5172,1
2,6207,1
3,7122,-1
4,7067,1
...,...,...
361,7294,1
362,6321,1
363,6469,1
364,5614,1


In [13]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')