In [1]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

# Prediction Algorithms

* Decision Tree
* Random Forest

In [3]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [4]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [5]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,operation_CCW,mean_trans_profit,total_ops,total_trans,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,status
0,0.0,96396,12,8033,0.054011,0,1,12250.0,20100.0,1100.0,...,0,1194.2364,4,67.0,1,1,1,0.272076,105,-1
1,0.004721,165960,36,4610,0.024623,1,0,52083.859459,120512.8,700.0,...,0,4638.918824,32,37.0,1,1,0,0.730073,148,1
2,0.018096,127080,60,2118,0.020651,1,0,30060.954167,49590.4,800.0,...,0,-1087.266667,16,24.0,1,1,0,0.029255,170,1
3,0.022817,105804,36,2939,0.013503,1,0,41297.48,65898.5,1000.0,...,0,2086.5,20,25.0,1,1,1,0.113917,185,1
4,0.049567,274740,60,4579,0.025417,0,1,57188.211111,122893.1,600.0,...,0,4454.930769,21,27.0,1,1,0,0.950922,204,1


In [6]:
# Useful MACROS
STATUS_COL = dataset.columns.get_loc("status")
K_FOLD_NUM_SPLITS = 5
SEED = 42

In [7]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

In [8]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
classifier = create_DT()
# classifier = create_RF()

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

In [9]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best') 

AUC scores: [0.5233918128654971, 0.6712962962962963, 0.8146551724137931, 0.5966981132075472, 0.6583333333333332]
> Average:  0.6528749456232934
          	Predic NO	Predic YES
Actual NO 	2        	7         
Actual YES	10       	47        
          	Predic NO	Predic YES
Actual NO 	5        	7         
Actual YES	4        	50        
          	Predic NO	Predic YES
Actual NO 	6        	2         
Actual YES	7        	51        
          	Predic NO	Predic YES
Actual NO 	3        	9         
Actual YES	3        	50        
          	Predic NO	Predic YES
Actua

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [10]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('dataset/ids.csv')
display(test_dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,operation_CCW,mean_trans_profit,total_ops,total_trans,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age,status
0,0.0,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,0,7116.203051,75,89.0,1,1,0,0.591135,452,
1,0.00142,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,0,4124.511189,63,78.0,1,1,1,0.292414,490,
2,0.007102,232560,48,4845,0.0,0,0,40175.6125,79286.6,200.0,...,0,10699.080031,156,176.0,1,1,1,0.074634,630,
3,0.025568,221880,60,3698,0.339468,1,0,44440.912676,74216.8,1000.0,...,0,7620.902857,62,71.0,2,2,1,0.173575,311,
4,0.025568,38520,12,3210,0.047035,1,0,20231.313158,31302.0,900.0,...,0,667.010794,95,114.0,1,1,0,0.088288,597,


In [11]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,operation_RAB,operation_CCW,mean_trans_profit,total_ops,total_trans,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age
0,0.0,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,19,0,7116.203051,75,89.0,1,1,0,0.591135,452
1,0.00142,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,23,0,4124.511189,63,78.0,1,1,1,0.292414,490
2,0.007102,232560,48,4845,0.0,0,0,40175.6125,79286.6,200.0,...,61,0,10699.080031,156,176.0,1,1,1,0.074634,630
3,0.025568,221880,60,3698,0.339468,1,0,44440.912676,74216.8,1000.0,...,24,0,7620.902857,62,71.0,2,2,1,0.173575,311
4,0.025568,38520,12,3210,0.047035,1,0,20231.313158,31302.0,900.0,...,16,0,667.010794,95,114.0,1,1,0,0.088288,597


In [12]:
# Using the model to get the 'status' predictions
display(test_dataset)
predictions_df = test_dataset.copy()
predictions_df['Predicted'] = classifier.predict(test_dataset)
predictions_df = ids.merge(predictions_df, on=['date', 'amount'])
predictions_df = predictions_df[['loan_id', 'Predicted']]\
                    .rename(columns={
                        'loan_id': 'Id'
                    })\
                    .drop_duplicates()

display(predictions_df)

Unnamed: 0,date,amount,duration,payments,account_creation_date,frequency_MI,frequency_WI,balance_mean,balance_max,balance_min,...,operation_RAB,operation_CCW,mean_trans_profit,total_ops,total_trans,owner_count,disponent_count,owner_gender,owner_birthdate,loan_to_account_age
0,0.000000,93960,60,1566,0.176892,1,0,54520.202247,88246.7,800.0,...,19,0,7116.203051,75,89.0,1,1,0,0.591135,452
1,0.001420,260640,36,7240,0.139059,1,0,31518.182051,88731.8,-718.6,...,23,0,4124.511189,63,78.0,1,1,1,0.292414,490
2,0.007102,232560,48,4845,0.000000,0,0,40175.612500,79286.6,200.0,...,61,0,10699.080031,156,176.0,1,1,1,0.074634,630
3,0.025568,221880,60,3698,0.339468,1,0,44440.912676,74216.8,1000.0,...,24,0,7620.902857,62,71.0,2,2,1,0.173575,311
4,0.025568,38520,12,3210,0.047035,1,0,20231.313158,31302.0,900.0,...,16,0,667.010794,95,114.0,1,1,0,0.088288,597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,0.995739,352704,48,7348,0.831288,1,0,30797.487500,74693.6,800.0,...,0,0,465.253384,41,56.0,1,1,1,0.335578,513
350,0.995739,52512,12,4376,0.823108,1,0,59684.298851,120419.7,1000.0,...,0,0,4168.815467,74,87.0,1,1,0,0.200698,521
351,0.997159,139488,24,5812,0.770961,0,1,46755.305769,95976.0,1100.0,...,0,0,7925.646154,86,104.0,1,1,0,0.753505,573
352,0.997159,55632,24,2318,0.698364,1,0,29645.320175,52896.1,500.0,...,0,0,2191.168125,93,114.0,1,1,0,0.290332,644


Unnamed: 0,Id,Predicted
0,5895,1
1,7122,-1
2,6173,1
3,6142,1
4,5358,1
...,...,...
361,4989,1
362,5221,1
363,6402,1
364,5346,1


In [13]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')