In [22]:
# necessary libraries for prediciton
import utils
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Useful Functions

In [2]:
def pretty_matrix(matrix, row_label, col_label):
    """Pretty print of the given matrix """

    # Restraining labels that are too big
    row_label = [el[:10] + '..' if len(el) > 10 else el
                for el in row_label]
    col_label = [el[:10] + '..' if len(el) > 10 else el
                for el in col_label]

    # Stringfying everything & Joining top label
    s_matrix = [list([" "] + (col_label))] + \
               [[row_label[row_idx]] + \
                [str(e) for e in row] for row_idx, row in enumerate(matrix)]

    # Length of each matrix column
    len_s = [max(map(len, col)) for col in zip(*s_matrix)]

    # Cell formatation
    formatation = '\t'.join('{{:{}}}'.format(x) for x in len_s)

    # Apply cell formation to each matrix element
    pretty_mat = [formatation.format(*row) for row in s_matrix]

    # Print Pretty Matrix
    print('\n'.join(pretty_mat))


def display_confusion_matrix(values):
    '''Display the given array as a confusion matrix'''
    pretty_matrix([values[0:2], values[2:4]],
                  ['Actual NO', 'Actual YES'],
                  ['Predic NO', 'Predic YES'])

In [3]:
# Useful Macros
K_FOLD_NUM_SPLITS = 5
SEED = 42

# Prediction Algorithms

* Decision Tree
* Random Forest
* Gradient Boosting

In [4]:
def create_DT():
    '''Create a new Decision Tree'''
    # Useful DecisionTree tutorial:
    # https://www.datacamp.com/community/tutorials/decision-tree-classification-python
    return DecisionTreeClassifier()

In [5]:
def create_RF():
    '''Create a new Ranfom Forest model'''
    return RandomForestClassifier(n_estimators=10, max_depth=2, random_state=SEED)

In [6]:
def create_GB():
    '''Create a new Gradient Boosting model'''
    return GradientBoostingClassifier(n_estimators=10)

# Prediction

* Predictions are done in this notebook.
* It is also useful to compare how serveral algorithms perform against one another.

In [7]:
dataset =  utils.read_csv_to_df('dataset/preprocessed_data.csv')
display(dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments,status
0,0.0,96396,8033,2,0.054011,12250.0,20100.0,1100.0,20100.0,5025.0,...,9650,3.38,3.67,100,2985,2804,105,0.685425,1617,-1
1,0.004721,165960,4610,1,0.024623,52083.859459,120512.8,700.0,52208.9,13523.158824,...,8369,1.79,2.31,117,2854,2618,148,0.241205,3759,1
2,0.018096,127080,2118,1,0.020651,30060.954167,49590.4,800.0,20272.8,5009.733333,...,8390,2.28,2.89,132,2080,2122,170,0.922457,6272,1
3,0.022817,105804,2939,1,0.013503,41297.48,65898.5,1000.0,34307.3,9254.6,...,10045,1.42,1.71,135,6604,6295,185,0.840626,7106,1
4,0.049567,274740,4579,2,0.025417,57188.211111,122893.1,600.0,41112.9,21255.930769,...,8288,3.79,4.52,110,1562,1460,204,0.030136,3709,1


In [8]:
STATUS_COL = dataset.columns.get_loc("status")

In [20]:
# Setting X and Y
X = dataset.iloc[:, 0:STATUS_COL]
y = dataset.iloc[:, [STATUS_COL]]

# Hyper Parameter Tunning

## Grid Search

In [37]:
# Hyper Parameter Tunning - maximizing function => max(1 - (mean AUC - std AUC))
grid_param = {
    'n_estimators': [8, 9, 10, 11, 12],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'max_depth': [8, 9, 10, 11, 12, None]
}

def auc_scorer(y_true, y_pred):
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    return metrics.auc(fpr, tpr)

grid = GridSearchCV(estimator=create_RF(),
                    param_grid=grid_param,
                    scoring=metrics.make_scorer(auc_scorer,
                                                greater_is_better=True),
                    cv=K_FOLD_NUM_SPLITS,
                    n_jobs=-1)


grid_result = grid.fit(X, y)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  0.7332932814760478
Best Params:  {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 10}


  self.best_estimator_.fit(X, y, **fit_params)


## Random Search

In [None]:
random = RandomizedSearchCV(estimator=create_RF(),
                           param_distributions=grid_param,
                           scoring='roc_auc',
                           cv=K_FOLD_NUM_SPLITS,
                           n_jobs=-1,
                           n_iter=1000)

random_result = random.fit(X, y)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

In [38]:
# Cross validation settings
auc_scores = []
confusion_matrixes = []
cv = KFold(n_splits=K_FOLD_NUM_SPLITS, random_state=SEED, shuffle=False)

# CHANGE THIS LINE TO CHANGE THE USED CLASSIFICATION METHOD
#classifier = create_DT()
classifier = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10, n_estimators=10)

# Applying Cross validation
for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Training with this fold
    classifier.fit(X_train, y_train)
    
    # Testing & Measuring accuracy
    y_pred = classifier.predict(X_test)
    
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    auc_scores.append(metrics.auc(fpr, tpr))
    confusion_matrixes.append(metrics.confusion_matrix(y_test, y_pred).ravel())

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


In [39]:
# Printing the obtained results
print('Classification Method used:', classifier, '\n')
print('AUC scores:', auc_scores)
print('> Average: ', sum(auc_scores)/len(auc_scores))
for cf in confusion_matrixes:
    display_confusion_matrix(cf)

Classification Method used: RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False) 

AUC scores: [0.5935672514619883, 0.7916666666666666, 0.6788793103448276, 0.6988993710691824, 0.8]
> Average:  0.712602519908533
          	Predic NO	Predic YES
Actual NO 	2        	7         
Actual YES	2        	55        
          	Predic NO	Predic YES
Actual NO 	7        	5         
Actual YES	0        	54        
          	Predic NO	Predic YES
Actual NO 	3        	5         
Actual YES	1        	57        
          	Predic NO	Predic YES
Actual NO 	5        	7        

### After having our model trained we shall use the model on the data to be sumitted in the kaggle

In [40]:
test_dataset =  utils.read_csv_to_df('dataset/test_dataset.csv')
ids = utils.read_csv_to_df('dataset/ids.csv')
display(test_dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments,status
0,0.0,93960,1566,1,0.176892,54520.202247,88246.7,800.0,49548.5,12769.22,...,8390,2.28,2.89,132,2080,2122,452,0.389496,6824,
1,0.00142,260640,7240,1,0.139059,31518.182051,88731.8,-718.6,11565.4,9526.772414,...,8620,1.1,1.25,100,1089,1117,490,0.688919,1380,
2,0.007102,232560,4845,0,0.0,40175.6125,79286.6,200.0,45754.0,15302.486047,...,10045,1.42,1.71,135,6604,6295,630,0.907412,5200,
3,0.025568,221880,3698,1,0.339468,44440.912676,74216.8,1000.0,38913.4,11797.642857,...,8899,3.39,3.97,149,2987,2487,311,0.809056,5201,
4,0.025568,38520,3210,1,0.047035,20231.313158,31302.0,900.0,18914.3,2736.026923,...,8388,2.41,2.94,87,1658,1668,597,0.894527,5178,


In [41]:
# We now remove the Y column with NaNs
test_dataset = test_dataset.iloc[:, 0:STATUS_COL]
display(test_dataset.head())

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments
0,0.0,93960,1566,1,0.176892,54520.202247,88246.7,800.0,49548.5,12769.22,...,53.5,8390,2.28,2.89,132,2080,2122,452,0.389496,6824
1,0.00142,260640,7240,1,0.139059,31518.182051,88731.8,-718.6,11565.4,9526.772414,...,52.4,8620,1.1,1.25,100,1089,1117,490,0.688919,1380
2,0.007102,232560,4845,0,0.0,40175.6125,79286.6,200.0,45754.0,15302.486047,...,74.8,10045,1.42,1.71,135,6604,6295,630,0.907412,5200
3,0.025568,221880,3698,1,0.339468,44440.912676,74216.8,1000.0,38913.4,11797.642857,...,55.3,8899,3.39,3.97,149,2987,2487,311,0.809056,5201
4,0.025568,38520,3210,1,0.047035,20231.313158,31302.0,900.0,18914.3,2736.026923,...,59.1,8388,2.41,2.94,87,1658,1668,597,0.894527,5178


In [42]:
# Using the model to get the 'status' predictions
display(test_dataset)
predictions_df = test_dataset.copy()
predictions_df['Predicted'] = classifier.predict(test_dataset)
predictions_df = ids.merge(predictions_df, on=['date', 'amount'])
predictions_df = predictions_df[['loan_id', 'Predicted']]\
                    .rename(columns={
                        'loan_id': 'Id'
                    })\
                    .drop_duplicates()

display(predictions_df)

Unnamed: 0,date,amount,payments,frequency,account_creation_date,balance_mean,balance_max,balance_min,last_ballance,credit_mean,...,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96,loan_to_account_age_days,owner_age_on_loan,salary_over_payments
0,0.000000,93960,1566,1,0.176892,54520.202247,88246.7,800.0,49548.5,12769.220000,...,53.5,8390,2.28,2.89,132,2080,2122,452,0.389496,6824
1,0.001420,260640,7240,1,0.139059,31518.182051,88731.8,-718.6,11565.4,9526.772414,...,52.4,8620,1.10,1.25,100,1089,1117,490,0.688919,1380
2,0.007102,232560,4845,0,0.000000,40175.612500,79286.6,200.0,45754.0,15302.486047,...,74.8,10045,1.42,1.71,135,6604,6295,630,0.907412,5200
3,0.025568,221880,3698,1,0.339468,44440.912676,74216.8,1000.0,38913.4,11797.642857,...,55.3,8899,3.39,3.97,149,2987,2487,311,0.809056,5201
4,0.025568,38520,3210,1,0.047035,20231.313158,31302.0,900.0,18914.3,2736.026923,...,59.1,8388,2.41,2.94,87,1658,1668,597,0.894527,5178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,0.995739,352704,7348,1,0.831288,30797.487500,74693.6,800.0,35875.3,8806.779310,...,67.0,9104,1.51,2.07,123,2299,2354,513,0.688612,1756
350,0.995739,52512,4376,1,0.823108,59684.298851,120419.7,1000.0,41035.4,14839.705263,...,33.9,8743,1.88,2.43,111,3659,3894,521,0.823782,4367
351,0.997159,139488,5812,2,0.770961,46755.305769,95976.0,1100.0,47956.3,17969.630769,...,41.7,8980,1.95,2.21,111,2824,2813,573,0.269849,3168
352,0.997159,55632,2318,1,0.698364,29645.320175,52896.1,500.0,38449.2,7270.390000,...,89.9,10177,6.63,7.75,81,9878,10108,644,0.734016,7859


Unnamed: 0,Id,Predicted
0,5895,1
1,7122,-1
2,6173,1
3,6142,1
4,5358,1
...,...,...
361,4989,1
362,5221,1
363,6402,1
364,5346,1


In [43]:
# Outputting predictions to .csv
# CHANGE FILE NAME TO PRESERVE DIFFERENT INSTANCES
utils.write_df_to_csv(predictions_df, 'predictions', 'prediction.csv')