# Objective

# Potential improvements

- Include more parameters as part of the Hyper Parameter Tuning process

# Preparation

## Load libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import preprocessing
from sklearn import metrics as met
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from scipy.stats import randint
import os
import errno

In [2]:
# Load the pycodestyle_magic extension and switch on flake8 linting, which
# reports style findings (e.g. E501) underneath each executed cell
%load_ext pycodestyle_magic
%flake8_on
# Render matplotlib figures inline in the notebook output
%matplotlib inline

### Create folder structure

In [3]:
# Move the working directory one level up so the relative paths below resolve.
# NOTE(review): chdir is not idempotent - re-running this cell moves up one
# more level each time; presumably the notebook lives one level below the
# project root - confirm.
os.chdir('..')
# Not referenced by any other visible cell
output_folder = './reports/figures/'
# Folder the train/test CSV files are read from
cleaned_folder = './data/processed/'
# Folder the submission file is written to
external_data = './data/external/'

# Import data

In [4]:
# Read the train/test features and labels, the frame to predict on (test_df)
# and its ids (test_ids_df) from the processed-data folder
x_train = pd.read_csv(cleaned_folder+'x_train.csv')
x_test = pd.read_csv(cleaned_folder+'x_test.csv')
y_train = pd.read_csv(cleaned_folder+'y_train.csv')
y_test = pd.read_csv(cleaned_folder+'y_test.csv')
test_df = pd.read_csv(cleaned_folder+'test_df.csv')
test_ids_df = pd.read_csv(cleaned_folder+'test_ids_df.csv')

In [5]:
# Reduce the label frames to their 'TARGET' column
y_test = y_test['TARGET']
y_train = y_train['TARGET']

# Balance dataset

In [6]:
# Inspect the class distribution of the outcome variable (absolute counts)
# to see whether the training set is balanced
y_train.value_counts(normalize=False)

0.0    168267
1.0     15238
Name: TARGET, dtype: int64

In [7]:
# Merge outcome variable & features column-wise (axis=1) into one frame so
# rows can be filtered and resampled together with their label
train_df = pd.concat([x_train, y_train], axis=1)

In [8]:
# Separate dataset based on outcome variable:
# TARGET == 0 -> no_pay_prob, TARGET == 1 -> pay_prob
no_pay_prob = train_df[train_df['TARGET'] == 0]
pay_prob = train_df[train_df['TARGET'] == 1]

In [9]:
# Upsample - artificially add customers with payment difficulties by
# sampling them with replacement until their number matches the customers
# without payment problems; random_state makes the draw repeatable
pay_prob2 = resample(pay_prob,
                     replace=True,
                     n_samples=len(no_pay_prob),
                     random_state=18)

4:80: E501 line too long (103 > 79 characters)


In [10]:
# Shape of the subset without payment difficulties (TARGET == 0)
no_pay_prob.shape

(168267, 27)

In [11]:
# Shape of the upsampled subset with payment difficulties (TARGET == 1)
pay_prob2.shape

(168267, 27)

In [12]:
# Combine dataset with added cases (upsampled TARGET == 1 rows first)
# NOTE(review): concat is called without ignore_index, so presumably the
# row labels of resampled rows repeat in the result - confirm before any
# label-based alignment
train_df = pd.concat([pay_prob2, no_pay_prob])

In [13]:
# Separate the balanced dataset into outcome variable and features again
# in preparation of modelling (overwrites the x_train / y_train read above)
y_train = train_df['TARGET']
x_train = train_df.drop('TARGET', axis=1)

# Algorithm selection

In [14]:
# Candidate algorithms keyed by display name; every estimator is created
# without arguments, i.e. with its library defaults (no random_state is set)
classifiers = {'Gradient Boosting Classifier': GradientBoostingClassifier(),
               'Ada Boost Classifier': AdaBoostClassifier(),
               'Linear Discriminant Analyis': LinearDiscriminantAnalysis(),
               'GaussianNB': GaussianNB(),
               'BernoulliNB': BernoulliNB(),
               'KNN': KNeighborsClassifier(),
               'Random Forest Classifier': RandomForestClassifier(),
               'Decision Tree Classifier': DecisionTreeClassifier(),
               'Logistic Regression': LogisticRegression()}

Recall is used to evaluate algorithm performance. The reasons for this are:
- The original dataset is unbalanced; most people don't have payment difficulties
- The objective of this analysis is to correctly identify customers with payment difficulties

Recall = the proportion of customers with payment difficulties that are correctly classified = True Positives / (True Positives + False Negatives)

In [15]:
# Fit every candidate classifier on the training data and record its recall
# on the held-out test set.
# (step can be removed by selecting the previously strongest algorithm to
# speed up processing)
base_score = 0  # best recall seen so far
model_outcomes = []
for Name, classify in classifiers.items():
    classify.fit(x_train, y_train)
    predicting_y = classify.predict(x_test)
    # Compute recall once and keep it numeric: storing str(...) makes a
    # later sort lexicographic (e.g. '5e-05' would rank above '0.7').
    recall = met.recall_score(y_test, predicting_y)
    model_outcomes.append({'Algorithm': str(Name),
                           'Recall_score': recall})
    if recall > base_score:
        base_score = recall

2:80: E501 line too long (87 > 79 characters)
9:5: E122 continuation line missing indentation or outdented
10:5: E122 continuation line missing indentation or outdented


In [16]:
# Tabulate recall per algorithm, best first. The scores are cast to float so
# the ordering is numeric even when they were collected as strings.
model_scores = pd.DataFrame(model_outcomes,
                            columns=['Algorithm', 'Recall_score'])
model_scores['Recall_score'] = model_scores['Recall_score'].astype(float)
model_scores.sort_values(by=['Recall_score'], ascending=False)

Unnamed: 0,Algorithm,Recall_score
3,GaussianNB,0.7340789064926996
4,BernoulliNB,0.6680646163404784
1,Ada Boost Classifier,0.6169617893755824
2,Linear Discriminant Analyis,0.6149425287356322
8,Logistic Regression,0.6147872009940976
0,Gradient Boosting Classifier,0.6006523765144455
5,KNN,0.2819198508853681
7,Decision Tree Classifier,0.1143212177694936
6,Random Forest Classifier,0.0006213109661385


1:80: E501 line too long (82 > 79 characters)


Best algorithms (TODO: state the criterion used — in the recall table above these three algorithms rank lowest):
- Random Forest
- Decision Tree
- K Nearest Neighbours

# Feature selection

In [17]:
# Let SelectFromModel, wrapped around a 100-tree random forest, decide which
# feature columns to keep.
rf_feature_select = SelectFromModel(RandomForestClassifier(n_estimators=100))
rf_feature_select.fit(x_train, y_train)

# Despite its name, this holds the selector's support mask used to index the
# columns below - it is not a count.
rf_sel_feature_count = rf_feature_select.get_support()
rf_selected_features = x_train.columns[rf_sel_feature_count].tolist()
print(len(rf_selected_features), 'selected features')

16 selected features


In [18]:
# List the names of the selected feature columns
print('Selected features:', rf_selected_features)

elected features: ['REGION_POPULATION_RELATIVE_DAYS_REGISTRATION', 'DAYS_REGISTRATION_DAYS_LAST_PHONE_CHANGE', 'DAYS_LAST_PHONE_CHANGE_AMT_ANNUITY', 'REGION_POPULATION_RELATIVE_DAYS_EMPLOYED', 'DAYS_EMPLOYED_AMT_INCOME_TOTAL', 'DAYS_BIRTH_AMT_INCOME_TOTAL', 'DAYS_REGISTRATION_AMT_GOODS_PRICE', 'DAYS_EMPLOYED_CNT_FAM_MEMBERS', 'DAYS_ID_PUBLISH_DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH_AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS_DAYS_LAST_PHONE_CHANGE', 'REGION_POPULATION_RELATIVE_DAYS_ID_PUBLISH', 'DAYS_EMPLOYED_AMT_ANNUITY', 'DAYS_EMPLOYED_DAYS_LAST_PHONE_CHANGE', 'DAYS_EMPLOYED_DAYS_REGISTRATION', 'REGION_POPULATION_RELATIVE_DAYS_LAST_PHONE_CHANGE']


In [19]:
# Select strongest features: restrict the training, test and prediction
# frames to the same selected columns (overwrites the wider frames)
x_train = x_train[rf_selected_features]
x_test = x_test[rf_selected_features]
test_df = test_df[rf_selected_features]

In [20]:
# Re-evaluate the strongest predicting algorithm on the selected features:
# fit every candidate classifier and record its recall on the test set.
# (step can be removed by selecting the previously strongest algorithm to
# speed up processing)
base_score = 0  # best recall seen so far
model_outcomes = []
for Name, classify in classifiers.items():
    classify.fit(x_train, y_train)
    predicting_y = classify.predict(x_test)
    # Compute recall once and keep it numeric: storing str(...) makes a
    # later sort lexicographic (e.g. '5e-05' would rank above '0.7').
    recall = met.recall_score(y_test, predicting_y)
    model_outcomes.append({'Algorithm': str(Name),
                           'Recall_score': recall})
    if recall > base_score:
        base_score = recall

2:80: E501 line too long (87 > 79 characters)
9:5: E122 continuation line missing indentation or outdented
10:5: E122 continuation line missing indentation or outdented


In [21]:
# Tabulate recall per algorithm, best first. The scores are cast to float so
# the ordering is numeric even when they were collected as strings.
model_scores = pd.DataFrame(model_outcomes,
                            columns=['Algorithm', 'Recall_score'])
model_scores['Recall_score'] = model_scores['Recall_score'].astype(float)
model_scores.sort_values(by=['Recall_score'], ascending=False)

Unnamed: 0,Algorithm,Recall_score
3,GaussianNB,0.7691829760795278
4,BernoulliNB,0.666511338925132
2,Linear Discriminant Analyis,0.6439888164026095
8,Logistic Regression,0.6425908667287977
1,Ada Boost Classifier,0.6298539919229574
0,Gradient Boosting Classifier,0.5984777881329606
5,KNN,0.2775706741223983
7,Decision Tree Classifier,0.0986331158744951
6,Random Forest Classifier,0.0


1:80: E501 line too long (82 > 79 characters)


# Hyper parameter tuning

In [23]:
# Potential classifiers to use

In [29]:
# Candidate classifiers and the hyper-parameter grid to search for each.
# NOTE(review): 'base_estimator__*' and loss='deviance' follow the parameter
# names of older scikit-learn releases - verify against the installed version.
classifiers = {
    'Random_forest': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [31, 35, 37]},
    },
    'Logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {'C': [1, 10, 100, 1000],
                   'penalty': ['l1', 'l2']},
    },
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(DecisionTreeClassifier(),
                                    random_state=7),
        'params': {'base_estimator__criterion': ["gini", "entropy"],
                   'base_estimator__splitter': ["best", "random"],
                   'learning_rate': [0.1, 0.3, 1.5]},
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(),
        'params': {'loss': ["deviance"],
                   'n_estimators': [360, 380, 400],
                   'learning_rate': [0.015, 0.02, 0.03],
                   'max_depth': [2, 3, 4],
                   'min_samples_leaf': [60, 70, 80]},
    },
    'KNearestNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [2, 5, 7],
                   'metric': ['euclidean', 'minkowski']},
    },
    # The tree grid must be paired with a decision tree; it was previously
    # attached to KNeighborsClassifier, which accepts none of these
    # parameters and would make the grid search fail.
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {'criterion': ["gini", "entropy"],
                   'splitter': ['best', 'random'],
                   'max_depth': [3, None],
                   'max_features': [1, 5, 9],
                   'min_samples_leaf': [1, 5, 9]},
    },
}

3:80: E501 line too long (107 > 79 characters)
6:80: E501 line too long (108 > 79 characters)
7:80: E501 line too long (98 > 79 characters)
8:80: E501 line too long (98 > 79 characters)
9:80: E501 line too long (85 > 79 characters)
10:80: E501 line too long (84 > 79 characters)
12:80: E501 line too long (89 > 79 characters)
13:80: E501 line too long (94 > 79 characters)
14:80: E501 line too long (80 > 79 characters)
15:80: E501 line too long (92 > 79 characters)
18:80: E501 line too long (87 > 79 characters)
20:80: E501 line too long (86 > 79 characters)
21:80: E501 line too long (84 > 79 characters)


In [25]:
# Selected classifiers based on model performance

In [65]:
# Shortlisted classifiers, each with the parameter grid to search
classifiers = {
    'Logistic_regression': dict(
        model=LogisticRegression(solver='liblinear', multi_class='auto'),
        params=dict(C=[1, 10, 100, 1000], penalty=['l1', 'l2'])),
    'LinearDiscriminant': dict(
        model=LinearDiscriminantAnalysis(solver='svd'),
        params=dict(tol=[0.0001, 0.0002, 0.0003])),
    'BernoulliNB': dict(
        model=BernoulliNB(),
        params=dict(alpha=[0.01, 0.1, 0.5, 1.0, 10.0])),
}

1:80: E501 line too long (107 > 79 characters)
4:80: E501 line too long (88 > 79 characters)
5:80: E501 line too long (83 > 79 characters)


In [62]:
# Classifier algorithms to optimise, each with its parameter grid
# NB: GaussianNB doesn't have any parameters to optimise
classifiers = {
    'Logistic_regression': dict(
        model=LogisticRegression(solver='liblinear', multi_class='auto'),
        params=dict(C=[1, 10, 100, 1000], penalty=['l1', 'l2'])),
    'LinearDiscriminant': dict(
        model=LinearDiscriminantAnalysis(solver='svd'),
        params=dict(tol=[0.0001, 0.0002, 0.0003])),
    'BernoulliNB': dict(
        model=BernoulliNB(),
        params=dict(alpha=[0.01, 0.1, 0.5, 1.0, 10.0])),
}

# Settings shared by every search: 10 CV folds, recall as the score, no
# training scores kept, all available cores
search_options = dict(cv=10,
                      scoring='recall',
                      return_train_score=False,
                      n_jobs=-1)

# Run one grid search per classifier and keep its best score and parameters
scores = []
for model_name, mp in classifiers.items():
    grid = GridSearchCV(mp['model'], mp['params'], **search_options)
    grid.fit(x_train, y_train)
    scores.append(dict(model=model_name,
                       best_score=grid.best_score_,
                       best_params=grid.best_params_))

1:80: E501 line too long (107 > 79 characters)
4:80: E501 line too long (88 > 79 characters)
5:80: E501 line too long (83 > 79 characters)


model_scores = pd.DataFrame(model_outcomes, columns=['Algorithm', 'Recall_score'])
model_scores.sort_values(by=['Recall_score'], ascending=False)

In [64]:
# Table of the best score and parameter set found per algorithm, shown with
# the highest score first
model_parameters = pd.DataFrame(
    scores, columns=['model', 'best_score', 'best_params'])
model_parameters.sort_values(by=['best_score'], ascending=False)

Unnamed: 0,model,best_score,best_params
2,BernoulliNB,0.663963,{'alpha': 0.01}
1,LinearDiscriminant,0.641385,{'tol': 0.0001}
0,Logistic_regression,0.639781,"{'C': 1, 'penalty': 'l1'}"


# Modelling

Use the earlier identified top 3 algorithms with best performing parameters

In [None]:
# Random forest, created without arguments (library default settings)
model1 = RandomForestClassifier()
model1.fit(x_train, y_train)

In [None]:
# Decision tree classifier with explicitly chosen settings
model2 = DecisionTreeClassifier(
    criterion='gini',
    splitter='random',
    max_depth=None,
    max_features=5,
    min_samples_leaf=1,
)
model2.fit(x_train, y_train)

In [None]:
# K nearest neighbours (euclidean metric, 2 neighbours).
# The settings must be passed as keyword arguments; the dict-style
# 'key': value form used before is a SyntaxError inside a call.
model3 = KNeighborsClassifier(metric='euclidean', n_neighbors=2)
model3.fit(x_train, y_train)

# Model evaluation

In [None]:
# Evaluate model based on training data: predictions of each fitted model
# for x_train.
# NOTE(review): these come from `predict`, not `predict_proba`, and are made
# on x_train - presumably not suitable for ROC curves against y_test; confirm
predict_1 = model1.predict(x_train)
predict_2 = model2.predict(x_train)
predict_3 = model3.predict(x_train)

In [None]:
# ROC curves for the three models on the held-out test set.
# roc_curve needs a score for the positive class, so column 1 of
# predict_proba on x_test is used; predictions made with `predict` on
# x_train can neither be indexed with [:, 1] nor paired with y_test.
pred_prob1 = model1.predict_proba(x_test)
pred_prob2 = model2.predict_proba(x_test)
pred_prob3 = model3.predict_proba(x_test)

fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1[:, 1], pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, pred_prob2[:, 1], pos_label=1)
# The KNN curve gets its own names; it previously overwrote fpr2/tpr2, which
# left the fpr3/tpr3 used by the plot undefined.
fpr3, tpr3, thresh3 = roc_curve(y_test, pred_prob3[:, 1], pos_label=1)

In [None]:
# Reference ROC curve (tpr = fpr), obtained by scoring every test row 0
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# AUC scores of the three models on the held-out test set.
# roc_auc_score is reached through the `met` alias (sklearn.metrics) because
# the bare name is not among the notebook's visible imports; the positive
# class probabilities are computed here so the cell does not rely on
# variables that no earlier cell defines.
auc_score1 = met.roc_auc_score(y_test, model1.predict_proba(x_test)[:, 1])
auc_score2 = met.roc_auc_score(y_test, model2.predict_proba(x_test)[:, 1])
auc_score3 = met.roc_auc_score(y_test, model3.predict_proba(x_test)[:, 1])

print(auc_score1, auc_score2, auc_score3)

In [None]:
# Plot the ROC curves of the three models together with the tpr = fpr
# reference line and save the figure.
# matplotlib.pyplot is already imported as plt in the import cell, so it is
# not imported again here.
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name later removed; use whichever name is available.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')

# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--', color='orange', label='Random forest')
plt.plot(fpr2, tpr2, linestyle='--', color='green', label='Decision Tree')
plt.plot(fpr3, tpr3, linestyle='--', color='purple', label='KNN')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Save into the figures folder defined at the top of the notebook (created
# on demand) instead of the current working directory.
os.makedirs(output_folder, exist_ok=True)
plt.savefig(os.path.join(output_folder, 'ROC'), dpi=300)
plt.show()

In [None]:
# Create confusion matrix of actual vs. predicted labels on the training data
print("Confusion matrix")
# NOTE(review): y_pred_train was never assigned in the visible cells; the
# random forest's training predictions are used here - confirm the model.
y_pred_train = model1.predict(x_train)
# Build both Series from plain arrays so they share a fresh positional index:
# y_train still carries the row labels of the resampled frame, and crosstab
# would otherwise pair rows by label instead of by position.
y_actual = pd.Series(np.asarray(y_train), name='Actual')
y_predicted = pd.Series(np.asarray(y_pred_train), name='Predicted')
pd.crosstab(y_actual, y_predicted)

In [None]:
# Display the training-set predictions
y_pred_train
# .value_counts()

In [None]:
print("Classification Report")
# classification_report expects (y_true, y_pred); the arguments were swapped,
# which exchanges precision and recall in the printed report.
print(classification_report(y_train, y_pred_train))

In [None]:
# AUC evaluation


# Prediction

In [None]:
# Create predictions based on generated model
# NOTE(review): `algorithm` is not assigned in any visible cell - presumably
# the chosen fitted model has to be bound to it before this cell can run
prediction = algorithm.predict(test_df)

In [None]:
# Generate submission file: pair every test id with its predicted TARGET and
# write the table as CSV without the index column
SK_ID_CURR = list(test_ids_df['SK_ID_CURR'])
predicted_test_values = pd.DataFrame({
    'SK_ID_CURR': SK_ID_CURR,
    'TARGET': prediction,
})
predicted_test_values.to_csv(external_data + 'Submission_file.csv',
                             index=False)