# Objective

# Potential improvements

- Use unbalanced dataset vs artificially balanced
- Use a different method of feature selection (*e.g. correlation, Recursive feature selection*) 
- Include more or different algorithms
- Include more parameters as part of the Hyper Parameter Tuning process

# Preparation

## Load libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import preprocessing
from sklearn import metrics as met
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import resample
from sklearn.metrics import roc_curve, roc_auc_score, f1_score
from scipy.stats import randint
import os
import errno

In [2]:
# Load the pycodestyle/flake8 cell checker and switch it on, so style
# warnings are reported in the output of every cell executed afterwards.
%load_ext pycodestyle_magic
%flake8_on
# Render matplotlib figures directly in the notebook.
%matplotlib inline

### Create folder structure

In [3]:
# Move to the parent directory exactly once. An unguarded os.chdir('..')
# climbs one level further up every time this cell is re-executed, which
# silently breaks all relative paths below.
if not globals().get('_moved_to_project_root', False):
    os.chdir('..')
    _moved_to_project_root = True

# Locations relative to the new working directory. The trailing slash is
# kept because file names are appended with plain string concatenation.
output_folder = './reports/figures/'
cleaned_folder = './data/processed/'
external_data = './data/external/'

# Create the figure and export folders if they are missing, so saving a
# result later cannot fail on a non-existent directory.
for folder in (output_folder, external_data):
    os.makedirs(folder, exist_ok=True)

# Import data

In [4]:
# Feature matrices and labels for the train / hold-out split
x_train = pd.read_csv(os.path.join(cleaned_folder, 'x_train.csv'))
x_test = pd.read_csv(os.path.join(cleaned_folder, 'x_test.csv'))
# Labels are read as int64
y_train = pd.read_csv(os.path.join(cleaned_folder, 'y_train.csv'),
                      dtype='int64')
y_test = pd.read_csv(os.path.join(cleaned_folder, 'y_test.csv'),
                     dtype='int64')
# Data to predict for the submission, plus the matching id table
test_df = pd.read_csv(os.path.join(cleaned_folder, 'test_df.csv'))
test_ids_df = pd.read_csv(os.path.join(cleaned_folder, 'test_ids_df.csv'))

1:37: E226 missing whitespace around arithmetic operator
2:36: E226 missing whitespace around arithmetic operator
3:37: E226 missing whitespace around arithmetic operator
4:36: E226 missing whitespace around arithmetic operator
5:37: E226 missing whitespace around arithmetic operator
6:41: E226 missing whitespace around arithmetic operator


In [5]:
# Keep only the TARGET column of each label frame, as a Series
y_train, y_test = y_train['TARGET'], y_test['TARGET']

# Balance dataset

In [6]:
# Check the class balance of the training labels (absolute count per class)
y_train.value_counts()

0    168267
1     15238
Name: TARGET, dtype: int64

**Result**: the dataset is unbalanced; only about 8% of customers have payment difficulties, which is to be expected.

*While some algorithms can work with unbalanced datasets, we will be balancing the dataset to allow for algorithms that require a balanced sample.*

In [7]:
# Put the target next to the features so both get resampled together
train_df = pd.concat((x_train, y_train), axis='columns')

In [8]:
# Split the training rows by class:
# TARGET == 0 -> no payment difficulties, TARGET == 1 -> payment difficulties
no_pay_prob = train_df.loc[train_df['TARGET'].eq(0)]
pay_prob = train_df.loc[train_df['TARGET'].eq(1)]

In [9]:
# Upsample the minority class (customers with payment difficulties) by
# drawing rows with replacement until it is as large as the majority class.
# Upsampling is chosen because the dataset is relatively small.
pay_prob2 = resample(
    pay_prob,
    replace=True,
    n_samples=no_pay_prob.shape[0],
    random_state=18,  # fixed seed so the drawn sample is reproducible
)

In [10]:
# Shape (rows, columns) of the original minority class:
# customers with payment difficulties, before upsampling
pay_prob.shape

(15238, 27)

In [11]:
# Shape (rows, columns) of the upsampled minority class: customers WITH
# payment difficulties, resampled to the row count of the majority class
pay_prob2.shape

(168267, 27)

In [12]:
# Balanced training set: upsampled minority rows stacked on the majority rows
train_df = pd.concat((pay_prob2, no_pay_prob), axis='index')

In [13]:
# Split the balanced frame back into the target and the feature matrix
y_train = train_df.loc[:, 'TARGET']
x_train = train_df.drop(columns='TARGET')

# Feature selection

In [14]:
# Rank the features with a random forest and let SelectFromModel decide
# which ones to keep (no explicit threshold is passed).
# random_state pins the forest so the selected feature set is the same on
# every run; without it the selection changes between executions.
rf_feature_select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=18))
rf_feature_select.fit(x_train, y_train)

# Boolean mask over the columns of x_train: True marks a selected feature
rf_sel_feature_count = rf_feature_select.get_support()
rf_selected_features = x_train.columns[rf_sel_feature_count].tolist()
print(len(rf_selected_features), 'selected features')

16 selected features


In [15]:
# List the column names kept by the feature-selection step
print('Selected features:', rf_selected_features)

Selected features: ['REGION_POPULATION_RELATIVE_DAYS_REGISTRATION', 'DAYS_REGISTRATION_DAYS_LAST_PHONE_CHANGE', 'DAYS_LAST_PHONE_CHANGE_AMT_ANNUITY', 'REGION_POPULATION_RELATIVE_DAYS_EMPLOYED', 'DAYS_EMPLOYED_AMT_INCOME_TOTAL', 'DAYS_BIRTH_AMT_INCOME_TOTAL', 'DAYS_REGISTRATION_AMT_GOODS_PRICE', 'DAYS_EMPLOYED_CNT_FAM_MEMBERS', 'DAYS_ID_PUBLISH_DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH_AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS_DAYS_LAST_PHONE_CHANGE', 'REGION_POPULATION_RELATIVE_DAYS_ID_PUBLISH', 'DAYS_EMPLOYED_AMT_ANNUITY', 'DAYS_EMPLOYED_DAYS_LAST_PHONE_CHANGE', 'DAYS_EMPLOYED_DAYS_REGISTRATION', 'REGION_POPULATION_RELATIVE_DAYS_LAST_PHONE_CHANGE']


In [16]:
# Restrict every feature matrix to the selected columns
x_train = x_train.loc[:, rf_selected_features]
x_test = x_test.loc[:, rf_selected_features]
test_df = test_df.loc[:, rf_selected_features]

# Algorithm selection

In [17]:
# Candidate algorithms, compared with their default hyper-parameters.
# The randomised learners get a fixed seed so their scores can be
# reproduced and compared between runs.
classifiers = {
    'KNN': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=18),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=18),
    'Logistic Regression': LogisticRegression(),
}

In [19]:
# Fit every candidate algorithm with its default settings and record its
# F1 score on the hold-out set.
base_score = 0  # best F1 score seen so far
model_outcomes = []
for name, classify in classifiers.items():
    classify.fit(x_train, y_train)
    predicting_y = classify.predict(x_test)
    # Score once and keep the value numeric: storing it as text would make
    # a later sort on this column lexicographic instead of numeric.
    score = met.f1_score(y_test, predicting_y)
    model_outcomes.append({'Algorithm': str(name), 'f1_score': score})
    if score > base_score:
        base_score = score

8:5: E122 continuation line missing indentation or outdented
9:5: E122 continuation line missing indentation or outdented


In [20]:
# Results table, best algorithm first.
model_scores = pd.DataFrame(model_outcomes, columns=['Algorithm', 'f1_score'])
# The scores may have been recorded as text; convert them to numbers so the
# sort below orders by value rather than character by character.
model_scores['f1_score'] = pd.to_numeric(model_scores['f1_score'])
model_scores.sort_values(by='f1_score', ascending=False)

Unnamed: 0,Algorithm,f1_score
0,Gradient Boosting Classifier,0.1891646414807178
1,Ada Boost Classifier,0.1850415259651364
8,Logistic Regression,0.1785421431962366
2,Linear Discriminant Analyis,0.1782727409541419
4,BernoulliNB,0.1736192595589722
3,GaussianNB,0.1709561044655032
5,KNN,0.1441535917396039
7,Decision Tree Classifier,0.0967517938430676
6,Random Forest Classifier,0.0003105590062111


Best algorithms according to model evaluation:
<ol>
<li> GaussianNB </li>
<li> BernoulliNB </li>
<li> Linear Discriminant Analysis </li>
<li> Logistic Regression </li>
</ol>

# Hyper parameter tuning
Optimise the top performing algorithms to create the best possible prediction

In [None]:
# Search space per algorithm: the estimator to tune ('model') and the
# parameter grid handed to the grid search ('params').
# Randomised estimators get a fixed seed so the search is reproducible.
classifiers = {
    'Random_forest': {
        'model': RandomForestClassifier(random_state=18),
        'params': {'n_estimators': [31, 35, 37]},
    },
    'Logistic_regression': {
        # multi_class is no longer passed: the argument is deprecated in
        # recent scikit-learn releases, and 'auto' was the default anyway.
        'model': LogisticRegression(solver='liblinear'),
        'params': {'C': [1, 10, 100, 1000],
                   'penalty': ['l1', 'l2']},
    },
    'KNearestNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [2, 5, 7],
                   'metric': ['euclidean', 'minkowski']},
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=18),
        'params': {'criterion': ["gini", "entropy"],
                   'splitter': ['best', 'random'],
                   'max_depth': [3, None],
                   'max_features': [1, 5, 9],
                   'min_samples_leaf': [1, 5, 9]},
    },
}

In [None]:
# Run an exhaustive grid search for every configured algorithm and keep the
# best cross-validated F1 score together with the parameters achieving it.
scores = []
for model_name, mp in classifiers.items():
    grid = GridSearchCV(mp['model'],
                        mp['params'],
                        cv=10,
                        # 'f1' is the scorer name scikit-learn accepts;
                        # 'f1_score' is the metric function's name and is
                        # rejected as a scoring string.
                        scoring='f1',
                        return_train_score=False,
                        n_jobs=-1)
    grid.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': grid.best_score_,
        'best_params': grid.best_params_
    })

In [None]:
# Summarise the grid-search results, best cross-validated score first
model_parameters = pd.DataFrame(
    scores, columns=['model', 'best_score', 'best_params'])
model_parameters.sort_values(by='best_score', ascending=False)

**Result:** models perform slightly better than before parameter optimisation

In [None]:
# Intentionally empty: the stray `qqq` placeholder that stood here raised a
# NameError and aborted "Restart & Run All" before the modelling section.

# Modelling

Use the earlier identified top 4 algorithms with their best performing parameters

In [None]:
# Model 1: Gaussian naive Bayes with default settings, fitted on the
# training features and labels
model1 = GaussianNB()
model1.fit(x_train, y_train)

In [None]:
# Model 2: Bernoulli naive Bayes with alpha=0.01, fitted on the training
# features and labels
model2 = BernoulliNB(alpha=0.01)
model2.fit(x_train, y_train)

In [None]:
# Model 3: linear discriminant analysis with tol=0.0001, fitted on the
# training features and labels
model3 = LinearDiscriminantAnalysis(tol=0.0001)
model3.fit(x_train, y_train)

In [None]:
# Model 4: L1-penalised logistic regression (liblinear solver, C=1).
# multi_class is no longer passed: the argument is deprecated in recent
# scikit-learn releases, and 'auto' was the default anyway.
model4 = LogisticRegression(solver='liblinear',
                            C=1,
                            penalty='l1')
model4.fit(x_train, y_train)

# Model evaluation

In [None]:
# Class predictions of each fitted model for the hold-out features
y_pred_mod_1, y_pred_mod_2, y_pred_mod_3, y_pred_mod_4 = (
    fitted.predict(x_test)
    for fitted in (model1, model2, model3, model4)
)

In [None]:
# Predicted class probabilities of each fitted model for the hold-out
# features (x_test)
pred_prob_1, pred_prob_2, pred_prob_3, pred_prob_4 = (
    fitted.predict_proba(x_test)
    for fitted in (model1, model2, model3, model4)
)

In [None]:
# ROC curve per model (false-positive rate, true-positive rate, thresholds),
# scored with the second probability column and class 1 as the positive label
((fpr1, tpr1, thresh1),
 (fpr2, tpr2, thresh2),
 (fpr3, tpr3, thresh3),
 (fpr4, tpr4, thresh4)) = (
    roc_curve(y_test, proba[:, 1], pos_label=1)
    for proba in (pred_prob_1, pred_prob_2, pred_prob_3, pred_prob_4)
)

In [None]:
# Benchmark for the ROC comparison: every sample gets the same score (0)
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# Area under the ROC curve per model, from the second probability column
auc_score1, auc_score2, auc_score3, auc_score4 = (
    roc_auc_score(y_test, proba[:, 1])
    for proba in (pred_prob_1, pred_prob_2, pred_prob_3, pred_prob_4)
)

print('Model1 AUC:', auc_score1,
      'Model2 AUC:', auc_score2,
      'Model3 AUC:', auc_score3,
      'Model4 AUC:', auc_score4)

In [None]:
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in newer
# matplotlib releases; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available
              else 'seaborn')

fig, ax = plt.subplots()

# One dashed ROC curve per model, plus the constant-score benchmark in red
roc_lines = [
    (fpr1, tpr1, 'orange', 'GaussianNB'),
    (fpr2, tpr2, 'green', 'BernoulliNB'),
    (fpr3, tpr3, 'purple', 'LinearDiscriminantAnalysis'),
    (fpr4, tpr4, 'blue', 'LogisticRegression'),
    (p_fpr, p_tpr, 'red', 'Benchmark'),
]
for line_fpr, line_tpr, line_colour, line_label in roc_lines:
    ax.plot(line_fpr, line_tpr, linestyle='--', color=line_colour,
            label=line_label)

ax.set(title='ROC curve',
       xlabel='False positive rate',
       ylabel='True positive rate')
ax.legend(loc='best')

# Save the figure in the figures folder rather than next to the processed
# data; create the folder first so the save cannot fail on a missing path.
os.makedirs(output_folder, exist_ok=True)
fig.savefig(output_folder + 'ROC_graph', dpi=300)
plt.show()

Based on the above graph, Gaussian Naive Bayes is the best at predicting customers with payment difficulties, but only slightly better than the other models.

In [None]:
# Confusion matrix for model 1: actual classes as rows, predictions as columns
print("Confusion matrix model 1")
y_actual = y_test.rename('Actual')
y_predicted = pd.Series(y_pred_mod_1, name='Predicted')
pd.crosstab(index=y_actual, columns=y_predicted)

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported the wrong way round.
print(classification_report(y_actual, y_predicted))

In [None]:
# Confusion matrix for model 2: actual classes as rows, predictions as columns
print("Confusion matrix model 2")
y_actual = y_test.rename('Actual')
y_predicted = pd.Series(y_pred_mod_2, name='Predicted')
pd.crosstab(index=y_actual, columns=y_predicted)

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported the wrong way round.
print(classification_report(y_actual, y_predicted))

In [None]:
# Confusion matrix for model 3: actual classes as rows, predictions as columns
print("Confusion matrix model 3")
y_actual = y_test.rename('Actual')
y_predicted = pd.Series(y_pred_mod_3, name='Predicted')
pd.crosstab(index=y_actual, columns=y_predicted)

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported the wrong way round.
print(classification_report(y_actual, y_predicted))

In [None]:
# Confusion matrix for model 4: actual classes as rows, predictions as columns
print("Confusion matrix model 4")
y_actual = y_test.rename('Actual')
y_predicted = pd.Series(y_pred_mod_4, name='Predicted')
pd.crosstab(index=y_actual, columns=y_predicted)

In [None]:
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall are reported the wrong way round.
print(classification_report(y_actual, y_predicted))

# Create final model

In [None]:
# Pool the training rows with the hold-out rows, features and labels alike
x_df = pd.concat((x_train, x_test), axis='index')
y_df = pd.concat((y_train, y_test), axis='index')

In [None]:
# Shape (rows, columns) of the pooled feature matrix
x_df.shape

In [None]:
# Shape of the pooled labels; the row count should equal that of x_df
y_df.shape

In [None]:
# Final model: Gaussian naive Bayes with default settings, fitted on the
# pooled features (x_df) and labels (y_df)
final_model = GaussianNB()
final_model.fit(x_df, y_df)

# Submission

In [None]:
# Predict the class of every row in test_df with the final model
prediction = final_model.predict(test_df)

In [None]:
# Write the submission file: one row per SK_ID_CURR with its predicted TARGET
SK_ID_CURR = test_ids_df['SK_ID_CURR'].tolist()
predicted_test_values = pd.DataFrame({
    'SK_ID_CURR': SK_ID_CURR,
    'TARGET': prediction,
})
predicted_test_values.to_csv(external_data + 'Submission_file.csv',
                             index=False)

In [None]:
# Shape of the prediction array written to the submission file
prediction.shape