# Objective

# Preparation

## Load libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import preprocessing
from sklearn import metrics as met
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
import os
import errno

### Create folder structure

In [2]:
# Move to the parent directory exactly once per kernel session. A bare
# os.chdir('..') climbs one more level every time this cell is re-run,
# which silently invalidates every relative path defined below.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()

# Relative locations used throughout the notebook.
output_folder = './reports/figures/'
cleaned_folder = './data/processed/'
external_data = './data/external/'

# Make sure the folders the notebook may write into exist before anything is
# saved. The processed-data folder is an input, so it is deliberately not
# created here: a missing input should fail loudly when it is read.
for _folder in (output_folder, external_data):
    os.makedirs(_folder, exist_ok=True)

# Import data

In [6]:
def _read_processed(file_name):
    """Read one CSV file located in `cleaned_folder` into a DataFrame."""
    return pd.read_csv(cleaned_folder + file_name)


# Feature matrices and labels for the train / validation split.
x_train = _read_processed('x_train.csv')
x_test = _read_processed('x_test.csv')
y_train = _read_processed('y_train.csv')
y_test = _read_processed('y_test.csv')

# Rows to predict for the submission, and the identifiers that go with them.
test_df = _read_processed('test_df.csv')
test_ids_df = _read_processed('test_ids_df.csv')

In [7]:
# Reduce the label frames to their 'TARGET' column.
# The isinstance guard keeps this cell idempotent: after the first run the
# names already hold Series, and indexing a Series with 'TARGET' would raise
# a KeyError on a re-run.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['TARGET']
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['TARGET']

# Algorithm selection

In [8]:
# Baseline candidates, each constructed with library-default hyper-parameters.
# Estimators that accept `random_state` share one seed so that repeated runs
# of the notebook produce the same comparison table.
RANDOM_SEED = 7
classifiers = {
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=RANDOM_SEED),
    'Ada Boost Classifier': AdaBoostClassifier(random_state=RANDOM_SEED),
    'Linear Discriminant Analyis': LinearDiscriminantAnalysis(),
    'GaussianNB': GaussianNB(),
    'BerNB': BernoulliNB(),
    'KNN': KNeighborsClassifier(),
    'Random Forest Classifier': RandomForestClassifier(random_state=RANDOM_SEED),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=RANDOM_SEED),
    'Logistic Regression': LogisticRegression(),
}

In [9]:
# Fit every candidate on the training split and record its accuracy on the
# held-out split. `base_accuracy` ends up holding the best accuracy seen.
# NOTE(review): accuracy alone can hide a model that only predicts the
# majority class; consider also inspecting a ranking metric such as ROC AUC.
base_accuracy = 0
model_outcomes = []
for Name, classify in classifiers.items():
    classify.fit(x_train, y_train)
    predicting_y = classify.predict(x_test)

    # Score once and keep the value numeric: storing it as a string would make
    # any later sort on 'Score' lexicographic instead of numeric.
    accuracy = met.accuracy_score(y_test, predicting_y)
    model_outcomes.append({
        'Algorithm': str(Name),
        'Score': accuracy,
    })

    base_accuracy = max(base_accuracy, accuracy)

In [10]:
# Tabulate the per-algorithm scores, best first.
model_scores = pd.DataFrame(model_outcomes, columns=['Algorithm', 'Score'])
# Guarantee a numeric 'Score' column before sorting; string scores would be
# ordered character by character rather than by value.
model_scores['Score'] = pd.to_numeric(model_scores['Score'])
model_scores.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Algorithm,Score
0,Gradient Boosting Classifier,0.9181384703414076
1,Ada Boost Classifier,0.9181384703414076
2,Linear Discriminant Analyis,0.9181384703414076
8,Logistic Regression,0.9181384703414076
6,Random Forest Classifier,0.9181003242418464
5,KNN,0.9118062178142284
4,BerNB,0.8690825863055502
7,Decision Tree Classifier,0.8275287685167525
3,GaussianNB,0.7573145145908831


# Feature selection

In [11]:
# Let a random forest decide which features to keep; the selection threshold
# is left at the SelectFromModel default.
# The forest is seeded so the selected feature set is the same on every run;
# without a seed the list can change between executions.
rf_feature_select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=7)
)
rf_feature_select.fit(x_train, y_train)

# Despite its name this is the boolean keep/drop mask returned by
# get_support(), one entry per column of x_train.
rf_sel_feature_count = rf_feature_select.get_support()
# Index the column labels directly instead of slicing the whole frame first.
rf_selected_features = x_train.columns[rf_sel_feature_count].tolist()
print(str(len(rf_selected_features)), 'selected features')

16 selected features


In [12]:
# Show the names of the retained features (label typo 'elected' corrected).
print('Selected features:', rf_selected_features)

elected features: ['REGION_POPULATION_RELATIVE_DAYS_REGISTRATION', 'DAYS_REGISTRATION_DAYS_LAST_PHONE_CHANGE', 'DAYS_LAST_PHONE_CHANGE_AMT_ANNUITY', 'REGION_POPULATION_RELATIVE_DAYS_EMPLOYED', 'DAYS_EMPLOYED_AMT_INCOME_TOTAL', 'DAYS_BIRTH_AMT_INCOME_TOTAL', 'DAYS_REGISTRATION_AMT_GOODS_PRICE', 'DAYS_EMPLOYED_CNT_FAM_MEMBERS', 'DAYS_ID_PUBLISH_DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH_AMT_GOODS_PRICE', 'CNT_FAM_MEMBERS_DAYS_LAST_PHONE_CHANGE', 'REGION_POPULATION_RELATIVE_DAYS_ID_PUBLISH', 'DAYS_EMPLOYED_AMT_ANNUITY', 'DAYS_EMPLOYED_DAYS_LAST_PHONE_CHANGE', 'DAYS_EMPLOYED_DAYS_REGISTRATION', 'REGION_POPULATION_RELATIVE_DAYS_LAST_PHONE_CHANGE']


In [13]:
# Keep only the columns chosen by the feature-selection step, in all three
# feature frames.
# NOTE(review): the original wide frames are overwritten here, so re-running
# the feature-selection cell afterwards starts from the reduced column set.
x_train = x_train.loc[:, rf_selected_features]
x_test = x_test.loc[:, rf_selected_features]
test_df = test_df.loc[:, rf_selected_features]

In [15]:
# Re-evaluate every candidate on the reduced feature set.
# (This step can be skipped by reusing the previously strongest algorithm,
# which speeds up processing.)
base_accuracy = 0
model_outcomes = []
for Name, classify in classifiers.items():
    classify.fit(x_train, y_train)
    predicting_y = classify.predict(x_test)

    # Score once and keep the value numeric: storing it as a string would make
    # any later sort on 'Score' lexicographic instead of numeric.
    accuracy = met.accuracy_score(y_test, predicting_y)
    model_outcomes.append({
        'Algorithm': str(Name),
        'Score': accuracy,
    })

    # Track the best accuracy seen so far.
    base_accuracy = max(base_accuracy, accuracy)

In [16]:
# Tabulate the per-algorithm scores on the reduced feature set, best first.
model_scores = pd.DataFrame(model_outcomes, columns=['Algorithm', 'Score'])
# Guarantee a numeric 'Score' column before sorting; string scores would be
# ordered character by character rather than by value.
model_scores['Score'] = pd.to_numeric(model_scores['Score'])
model_scores.sort_values(by=['Score'], ascending=False)

Unnamed: 0,Algorithm,Score
1,Ada Boost Classifier,0.9181384703414076
2,Linear Discriminant Analyis,0.9181384703414076
4,BerNB,0.9181384703414076
8,Logistic Regression,0.9181384703414076
0,Gradient Boosting Classifier,0.9181003242418464
6,Random Forest Classifier,0.9181003242418464
5,KNN,0.9125309937058936
7,Decision Tree Classifier,0.8221247377455655
3,GaussianNB,0.6184499968211584


# Hyperparameter tuning

In [17]:
# Search spaces for hyperparameter tuning. Each entry maps a label to an
# unfitted estimator ('model') and the grid to explore for it ('params').
# NOTE(review): this rebinds `classifiers`, which the evaluation cells above
# iterate as {label: estimator}; re-running those cells after this one fails.

_ada_boost = AdaBoostClassifier(DecisionTreeClassifier(), random_state=7)
# scikit-learn renamed AdaBoost's `base_estimator` parameter to `estimator`
# and later dropped the old name. Probe the estimator itself so the nested
# grid keys are valid on whichever version is installed.
_ada_base = ('estimator'
             if 'estimator' in _ada_boost.get_params(deep=False)
             else 'base_estimator')

classifiers = {
    'Random_forest': {
        'model': RandomForestClassifier(random_state=7),
        'params': {'n_estimators': [31, 35, 37]},
    },
    'Logistic_regression': {
        # `multi_class='auto'` is omitted: it is deprecated in recent
        # scikit-learn releases and is not needed with this solver.
        'model': LogisticRegression(solver='liblinear'),
        'params': {'C': [1, 10, 100, 1000],
                   'penalty': ['l1', 'l2']},
    },
    'AdaBoostClassifier': {
        'model': _ada_boost,
        'params': {_ada_base + '__criterion': ['gini', 'entropy'],
                   _ada_base + '__splitter': ['best', 'random'],
                   'learning_rate': [0.1, 0.3, 1.5]},
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(random_state=7),
        # No 'loss' entry: the single value 'deviance' was the estimator's
        # default, and that spelling is rejected by recent releases (renamed
        # to 'log_loss'). Leaving it out searches the same space everywhere.
        'params': {'n_estimators': [360, 380, 400],
                   'learning_rate': [0.015, 0.02, 0.03],
                   'max_depth': [2, 3, 4],
                   'min_samples_leaf': [60, 70, 80]},
    },
    'KNearestNeighbors': {
        'model': KNeighborsClassifier(),
        # 'minkowski' with the default p=2 is the Euclidean distance, so the
        # original pair evaluated the same metric twice; compare against
        # Manhattan distance instead.
        'params': {'n_neighbors': [2, 5, 7],
                   'metric': ['euclidean', 'manhattan']},
    },
}

In [None]:
# Run a 10-fold cross-validated grid search for every configured model and
# collect the best score and parameter combination of each.
scores = []
for model_name, spec in classifiers.items():
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=10,
        n_jobs=-1,  # use every available core
        return_train_score=False,
    )
    search.fit(x_train, y_train)

    best_result = {
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    }
    scores.append(best_result)

In [None]:
# Create a table with the best parameters per algorithm, best score first.
# The grid-search records carry the label under the key 'model'; selecting a
# column called 'algorithm' directly would yield a column full of NaN, so the
# records are read with their real key and the column is renamed afterwards.
model_parameters = (
    pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    .rename(columns={'model': 'algorithm'})
)
model_parameters.sort_values(by=['best_score'], ascending=False)

# Model creation

In [None]:
# Build the final gradient boosting model with fixed hyperparameters.
# NOTE(review): the values are hard-coded rather than read from the grid
# search results - confirm they match the tuning output.
# `loss` is intentionally not passed: 'deviance' was this estimator's default
# and the spelling is rejected by recent scikit-learn releases (renamed to
# 'log_loss'); relying on the default fits the same model on every version.
algorithm = GradientBoostingClassifier(learning_rate=0.02,
                                       max_depth=3,
                                       min_samples_leaf=70,
                                       n_estimators=380)
algorithm.fit(x_train, y_train)

In [None]:
# Predict labels for the rows the model was fitted on.
# NOTE(review): these are training-set predictions, so metrics derived from
# them measure fit rather than generalisation.
y_pred_train = algorithm.predict(X=x_train)

In [None]:
# Create confusion matrix (rows: actual labels, columns: predicted labels).
print("Confusion matrix")
# Strip any pandas index before pairing the two vectors. Wrapping a Series in
# pd.Series keeps its index, and crosstab would then align actual and
# predicted values by label rather than by position.
y_actual = pd.Series(np.asarray(y_train), name='Actual')
y_predicted = pd.Series(np.asarray(y_pred_train), name='Predicted')
pd.crosstab(y_actual, y_predicted)

In [None]:
print("Classification Report")
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support per predicted class.
print(classification_report(y_train, y_pred_train))

# Prediction

In [None]:
# Apply the fitted model to the submission rows.
prediction = algorithm.predict(X=test_df)

In [None]:
# Generate the submission file: one row per identifier with its predicted
# target, written without the DataFrame index.
SK_ID_CURR = list(test_ids_df.loc[:, 'SK_ID_CURR'])
submission_columns = {
    'SK_ID_CURR': SK_ID_CURR,
    'TARGET': prediction,
}
predicted_test_values = pd.DataFrame(submission_columns)

submission_path = external_data + 'Submission_file.csv'
predicted_test_values.to_csv(submission_path, index=False)