# Predictive Analytics Exam 2

### 15. Consider the train.csv and test.csv data files, which contain information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005. The goal is to predict default payment next month on the test.csv data file.

In [1]:
## Importing necessary libraries

import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
import precision_recall_cutoff_exam2 as prc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score

In [2]:
## a) Using the pandas library to read the train.csv and test.csv data files and create two data-frames called train and test

## Connecting to the S3 bucket that holds both csv files
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

## Keys of the two csv files inside the bucket
file_key = 'train(1).csv'
file_key2 = 'test(1).csv'

## Train file: bucket object -> GET response -> 'Body' entry of the response -> data-frame
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')
train = pd.read_csv(file_content_stream)

## Test file: same steps as for the train file
bucket_object2 = bucket.Object(file_key2)
file_object2 = bucket_object2.get()
file_content_stream2 = file_object2.get('Body')
test = pd.read_csv(file_content_stream2)

## Showing the first rows of train as a quick check of the load
train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,400000,1,1,2,32,0,0,0,0,0,0,55773,55917,51389,48272,49478,51242,3028,3023,3000,3000,3000,38662,0
1,120000,2,2,2,30,-1,-1,-1,-1,-1,-1,140,3230,3011,1964,1883,1538,3230,3011,1964,1883,1538,1911,0
2,270000,2,2,2,32,0,0,0,0,0,0,59710,49986,104390,94856,86461,83650,1808,69563,2891,2689,3012,2771,0
3,280000,2,2,1,27,0,0,0,0,0,0,280913,283222,273160,257689,193231,191143,11052,9563,15017,5374,5420,6021,0
4,30000,2,1,2,27,0,0,-1,0,0,-2,1512,2458,664,1814,0,0,1000,664,1500,0,0,0,0


In [3]:
## Engineering features from Exam 1

def add_exam1_features(df):
    """Add the Exam 1 engineered features to df (in place) and return df.

    Columns added:
      Most_Common : most frequent value among PAY_0, PAY_2, ..., PAY_6 in each
                    row; when several values tie, the smallest one is kept.
                    Stored as float.
      Tree2, Tree6, Tree7 : 0/1 indicators built from the split thresholds read
                    off the Exam 1 plotted tree.

    The mode is computed for all rows at once instead of looping over
    range(df.shape[0]) with .loc[i]/.at[i], so it does not rebuild the
    six-column sub-frame on every iteration and does not require df to have
    the default 0..n-1 index.
    """
    pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

    ## Most common repayment status
    ## (column 0 of the row-wise mode holds the first, i.e. smallest, mode of each row)
    row_modes = df[pay_status_cols].mode(axis = 1)
    if row_modes.shape[1] > 0:
        df['Most_Common'] = row_modes[0].astype(float)
    else:
        ## No rows to take a mode of: keep the column, filled with NaN
        df['Most_Common'] = np.nan

    ## From plot tree:
    df['Tree2'] = np.where((df['PAY_0'] <= 1.5) & (df['PAY_2'] <= 1.5) & (df['PAY_AMT3'] > 395.0), 1, 0)
    df['Tree6'] = np.where((df['PAY_0'] > 1.5) & (df['PAY_6'] <= 1.0) & (df['BILL_AMT1'] > 649.5), 1, 0)
    df['Tree7'] = np.where((df['PAY_0'] > 1.5) & (df['PAY_6'] > 1.0) & (df['PAY_AMT3'] <= 14177.0), 1, 0)

    return df


## Train set:
train = add_exam1_features(train)

## Test set (exactly the same transformations as the train set):
test = add_exam1_features(test)

In [4]:
## b) Splitting the train data-frame into training (80%) and validation (20%) (taking into account the proportions of 0s and 1s)

## Defining the input and target variables
X = train.drop(columns = ['default payment next month'])
Y = train['default payment next month']

## Splitting the data
## stratify = Y keeps the proportion of 0s and 1s the same in both parts;
## random_state is fixed so that re-running the notebook gives the same split (and comparable scores)
X_training, X_validation, Y_training, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

#### Random Forest:

In [10]:
## c) Using the top 7 variables from Exercise 15 part (e) in Exam 1 to  build a model on the training data-frame

## The top 7 variables, written once so that the three data-frames below always get the same columns in the same order
top_features = ['PAY_0', 'PAY_2', 'Most_Common', 'Tree2', 'Tree6', 'PAY_3', 'Tree7']

## Redefining the input variables
## (this overwrites X_training, X_validation and test with their 7-column versions; later cells use them in this form)
X_training = X_training[top_features]
X_validation = X_validation[top_features]
test = test[top_features]

## Building a Random Forest Model with default hyper-parameters
## (random_state is fixed so that the fitted forest, and the scores below, are the same on every run)
rf_md = RandomForestClassifier(random_state = 42).fit(X_training, Y_training)

## Predicting on the validation set (column 1 of predict_proba)
rf_preds = rf_md.predict_proba(X_validation)[:, 1]

## Extracting estimated labels using the PRC function
rf_labels = prc.precision_recall_cutoff(Y_validation, rf_preds)

## Extracting optimal cutoff using the PRC function
rf_cutoff = prc.precision_recall_cutoff_cutoff(Y_validation, rf_preds)

## Reporting the optimal cutoff value
print('Optimal Cutoff of Random Forest Model:', round(rf_cutoff, 3))

## Reporting the F1-Score of the model
print('\nF1-Score of Random Forest Model:', round(f1_score(Y_validation, rf_labels) * 100, 2), '%')

Optimal Cutoff of Random Forest Model: 0.284

F1-Score of Random Forest Model: 55.26 %


In [6]:
## Tuning the Random Forest model on the validation data-frame

## Candidate values for each hyper-parameter (3 x 3 x 3 x 3 = 81 combinations)
rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [5, 10, 15],
}

## Grid search with 3 folds, scored by F1
rf_grid_search = GridSearchCV(estimator = RandomForestClassifier(), param_grid = rf_param_grid,
                              scoring = 'f1', cv = 3, n_jobs = -1)
rf_grid_search.fit(X_validation, Y_validation)

## Reporting the best combination and its cross-validated F1-Score
print('Optimal hyper-parameters for Random Forest Model: \n', rf_grid_search.best_params_)
print('\nOptimal F1-Score:\n', rf_grid_search.best_score_)

Optimal hyper-parameters for Random Forest Model: 
 {'max_depth': 3, 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 100}

Optimal F1-Score:
 0.48631723859203585


In [11]:
## Building a Random Forest model with the optimal hyper-parameters
## (values copied from the grid-search output; random_state is fixed so that this model,
##  whose predictions are used on the test data-frame, is the same on every run)
rf_md = RandomForestClassifier(max_depth = 3, min_samples_leaf = 10, min_samples_split = 15, 
                               n_estimators = 100, random_state = 42).fit(X_training, Y_training)

## Predicting on the validation set (column 1 of predict_proba)
rf_preds = rf_md.predict_proba(X_validation)[:, 1]

## Extracting estimated labels using the PRC function
rf_labels = prc.precision_recall_cutoff(Y_validation, rf_preds)

## Extracting optimal cutoff using the PRC function
rf_cutoff = prc.precision_recall_cutoff_cutoff(Y_validation, rf_preds)

## Reporting the optimal cutoff value
print('Optimal Cutoff of Random Forest Model:', round(rf_cutoff, 3))

## Reporting the F1-Score of the model
print('\nF1-Score of Random Forest Model:', round(f1_score(Y_validation, rf_labels) * 100, 2), '%')

Optimal Cutoff of Random Forest Model: 0.228

F1-Score of Random Forest Model: 55.52 %


In [12]:
## Finally, using the optimal model to predict the likelihood of default payment next month on the test

## predict_proba gives one column per class; column 1 is kept as the predicted likelihood
rf_test_probs = rf_md.predict_proba(test)
rf_test_preds = rf_test_probs[:, 1]

#### AdaBoost:

In [14]:
## d) Using the top 7 variables from Exercise 15 part (e) in Exam 1 to  build a model on the training data-frame

## Building an AdaBoost Model with default hyper-parameters
ada_md = AdaBoostClassifier()
ada_md.fit(X_training, Y_training)

## Predicting on the validation set (column 1 of predict_proba)
ada_probs = ada_md.predict_proba(X_validation)
ada_preds = ada_probs[:, 1]

## Estimated labels from the PRC function
ada_labels = prc.precision_recall_cutoff(Y_validation, ada_preds)

## Optimal cutoff from the PRC function
ada_cutoff = prc.precision_recall_cutoff_cutoff(Y_validation, ada_preds)

## Reporting the optimal cutoff value
print('Optimal Cutoff of AdaBoost Model:', round(ada_cutoff, 3))

## Reporting the F1-Score of the model (as a percentage)
ada_f1 = f1_score(Y_validation, ada_labels)
print('\nF1-Score of AdaBoost Model:', round(ada_f1 * 100, 2), '%')

Optimal Cutoff of AdaBoost Model: 0.496

F1-Score of AdaBoost Model: 55.2 %


In [None]:
## Tuning the AdaBoost model on the validation data-frame

## Candidate values for each hyper-parameter (3 x 3 x 3 x 3 x 4 = 324 combinations)
## Keys prefixed with 'base_estimator__' are passed on to the decision tree inside AdaBoost
ada_param_grid = {
    'n_estimators': [100, 300, 500],
    'base_estimator__min_samples_split': [5, 10, 15],
    'base_estimator__min_samples_leaf': [5, 10, 15],
    'base_estimator__max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1, 1],
}

## Grid search with 3 folds, scored by F1
ada_base_model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
ada_grid_search = GridSearchCV(estimator = ada_base_model, param_grid = ada_param_grid,
                               scoring = 'f1', cv = 3, n_jobs = -1)
ada_grid_search.fit(X_validation, Y_validation)

## Reporting the best combination and its cross-validated F1-Score
print('Optimal hyper-parameters for AdaBoost Model: \n', ada_grid_search.best_params_)
print('\nOptimal F1-Score:\n', ada_grid_search.best_score_)

In [None]:
## Building an AdaBoost model with the optimal hyper-parameters
## max_depth, min_samples_leaf and min_samples_split are parameters of the decision tree, not of
## AdaBoostClassifier, so passing them to AdaBoostClassifier directly raises a TypeError.
## The model is therefore created with a DecisionTreeClassifier base estimator (as in the grid search)
## and the hyper-parameters found by ada_grid_search are applied to it; the 'base_estimator__' keys
## in best_params_ are routed to the tree by set_params.
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
ada_md.set_params(**ada_grid_search.best_params_)
ada_md = ada_md.fit(X_training, Y_training)

## Predicting on the validation set (column 1 of predict_proba)
ada_preds = ada_md.predict_proba(X_validation)[:, 1]

## Extracting estimated labels using the PRC function
ada_labels = prc.precision_recall_cutoff(Y_validation, ada_preds)

## Extracting optimal cutoff using the PRC function
ada_cutoff = prc.precision_recall_cutoff_cutoff(Y_validation, ada_preds)

## Reporting the optimal cutoff value
print('Optimal Cutoff of AdaBoost Model:', round(ada_cutoff, 3))

## Reporting the F1-Score of the model
print('\nF1-Score of AdaBoost Model:', round(f1_score(Y_validation, ada_labels) * 100, 2), '%')

In [None]:
## Finally, using the optimal model to predict the likelihood of default payment next month on the test

## predict_proba gives one column per class; column 1 is kept as the predicted likelihood
ada_test_probs = ada_md.predict_proba(test)
ada_test_preds = ada_test_probs[:, 1]

#### XGBoost:

In [None]:
## Use the provided precision recall cutoff.py (posted under the Exam
## 2 link) file to estimate the optimal cutoff value. Report the F1-score of the model.
## Finally, use the optimal model to predict the likelihood of default payment next
## month on the test.

In [None]:
## Model: RandomForestClassifier
## NOTE(review): this cell repeats the Random Forest grid search from the Random Forest section
## although it sits under the XGBoost header - confirm whether it is still needed here

## Hyper-parameter values to try, one entry per parameter
rf_param_grid = dict(n_estimators = [100, 300, 500],
                     max_depth = [3, 5, 7],
                     min_samples_split = [5, 10, 15],
                     min_samples_leaf = [5, 10, 15])

## 3-fold grid search over every combination, scored by F1
rf_search_setup = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv = 3, scoring = 'f1', n_jobs = -1)
rf_grid_search = rf_search_setup.fit(X_validation, Y_validation)

## Showing the winning combination and its cross-validated F1-Score
print('Optimal hyper-parameters for Random Forest Model: \n', rf_grid_search.best_params_)
print('\nOptimal F1-Score:\n', rf_grid_search.best_score_)