# Homework Assignment 8

### Topics: Ensemble methods, GridSearchCV

In [1]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import dmc_cost_function
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
pd.set_option('display.max_columns', 50)

In [2]:
## 1. Using pandas to read the training and testing data files

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

file_key = 'train.csv'
file_key2 = 'test.csv'

bucket_object = bucket.Object(file_key)
bucket_object2 = bucket.Object(file_key2)

file_object = bucket_object.get()
file_object2 = bucket_object2.get()

file_content_stream = file_object.get('Body')
file_content_stream2 = file_object2.get('Body')

train = pd.read_csv(file_content_stream, sep = '|')
test = pd.read_csv(file_content_stream2, sep = '|')

train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [3]:
## Creating variables from previous homework assignments

## Training set:

## Variable 1 (from decision tree)
train['Interaction_1'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] <= 0.012) & 
                                  (train['lineItemVoids'] <= 10.5), 0, 1)
## Variable 2 (from decision tree)
train['Interaction_2'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] > 0.012) & 
                                  (train['totalScanTimeInSeconds'] <= 895.0), 0, 1)
## Variable 3 (from decision tree)
train['Interaction_3'] = np.where((train['trustLevel'] > 1.5) & (train['grandTotal'] <= 99.145) & 
                                  train['trustLevel'] <= 2.5, 1, 0)
## Variable 4 (from decision tree)
train['Interaction_4'] = np.where((train['trustLevel'] > 1.5) & (train['grandTotal'] > 99.145) & 
                                  train['valuePerSecond'] <= 0.06, 1, 0)
## Variable 5 - Low trustLevel (all frauds came from trustLevel = 1 or 2)
train['lowTrust'] = np.where(train['trustLevel'] <= 2, 1, 0)

## Variable 6 - Made a quantity modification
train['madeModification'] = np.where(train['quantityModifications'] > 0, 1, 0)

## Variable 7 - Attempted a scan without registration
train['madeScansWithoutRegistration'] = np.where(train['scansWithoutRegistration'] > 0, 1, 0)

## Variable 8 - High or low totalScanTimeInSeconds (huge differnece in mean and median values for fraud/not fraud in this field)
train['lowTotalScanTime'] = np.where(train['totalScanTimeInSeconds'] < 1000, 1, 0)

## Varibales from strong heredity principle
train['Heredity_Feature_1'] = train['trustLevel'] * train['lowTrust']
train['Heredity_Feature_2'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
train['Heredity_Feature_3'] = train['lowTrust'] * train['scannedLineItemsPerSecond']

## Heredity_Feature_3: all observations less than 0.012 are not fraud in this tree
train['New_Interaction_1'] = np.where(train['Heredity_Feature_3'] <= 0.012, 1, 0)

## Lots of positove observations when Heredity_Feature_3 > 0.012, totalScanTimeInSeconds > 1298.0, and trustLevel <= 1.5
train['New_Interaction_2'] = np.where((train['Heredity_Feature_3'] > 0.012) & (train['totalScanTimeInSeconds'] > 1298.0) 
                                      & (train['trustLevel'] < 1.5), 1, 0)

## Mostly all negative observations when Heredity_Feature_3 > 0.012, totalScanTimeInSeconds <= 1298.0, and Heredity_Feature_1 > 0.119
train['New_Interaction_3'] = np.where((train['Heredity_Feature_3'] > 0.012) & (train['totalScanTimeInSeconds'] <= 1298.0) 
                                      & (train['Heredity_Feature_1'] > 1.5), 1, 0)

In [4]:
## Testing set:

## Variable 1
test['Interaction_1'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] <= 0.012) & 
                                  (test['lineItemVoids'] <= 10.5), 0, 1)
## Variable 2
test['Interaction_2'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] > 0.012) & 
                                  (test['totalScanTimeInSeconds'] <= 895.0), 0, 1)
## Variable 3
test['Interaction_3'] = np.where((test['trustLevel'] > 1.5) & (test['grandTotal'] <= 99.145) & 
                                  test['trustLevel'] <= 2.5, 1, 0)
## Variable 4
test['Interaction_4'] = np.where((test['trustLevel'] > 1.5) & (test['grandTotal'] > 99.145) & 
                                  test['valuePerSecond'] <= 0.06, 1, 0)
## Variable 5 - Low trustLevel
test['lowTrust'] = np.where(test['trustLevel'] <= 2, 1, 0)

## Variable 6 - Made a quantity modification
test['madeModification'] = np.where(test['quantityModifications'] > 0, 1, 0)

## Variable 7 - Attempted a scan without registration
test['madeScansWithoutRegistration'] = np.where(test['scansWithoutRegistration'] > 0, 1, 0)

## Variable 8 - High or low totalScanTimeInSeconds
test['lowTotalScanTime'] = np.where(test['totalScanTimeInSeconds'] < 1000, 1, 0)

## Varibales from strong heredity principle
test['Heredity_Feature_1'] = test['trustLevel'] * test['lowTrust']
test['Heredity_Feature_2'] = test['trustLevel'] * test['scannedLineItemsPerSecond']
test['Heredity_Feature_3'] = test['lowTrust'] * test['scannedLineItemsPerSecond']

test['New_Interaction_1'] = np.where(test['Heredity_Feature_3'] <= 0.012, 1, 0)

test['New_Interaction_2'] = np.where((test['Heredity_Feature_3'] > 0.012) & (test['totalScanTimeInSeconds'] > 1298.0) 
                                      & (test['trustLevel'] < 1.5), 1, 0)

test['New_Interaction_3'] = np.where((test['Heredity_Feature_3'] > 0.012) & (test['totalScanTimeInSeconds'] <= 1298.0) 
                                      & (test['Heredity_Feature_1'] > 1.5), 1, 0)

In [5]:
## 2. Using the train data-frame and the models from homework assignment 7 exercise 2 to split the train data-frame 
## into two data-frames: training (80%) and validation (20%) taking into account the proportions of 0s and 1s

## Defining the input and target variables
X = train.drop(columns = ['fraud'])
Y = train['fraud']

## Splitting the data
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Printing the first five observations
X_train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Interaction_1,Interaction_2,Interaction_3,Interaction_4,lowTrust,madeModification,madeScansWithoutRegistration,lowTotalScanTime,Heredity_Feature_1,Heredity_Feature_2,Heredity_Feature_3,New_Interaction_1,New_Interaction_2,New_Interaction_3
89,6,1790,70.66,5,1,0,0.007263,0.039475,0.384615,1,1,1,1,0,0,1,0,0,0.043575,0.0,1,0,0
955,5,276,98.07,11,2,1,0.101449,0.355326,0.392857,1,1,1,1,0,1,1,1,0,0.507246,0.0,1,0,0
641,4,933,82.79,6,9,4,0.002144,0.088735,3.0,1,1,1,1,0,1,1,1,0,0.008574,0.0,1,0,0
1856,5,1764,79.93,8,0,3,0.005669,0.045312,0.8,1,1,1,1,0,1,0,0,0,0.028345,0.0,1,0,0
346,3,136,42.61,6,0,4,0.161765,0.313309,0.272727,1,1,1,1,0,1,0,1,0,0.485294,0.0,1,0,0


In [6]:
## i. Building the best model from your homework assignment 7 exercise 2(i) on the training data-frame

## Defining the input variables
X_train1 = X_train[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 'New_Interaction_2']]
X_validation1 = X_validation[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 'New_Interaction_2']]
X_test1 = test[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 'New_Interaction_2']]

## Building the best model
rf_md = RandomForestClassifier(max_depth = 5, min_samples_leaf = 5, min_samples_split = 5, n_estimators = 500).fit(X_train1, Y_train)

## Predicting the likelihood of fraud on the validation and test data-frames
rf_val_preds = rf_md.predict_proba(X_validation1)[:, 1]
rf_test_preds = rf_md.predict_proba(X_test1)[:, 1]

## Computing the cost for the validation test predictions
print('Cost function score of RF model:', dmc_cost_function.cost_function(Y_validation, rf_val_preds))

Cost function score of RF model: -75.0


In [7]:
## ii. Building the best model from your homework assignment 7 exercise 2(ii) on the training data-frame

## Defining the input variables
X_train2 = X_train[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 
                   'New_Interaction_2', 'valuePerSecond', 'New_Interaction_3']]
X_validation2 = X_validation[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 
                   'New_Interaction_2', 'valuePerSecond', 'New_Interaction_3']]
X_test2 = test[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 
                   'New_Interaction_2', 'valuePerSecond', 'New_Interaction_3']]

## Building the best model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 5, min_samples_split = 15),
                        learning_rate = 0.1, n_estimators = 100).fit(X_train2, Y_train)

## Predicting the likelihood of fraud on the validation and test data-frames
ada_val_preds = ada_md.predict_proba(X_validation2)[:, 1]
ada_test_preds = ada_md.predict_proba(X_test2)[:, 1]

## Computing the cost for the validation test predictions
print('Cost function score of Ada model:', dmc_cost_function.cost_function(Y_validation, ada_val_preds))

Cost function score of Ada model: -55.0


In [8]:
## iii. Building the best model from your homework assignment 7 exercise 2(iii) on the training data-frame

## Defining a MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))

## Defining the input variables
X_train3 = scaler.fit_transform(X_train[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 
                                            'totalScanTimeInSeconds', 'New_Interaction_2', 'valuePerSecond']])
X_validation3 = scaler.fit_transform(X_validation[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 
                                                   'totalScanTimeInSeconds', 'New_Interaction_2', 'valuePerSecond']])
X_test3 = scaler.fit_transform(test[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 
                                            'totalScanTimeInSeconds', 'New_Interaction_2', 'valuePerSecond']])

## Building the best model
svm_md = SVC(probability = True, C = 0.1, gamma = 1, kernel = 'poly').fit(X_train3, Y_train)

## Predicting the likelihood of fraud on the validation and test data-frames
svm_val_preds = svm_md.predict_proba(X_validation3)[:, 1]
svm_test_preds = svm_md.predict_proba(X_test3)[:, 1]

## Computing the cost for the validation test predictions
print('Cost function score of SVM model:', dmc_cost_function.cost_function(Y_validation, svm_val_preds))

Cost function score of SVM model: -85.0


In [9]:
## 3. Using the prediction on the validation data-frame as inputs from parts (i)-(ii)-(iii) and the actual fraud values 
## from the validation data-frame to build a meta-learner to predict fraud

## Creating a data-frame with the base learners predictions
X_ensemble = pd.DataFrame({'RF': rf_val_preds, 'Ada': ada_val_preds, 'SVM': svm_val_preds})
X_ensemble_test = pd.DataFrame({'RF': rf_test_preds, 'Ada': ada_test_preds, 'SVM': svm_test_preds})

## Defining the custom scorer
my_scorer = make_scorer(dmc_cost_function.cost_function, greater_is_better = True, needs_proba = True)

## Defining the parameter dictionary
ensemble_param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7], 'min_samples_split': [5, 10, 15], 
                  'min_samples_leaf': [5, 10, 15]}

## Building the meta learner and performing hyper-parameter tuning
ensemble_grid_search = GridSearchCV(RandomForestClassifier(), ensemble_param_grid, cv = 3, scoring = my_scorer, 
                              n_jobs = -1).fit(X_ensemble, Y_validation)

## Identifying the optimal model
ensemble_md = ensemble_grid_search.best_estimator_

## Extracting the best hyper-parameters, best score, and the best cutoff value
print('Best hyper-parameters:\n', ensemble_grid_search.best_params_)
print('\nBest score:\n', ensemble_grid_search.best_score_)
print('\nCutoff value:', dmc_cost_function.cost_function_cutoff(Y_validation, ensemble_md.predict_proba(X_ensemble)[:, 1]))

## Finally, use the best meta-learner to predict the likelihood of fraud in the test data-frame
ensemble_test_preds = ensemble_md.predict_proba(X_ensemble_test)[:, 1]

## Exporting the test predictions as a csv file
likelihoods = pd.DataFrame({'Likelihoods': ensemble_test_preds})
likelihoods.to_csv('likelihoods.csv', index = False)

## Associated cutoff value = 

Best hyper-parameters:
 {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}

Best score:
 -15.0

Cutoff value: 0.47
