# Homework Assignment 6

## Topics: RFE (recursive feature elimination), feature selection

In [1]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import precision_recall_cutoff as prc
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

## 1. Using pandas to read the data files

## Defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448-bucket-callaghan'
bucket = s3.Bucket(bucket_name)

file_key = 'train.csv'
file_key2 = 'test.csv'

bucket_object = bucket.Object(file_key)
bucket_object2 = bucket.Object(file_key2)

file_object = bucket_object.get()
file_object2 = bucket_object2.get()

file_content_stream = file_object.get('Body')
file_content_stream2 = file_object2.get('Body')

train = pd.read_csv(file_content_stream, sep = '|')
test = pd.read_csv(file_content_stream2, sep = '|')

train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [2]:
## Creating variables from previous homework assignments

## Training set:

## Variable 1 (from decision tree)
train['Interaction_1'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] <= 0.012) & 
                                  (train['lineItemVoids'] <= 10.5), 0, 1)
## Variable 2 (from decision tree)
train['Interaction_2'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] > 0.012) & 
                                  (train['totalScanTimeInSeconds'] <= 895.0), 0, 1)
## Variable 3 (from decision tree)
train['Interaction_3'] = np.where((train['trustLevel'] > 1.5) & (train['grandTotal'] <= 99.145) & 
                                  train['trustLevel'] <= 2.5, 1, 0)
## Variable 4 (from decision tree)
train['Interaction_4'] = np.where((train['trustLevel'] > 1.5) & (train['grandTotal'] > 99.145) & 
                                  train['valuePerSecond'] <= 0.06, 1, 0)
## Variable 5 - Low trustLevel (all frauds came from trustLevel = 1 or 2)
train['lowTrust'] = np.where(train['trustLevel'] <= 2, 1, 0)

## Variable 6 - Made a quantity modification
train['madeModification'] = np.where(train['quantityModifications'] > 0, 1, 0)

## Variable 7 - Attempted a scan without registration
train['madeScansWithoutRegistration'] = np.where(train['scansWithoutRegistration'] > 0, 1, 0)

## Variable 8 - High or low totalScanTimeInSeconds (huge differnece in mean and median values for fraud/not fraud in this field)
train['lowTotalScanTime'] = np.where(train['totalScanTimeInSeconds'] < 1000, 1, 0)

## Varibales from strong heredity principle
train['Heredity_Feature_1'] = train['trustLevel'] * train['lowTrust']
train['Heredity_Feature_2'] = train['trustLevel'] * train['scannedLineItemsPerSecond']
train['Heredity_Feature_3'] = train['lowTrust'] * train['scannedLineItemsPerSecond']

## Heredity_Feature_3: all observations less than 0.012 are not fraud in this tree
train['New_Interaction_1'] = np.where(train['Heredity_Feature_3'] <= 0.012, 1, 0)

## Lots of positove observations when Heredity_Feature_3 > 0.012, totalScanTimeInSeconds > 1298.0, and trustLevel <= 1.5
train['New_Interaction_2'] = np.where((train['Heredity_Feature_3'] > 0.012) & (train['totalScanTimeInSeconds'] > 1298.0) 
                                      & (train['trustLevel'] < 1.5), 1, 0)

## Mostly all negative observations when Heredity_Feature_3 > 0.012, totalScanTimeInSeconds <= 1298.0, and Heredity_Feature_1 > 0.119
train['New_Interaction_3'] = np.where((train['Heredity_Feature_3'] > 0.012) & (train['totalScanTimeInSeconds'] <= 1298.0) 
                                      & (train['Heredity_Feature_1'] > 1.5), 1, 0)

In [3]:
## Testing set:

## Variable 1
test['Interaction_1'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] <= 0.012) & 
                                  (test['lineItemVoids'] <= 10.5), 0, 1)
## Variable 2
test['Interaction_2'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] > 0.012) & 
                                  (test['totalScanTimeInSeconds'] <= 895.0), 0, 1)
## Variable 3
test['Interaction_3'] = np.where((test['trustLevel'] > 1.5) & (test['grandTotal'] <= 99.145) & 
                                  test['trustLevel'] <= 2.5, 1, 0)
## Variable 4
test['Interaction_4'] = np.where((test['trustLevel'] > 1.5) & (test['grandTotal'] > 99.145) & 
                                  test['valuePerSecond'] <= 0.06, 1, 0)
## Variable 5 - Low trustLevel
test['lowTrust'] = np.where(test['trustLevel'] <= 2, 1, 0)

## Variable 6 - Made a quantity modification
test['madeModification'] = np.where(test['quantityModifications'] > 0, 1, 0)

## Variable 7 - Attempted a scan without registration
test['madeScansWithoutRegistration'] = np.where(test['scansWithoutRegistration'] > 0, 1, 0)

## Variable 8 - High or low totalScanTimeInSeconds
test['lowTotalScanTime'] = np.where(test['totalScanTimeInSeconds'] < 1000, 1, 0)

## Varibales from strong heredity principle
test['Heredity_Feature_1'] = test['trustLevel'] * test['lowTrust']
test['Heredity_Feature_2'] = test['trustLevel'] * test['scannedLineItemsPerSecond']
test['Heredity_Feature_3'] = test['lowTrust'] * test['scannedLineItemsPerSecond']

test['New_Interaction_1'] = np.where(test['Heredity_Feature_3'] <= 0.012, 1, 0)

test['New_Interaction_2'] = np.where((test['Heredity_Feature_3'] > 0.012) & (test['totalScanTimeInSeconds'] > 1298.0) 
                                      & (test['trustLevel'] < 1.5), 1, 0)

test['New_Interaction_3'] = np.where((test['Heredity_Feature_3'] > 0.012) & (test['totalScanTimeInSeconds'] <= 1298.0) 
                                      & (test['Heredity_Feature_1'] > 1.5), 1, 0)

In [4]:
## 2. Using the train data-frame to do the following:

## i) Splitting the train data-frame into training (80%) and testing (20%)

## Defining the input and target variables
X = train.drop(columns = ['fraud'])
Y = train['fraud']

## Defining empty list to store results
results_rfe = []
logit_support = []
rf_support = []
ada_support = []

## Repeating steps 100 times:
for i in tqdm(range(0, 100)):
    
    ## Splitting the data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    
    ## Runing RFE (recursive feature elimination) with logistic regression as a base algorithm (with n_features_to_select = 5)
    logit_rfe = RFE(estimator = LogisticRegression(max_iter = 10000), n_features_to_select = 5).fit(X_train, Y_train)
    
    ## Appending the features to be selected
    results_rfe.append(logit_rfe.support_)
    logit_support.append(logit_rfe.support_)
    
    
    ## Runing RFE (recursive feature elimination) with random forest (with 500 trees and max depth equal to 3) as a base algorithm 
    ## (with n_features_to_select = 5)
    rf_rfe = RFE(estimator = RandomForestClassifier(max_depth = 3, n_estimators = 500), n_features_to_select = 5).fit(X_train, Y_train)
    
    ## Appending the features to be selected
    results_rfe.append(rf_rfe.support_)
    rf_support.append(rf_rfe.support_)
    
    
    ## Runing RFE (recursive feature elimination) with AdaBoost (with 500 trees, max depth equal to 3, and learning rate equal to 0.01) 
    ## as a base algorithm (with n_features_to_select = 5)
    ada_rfe = RFE(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, 
                                                learning_rate = 0.01), n_features_to_select = 5).fit(X_train, Y_train)
    
    ## Appending the features to be selected
    results_rfe.append(ada_rfe.support_)
    ada_support.append(ada_rfe.support_)

100%|██████████| 100/100 [1:37:51<00:00, 58.72s/it]


In [5]:
## Overall:

## Changing the results list to a data-frame
results_rfe = pd.DataFrame(results_rfe, columns = X.columns)

## Identifying features to be selected
results = pd.DataFrame(100 * results_rfe.apply(np.sum, axis = 0) / results_rfe.shape[0], columns = ['Percentage']).reset_index()

## Printing results
results.sort_values('Percentage', ascending = False)

Unnamed: 0,index,Percentage
0,trustLevel,70.0
20,New_Interaction_1,66.333333
19,Heredity_Feature_3,66.333333
1,totalScanTimeInSeconds,66.333333
21,New_Interaction_2,44.666667
7,valuePerSecond,33.333333
3,lineItemVoids,29.333333
22,New_Interaction_3,27.0
4,scansWithoutRegistration,26.333333
8,lineItemVoidsPerPosition,19.333333


In [6]:
## Logistic Regression feature selection:

## Changing the results list to a data-frame
logit_support = pd.DataFrame(logit_support, columns = X.columns)

## Identifying features to be selected
logit_support = pd.DataFrame(100 * logit_support.apply(np.sum, axis = 0) / logit_support.shape[0], columns = ['Percentage']).reset_index()

## Printing results
logit_support.sort_values('Percentage', ascending = False)

Unnamed: 0,index,Percentage
20,New_Interaction_1,100.0
7,valuePerSecond,87.0
22,New_Interaction_3,81.0
0,trustLevel,67.0
16,lowTotalScanTime,53.0
8,lineItemVoidsPerPosition,39.0
21,New_Interaction_2,34.0
9,Interaction_1,32.0
10,Interaction_2,7.0
4,scansWithoutRegistration,0.0


In [7]:
## Random Forest Classifier feature selection:

## Changing the results list to a data-frame
rf_support = pd.DataFrame(rf_support, columns = X.columns)

## Identifying features to be selected
rf_support = pd.DataFrame(100 * rf_support.apply(np.sum, axis = 0) / rf_support.shape[0], columns = ['Percentage']).reset_index()

## Printing results
rf_support.sort_values('Percentage', ascending = False)

Unnamed: 0,index,Percentage
0,trustLevel,100.0
21,New_Interaction_2,100.0
19,Heredity_Feature_3,100.0
1,totalScanTimeInSeconds,100.0
20,New_Interaction_1,99.0
18,Heredity_Feature_2,1.0
12,Interaction_4,0.0
17,Heredity_Feature_1,0.0
16,lowTotalScanTime,0.0
15,madeScansWithoutRegistration,0.0


In [8]:
## AdaBoost Classifier feature selection:

## Changing the results list to a data-frame
ada_support = pd.DataFrame(ada_support, columns = X.columns)

## Identifying features to be selected
ada_support = pd.DataFrame(100 * ada_support.apply(np.sum, axis = 0) / ada_support.shape[0], columns = ['Percentage']).reset_index()

## Printing results
ada_support.sort_values('Percentage', ascending = False)

Unnamed: 0,index,Percentage
19,Heredity_Feature_3,99.0
1,totalScanTimeInSeconds,99.0
3,lineItemVoids,88.0
4,scansWithoutRegistration,79.0
2,grandTotal,55.0
0,trustLevel,43.0
8,lineItemVoidsPerPosition,19.0
7,valuePerSecond,13.0
18,Heredity_Feature_2,4.0
5,quantityModifications,1.0


#### Top-5 Variables: trustLevel, New_Interaction_1, Heredity_Feature_3, totalScanTimeInSeconds, New_Interaction_2

## Modeling with all variables vs. Top-5 overall from RFE

In [15]:
## All variables

## Defining the input and target variables
X = train.drop(columns = ['fraud'])
Y = train['fraud']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Building the models
rf_md = RandomForestClassifier(max_depth = 5, n_estimators = 500).fit(X_train, Y_train)

## Predicting on the test set
rf_preds = rf_md.predict_proba(X_test)[:, 1]

## Using the precision-recall curve function to extract prediction labels
rf_labels = prc.precision_recall_cutoff(Y_test, rf_preds)

## Computing the classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       355
           1       0.78      0.86      0.82        21

    accuracy                           0.98       376
   macro avg       0.89      0.92      0.90       376
weighted avg       0.98      0.98      0.98       376



In [16]:
## Top-5 variables

## Defining the input and target variables
X = train[['trustLevel', 'New_Interaction_1', 'Heredity_Feature_3', 'totalScanTimeInSeconds', 'New_Interaction_2']]
Y = train['fraud']

## Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Building the models
rf_md = RandomForestClassifier(max_depth = 5, n_estimators = 500).fit(X_train, Y_train)

## Predicting on the test set
rf_preds = rf_md.predict_proba(X_test)[:, 1]

## Using the precision-recall curve function to extract prediction labels
rf_labels = prc.precision_recall_cutoff(Y_test, rf_preds)

## Computing the classification report
print(classification_report(Y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       355
           1       0.82      0.86      0.84        21

    accuracy                           0.98       376
   macro avg       0.90      0.92      0.91       376
weighted avg       0.98      0.98      0.98       376



#### Based on the classification reports, we can see that the Radom Forest Classifier model with the top-5 input variables had slightly better prediction power over the model with all possible input variables. 