In [9]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import cost_function
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
pd.set_option('display.max_columns', 50)

# Connect to S3 and open the bucket that holds the data files
bucket_name = 'data-445-wagner'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)


def _read_bucket_csv(key):
    """Fetch ``key`` from ``bucket`` and parse it as a '|'-separated CSV.

    Returns, in order: the S3 object handle, the ``get()`` response, the
    response's 'Body' entry, and the parsed DataFrame.
    """
    handle = bucket.Object(key)
    response = handle.get()
    body = response.get('Body')
    return handle, response, body, pd.read_csv(body, sep = '|')


# Training data
file_key = "train.csv"
bucket_object, file_object, file_content_stream, train = _read_bucket_csv(file_key)

# Test data
file_key1 = "test.csv"
bucket_object1, file_object1, file_content_stream1, test = _read_bucket_csv(file_key1)

# Preview the first rows of the test set
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [10]:
# Rebuild the interaction flags and product features from past assignments on the train dataset.
# NOTE(review): sibling flags test the same threshold with both <= and >=, so a row that sits
# exactly on a threshold (e.g. totalScanTimeInSeconds == 895.0) is flagged by both -- confirm intended.

# Raw columns used by the threshold tests
trust = train['trustLevel']
scan_rate = train['scannedLineItemsPerSecond']
scan_time = train['totalScanTimeInSeconds']
voids = train['lineItemVoids']

# Threshold masks shared by several flags
trust_le_1_5 = trust <= 1.5
trust_1_5_to_2_5 = (trust >= 1.5) & (trust <= 2.5)
rate_le_012 = scan_rate <= 0.012
rate_ge_012 = scan_rate >= 0.012

# trustLevel <= 1.5 branch, split on scan rate and then on voids / scan time
train['Interaction_1'] = np.where(trust_le_1_5 & rate_le_012 & (voids <= 10.5), 1, 0)
train['Interaction_2'] = np.where(trust_le_1_5 & rate_le_012 & (voids >= 10.5), 1, 0)
train['Interaction_3'] = np.where(trust_le_1_5 & rate_ge_012 & (scan_time <= 895.0), 1, 0)
train['Interaction_4'] = np.where(trust_le_1_5 & rate_ge_012 & (scan_time >= 895.0), 1, 0)

# 1.5 <= trustLevel <= 2.5 branch, split on scan time
train['Interaction_5'] = np.where(trust_1_5_to_2_5 & (scan_time <= 1298.0), 1, 0)
train['Interaction_6'] = np.where(trust_1_5_to_2_5 & (scan_time >= 1298.0), 1, 0)

# trustLevel >= 2.5 (the extra >= 1.5 test in the original expression is implied by this one)
train['Interaction_7'] = np.where(trust >= 2.5, 1, 0)

# Second-level flags built from the 0/1 interaction columns above
i3_off = train['Interaction_3'] <= 0.5
i4_off = train['Interaction_4'] <= 0.5
train['newInteraction_1'] = np.where(i4_off & i3_off & (train['Interaction_6'] <= 0.5), 1, 0)
train['newInteraction_2'] = np.where(i4_off & i3_off & (train['Interaction_6'] >= 0.5), 1, 0)
train['newInteraction_3'] = np.where(i4_off & (train['Interaction_3'] >= 0.5) & (train['lineItemVoidsPerPosition'] <= 0.429), 1, 0)

# Product features
train['feature1'] = train['Interaction_4'] * trust
train['feature2'] = train['Interaction_4'] * train['Interaction_7']
train['feature3'] = trust * train['Interaction_7']

In [11]:
# Rebuild the interaction flags and product features from past assignments on the test dataset.
# NOTE(review): sibling flags test the same threshold with both <= and >=, so a row that sits
# exactly on a threshold (e.g. totalScanTimeInSeconds == 895.0) is flagged by both -- confirm intended.

# Raw columns used by the threshold tests
trust = test['trustLevel']
scan_rate = test['scannedLineItemsPerSecond']
scan_time = test['totalScanTimeInSeconds']
voids = test['lineItemVoids']

# Threshold masks shared by several flags
trust_le_1_5 = trust <= 1.5
trust_1_5_to_2_5 = (trust >= 1.5) & (trust <= 2.5)
rate_le_012 = scan_rate <= 0.012
rate_ge_012 = scan_rate >= 0.012

# trustLevel <= 1.5 branch, split on scan rate and then on voids / scan time
test['Interaction_1'] = np.where(trust_le_1_5 & rate_le_012 & (voids <= 10.5), 1, 0)
test['Interaction_2'] = np.where(trust_le_1_5 & rate_le_012 & (voids >= 10.5), 1, 0)
test['Interaction_3'] = np.where(trust_le_1_5 & rate_ge_012 & (scan_time <= 895.0), 1, 0)
test['Interaction_4'] = np.where(trust_le_1_5 & rate_ge_012 & (scan_time >= 895.0), 1, 0)

# 1.5 <= trustLevel <= 2.5 branch, split on scan time
test['Interaction_5'] = np.where(trust_1_5_to_2_5 & (scan_time <= 1298.0), 1, 0)
test['Interaction_6'] = np.where(trust_1_5_to_2_5 & (scan_time >= 1298.0), 1, 0)

# trustLevel >= 2.5 (the extra >= 1.5 test in the original expression is implied by this one)
test['Interaction_7'] = np.where(trust >= 2.5, 1, 0)

# Second-level flags built from the 0/1 interaction columns above
i3_off = test['Interaction_3'] <= 0.5
i4_off = test['Interaction_4'] <= 0.5
test['newInteraction_1'] = np.where(i4_off & i3_off & (test['Interaction_6'] <= 0.5), 1, 0)
test['newInteraction_2'] = np.where(i4_off & i3_off & (test['Interaction_6'] >= 0.5), 1, 0)
test['newInteraction_3'] = np.where(i4_off & (test['Interaction_3'] >= 0.5) & (test['lineItemVoidsPerPosition'] <= 0.429), 1, 0)

# Product features
test['feature1'] = test['Interaction_4'] * trust
test['feature2'] = test['Interaction_4'] * test['Interaction_7']
test['feature3'] = trust * test['Interaction_7']

In [12]:
# Split the training frame into the target ('fraud') and the predictor columns (everything else)
Y_train = train['fraud']
X_train = train.drop('fraud', axis = 1)

In [None]:
#Top 7 variables from homework 6

#1 Interaction_4
#2 feature1
#3 newInteraction_1
#4 scannedLineItemsPerSecond
#5 totalScanTimeInSeconds
#6 valuePerSecond
#7 newInteraction_3

In [22]:
# Tune an AdaBoost model to predict fraud with the top 5, 6, and 7 features

# AdaBoost grid: boosting settings plus settings forwarded to the base decision tree
ada_param_grid = {
    'n_estimators': [100, 300, 500],
    'base_estimator__min_samples_split': [5, 10, 15],
    'base_estimator__min_samples_leaf': [5, 10, 15],
    'base_estimator__max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1, 1],
}

# Scorer wrapping cost_function.cost_function; it is given predicted probabilities
model_score = make_scorer(cost_function.cost_function, greater_is_better = True, needs_proba = True)

# Features in ranked order; the first 5, 6 and 7 entries are the three candidate subsets
ranked_features = ['Interaction_4', 'feature1', 'newInteraction_1', 'scannedLineItemsPerSecond',
                   'totalScanTimeInSeconds', 'valuePerSecond', 'newInteraction_3']


def _tune_adaboost(n_top):
    """Run the 3-fold AdaBoost grid search on the first ``n_top`` ranked features.

    Prints the best parameters and best score, then returns
    ``(features, search)``: the column subset of ``X_train`` that was used and
    the fitted ``GridSearchCV`` object.
    """
    features = X_train[ranked_features[:n_top]]
    booster = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
    search = GridSearchCV(booster, ada_param_grid, cv = 3, scoring = model_score, n_jobs = -1).fit(features, Y_train)

    # Report the winning configuration for this feature subset
    print('The best parameters for an AdaBoostClassifier Model with the top {} variables are- \n'.format(n_top), search.best_params_)
    print('\nBest Score:\n', search.best_score_)
    return features, search


# Top 5 variables
X_train1, AdaBoost_GridSearch = _tune_adaboost(5)

# Top 6 variables
X_train1, AdaBoost_GridSearch1 = _tune_adaboost(6)

# Top 7 variables
X_train1, AdaBoost_GridSearch2 = _tune_adaboost(7)

The best parameters for an AdaBoostClassifier Model with the top 5 variables are- 
 {'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 10, 'base_estimator__min_samples_split': 5, 'learning_rate': 0.01, 'n_estimators': 300}

Best Score:
 -1166.6666666666667
The best parameters for an AdaBoostClassifier Model with the top 6 variables are- 
 {'base_estimator__max_depth': 5, 'base_estimator__min_samples_leaf': 10, 'base_estimator__min_samples_split': 5, 'learning_rate': 1, 'n_estimators': 500}

Best Score:
 -666.6666666666666
The best parameters for an AdaBoostClassifier Model with the top 7 variables are- 
 {'base_estimator__max_depth': 5, 'base_estimator__min_samples_leaf': 5, 'base_estimator__min_samples_split': 10, 'learning_rate': 1, 'n_estimators': 100}

Best Score:
 -166.66666666666666


In [19]:
# Tune a RandomForest model to predict fraud with the top 5, 6, and 7 features

# Random forest hyper-parameter grid
rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [5, 10, 15],
}

# Scorer wrapping cost_function.cost_function; it is given predicted probabilities
model_score = make_scorer(cost_function.cost_function, greater_is_better = True, needs_proba = True)

# Features in ranked order; the first 5, 6 and 7 entries are the three candidate subsets
ranked_features = ['Interaction_4', 'feature1', 'newInteraction_1', 'scannedLineItemsPerSecond',
                   'totalScanTimeInSeconds', 'valuePerSecond', 'newInteraction_3']


def _tune_random_forest(n_top):
    """Run the 3-fold RandomForest grid search on the first ``n_top`` ranked features.

    Parameters
    ----------
    n_top : int
        Number of leading entries of ``ranked_features`` to train on.

    Returns
    -------
    tuple
        ``(features, search)``: the column subset of ``X_train`` that was used
        and the fitted ``GridSearchCV`` object.
    """
    features = X_train[ranked_features[:n_top]]
    search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv = 3, scoring = model_score, n_jobs = -1).fit(features, Y_train)

    # The report names the estimator that was actually tuned (a RandomForestClassifier),
    # so these results cannot be mistaken for the AdaBoost ones.
    print('The best parameters for a RandomForestClassifier Model with the top {} variables are- \n'.format(n_top), search.best_params_)
    print('\nBest Score:\n', search.best_score_)
    return features, search


# NOTE(review): RandomForestClassifier is created without random_state, so repeated runs can
# select different parameters/scores -- consider fixing a seed.

# Top 5 variables
X_train1, RandomForrest_GridSearch = _tune_random_forest(5)

# Top 6 variables
X_train1, RandomForrest_GridSearch1 = _tune_random_forest(6)

# Top 7 variables
X_train1, RandomForrest_GridSearch2 = _tune_random_forest(7)

The best parameters for an AdaBoostClassifier Model with the top 5 variables are- 
 {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 300}

Best Score:
 -3666.6666666666665
The best parameters for an AdaBoostClassifier Model with the top 6 variables are- 
 {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 500}

Best Score:
 -5666.666666666667
The best parameters for an AdaBoostClassifier Model with the top 7 variables are- 
 {'max_depth': 7, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 500}

Best Score:
 -6666.666666666667


In [21]:
# Tune an SVC model to predict fraud with the top 5, 6, and 7 features

# SVC hyper-parameter grid
svc_param_grid = {
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
}

# Scorer wrapping cost_function.cost_function; it is given predicted probabilities
model_score = make_scorer(cost_function.cost_function, greater_is_better = True, needs_proba = True)

# Features in ranked order; the first 5, 6 and 7 entries are the three candidate subsets
ranked_features = ['Interaction_4', 'feature1', 'newInteraction_1', 'scannedLineItemsPerSecond',
                   'totalScanTimeInSeconds', 'valuePerSecond', 'newInteraction_3']

# One scaler object, re-fit on each feature subset (after the cell it is fit on the top-7 subset)
scaler = MinMaxScaler(feature_range = (0, 1))


def _tune_svc(n_top):
    """Scale the first ``n_top`` ranked features to [0, 1] and grid-search an SVC on them.

    Prints the best parameters and best score, then returns
    ``(scaled, search)``: the scaled feature array and the fitted
    ``GridSearchCV`` object.
    """
    # NOTE(review): the scaler is fit on all of X_train before the 3-fold split, so each
    # validation fold influences the scaling -- consider moving the scaler into a Pipeline.
    scaled = scaler.fit_transform(X_train[ranked_features[:n_top]])
    search = GridSearchCV(SVC(probability = True), svc_param_grid, cv = 3, scoring = model_score, n_jobs = -1).fit(scaled, Y_train)

    # Report the winning configuration for this feature subset
    print('The best parameters for an SVC Model with the top {} variables are- \n'.format(n_top), search.best_params_)
    print('\nBest Score:\n', search.best_score_)
    return scaled, search


# Top 5 variables
X_train1, SVC_GridSearch = _tune_svc(5)

# Top 6 variables
X_train1, SVC_GridSearch1 = _tune_svc(6)

# Top 7 variables
X_train1, SVC_GridSearch2 = _tune_svc(7)

The best parameters for an SVC Model with the top 5 variables are- 
 {'C': 0.01, 'gamma': 0.01, 'kernel': 'poly'}

Best Score:
 -15666.666666666666
The best parameters for an SVC Model with the top 6 variables are- 
 {'C': 1, 'gamma': 0.1, 'kernel': 'poly'}

Best Score:
 -15166.666666666666
The best parameters for an SVC Model with the top 7 variables are- 
 {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}

Best Score:
 -15166.666666666666


In [None]:
#After comparing all the models, every best cross-validated score is negative, so either I made a mistake somewhere or my engineered variables are weak predictors. Either way,
# the best model to use is the AdaBoost model with the top 7 variables, which has a best score of about -166.67.