In [1]:
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import cost_function
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
pd.set_option('display.max_columns', 50)
# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-445-wagner'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "train.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
train = pd.read_csv(file_content_stream, sep = '|')

# Defining the file to be read from s3 bucket
file_key1 = "test.csv"

bucket_object1 = bucket.Object(file_key1)
file_object1 = bucket_object1.get()
file_content_stream1 = file_object1.get('Body')

# Reading the csv file
test = pd.read_csv(file_content_stream1, sep = '|')
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


In [2]:
#Creating interactions and features from past assignments for train dataset
train['Interaction_1'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] <= 0.012) & (train['lineItemVoids'] <= 10.5), 1, 0)
train['Interaction_2'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] <= 0.012) & (train['lineItemVoids'] >= 10.5), 1, 0)
train['Interaction_3'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] >= 0.012) & (train['totalScanTimeInSeconds'] <= 895.0), 1, 0)
train['Interaction_4'] = np.where((train['trustLevel'] <= 1.5) & (train['scannedLineItemsPerSecond'] >= 0.012) & (train['totalScanTimeInSeconds'] >= 895.0), 1, 0)
train['Interaction_5'] = np.where((train['trustLevel'] >= 1.5) & (train['trustLevel'] <= 2.5) & (train['totalScanTimeInSeconds'] <= 1298.0), 1, 0)
train['Interaction_6'] = np.where((train['trustLevel'] >= 1.5) & (train['trustLevel'] <= 2.5) & (train['totalScanTimeInSeconds'] >= 1298.0), 1, 0)
train['Interaction_7'] = np.where((train['trustLevel'] >= 1.5) & (train['trustLevel'] >= 2.5), 1, 0)
train['newInteraction_1'] = np.where((train['Interaction_4'] <= 0.5) & (train['Interaction_3'] <= 0.5) & (train['Interaction_6'] <= 0.5), 1, 0)
train['newInteraction_2'] = np.where((train['Interaction_4'] <= 0.5) & (train['Interaction_3'] <= 0.5) & (train['Interaction_6'] >= 0.5), 1, 0)
train['newInteraction_3'] = np.where((train['Interaction_4'] <= 0.5) & (train['Interaction_3'] >= 0.5) & (train['lineItemVoidsPerPosition'] <= 0.429), 1, 0)
train['feature1'] = train['Interaction_4'] * train['trustLevel']
train['feature2'] = train['Interaction_4'] * train['Interaction_7']
train['feature3'] = train['trustLevel'] * train['Interaction_7']

In [3]:
#creating variables from past assignments for the test dataset
test['Interaction_1'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] <= 0.012) & (test['lineItemVoids'] <= 10.5), 1, 0)
test['Interaction_2'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] <= 0.012) & (test['lineItemVoids'] >= 10.5), 1, 0)
test['Interaction_3'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] >= 0.012) & (test['totalScanTimeInSeconds'] <= 895.0), 1, 0)
test['Interaction_4'] = np.where((test['trustLevel'] <= 1.5) & (test['scannedLineItemsPerSecond'] >= 0.012) & (test['totalScanTimeInSeconds'] >= 895.0), 1, 0)
test['Interaction_5'] = np.where((test['trustLevel'] >= 1.5) & (test['trustLevel'] <= 2.5) & (test['totalScanTimeInSeconds'] <= 1298.0), 1, 0)
test['Interaction_6'] = np.where((test['trustLevel'] >= 1.5) & (test['trustLevel'] <= 2.5) & (test['totalScanTimeInSeconds'] >= 1298.0), 1, 0)
test['Interaction_7'] = np.where((test['trustLevel'] >= 1.5) & (test['trustLevel'] >= 2.5), 1, 0)
test['newInteraction_1'] = np.where((test['Interaction_4'] <= 0.5) & (test['Interaction_3'] <= 0.5) & (test['Interaction_6'] <= 0.5), 1, 0)
test['newInteraction_2'] = np.where((test['Interaction_4'] <= 0.5) & (test['Interaction_3'] <= 0.5) & (test['Interaction_6'] >= 0.5), 1, 0)
test['newInteraction_3'] = np.where((test['Interaction_4'] <= 0.5) & (test['Interaction_3'] >= 0.5) & (test['lineItemVoidsPerPosition'] <= 0.429), 1, 0)
test['feature1'] = test['Interaction_4'] * test['trustLevel']
test['feature2'] = test['Interaction_4'] * test['Interaction_7']
test['feature3'] = test['trustLevel'] * test['Interaction_7']

In [4]:
#Defining variables
X = train.drop(columns = ['fraud'])
Y = train['fraud']

#splitting data
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, stratify = Y)
X_train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,Interaction_1,Interaction_2,Interaction_3,Interaction_4,Interaction_5,Interaction_6,Interaction_7,newInteraction_1,newInteraction_2,newInteraction_3,feature1,feature2,feature3
58,2,351,50.75,1,7,0,0.045584,0.144587,0.0625,0,0,0,0,1,0,0,1,0,0,0,0,0
1116,4,1399,26.46,2,2,4,0.019299,0.018914,0.074074,0,0,0,0,0,0,1,1,0,0,0,0,4
1622,4,1785,65.03,4,3,4,0.007843,0.036431,0.285714,0,0,0,0,0,0,1,1,0,0,0,0,4
782,4,479,29.84,4,1,3,0.002088,0.062296,4.0,0,0,0,0,0,0,1,1,0,0,0,0,4
1106,2,93,85.04,2,3,4,0.032258,0.914409,0.666667,0,0,0,0,1,0,0,1,0,0,0,0,0


In [9]:
#using best model with top 7 variables from homework 7 part 2i

#Top 5 variables for training test and validation
X_train_ada = X_train[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]
X_val_ada = X_validation[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]
X_test_ada= test[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]

#Running GridSearchCV with 3 folds with top 7 variables
AdaBoost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 5, min_samples_split = 10), learning_rate = 1, n_estimators = 100).fit(X_train_ada, Y_train)

# Predicting fraud
ada_val_predictions = AdaBoost.predict_proba(X_val_ada)[:, 1]
ada_test_predictions = AdaBoost.predict_proba(X_test_ada)[:, 1]

# Printing labels
print('AdaBoost Score is:', cost_function.cost_function(Y_validation, ada_val_predictions))

AdaBoost Score is: -7500.0


In [10]:
#using best model with top 5 variables from homework 7 part 2ii

#Top 5 variables for training test and validation
X_train_rf = X_train[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]
X_val_rf = X_validation[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]
X_test_rf = test[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]]

#Running GridSearchCV with 3 folds with top 5 variables
RandomForrest = RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 5, n_estimators = 300).fit(X_train_rf, Y_train)

# Predicting fraud
rf_val_predictions = RandomForrest.predict_proba(X_val_rf)[:, 1]
rf_test_predictions = RandomForrest.predict_proba(X_test_rf)[:, 1]

print('RandomForrest Score is:', cost_function.cost_function(Y_validation, rf_val_predictions))


RandomForrest Score is: -6000.0


In [11]:
#using best model with top 6 variables from homework 7 part 2iii

# Defining a MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))

#Top 5 variables for training test and validation
X_train_svm = scaler.fit_transform(X_train[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]])
X_val_svm = scaler.fit_transform(X_validation[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]])
X_test_svm = scaler.fit_transform(test[['Interaction_4','feature1', 'newInteraction_1', 'scannedLineItemsPerSecond', 'totalScanTimeInSeconds' ]])

#Running GridSearchCV with 3 folds with top 5 variables
SVC_MD = SVC(probability = True, C = 1, gamma = 0.1, kernel = 'poly').fit(X_train_svc, Y_train)

# Predicting fraud
svm_val_predictions = SVC_MD.predict_proba(X_val_svc)[:, 1]
svm_test_predictions = SVC_MD.predict_proba(X_test_svc)[:, 1]

print('SVM Score is:', cost_function.cost_function(Y_validation, svm_val_predictions))


SVM Score is: -7500.0


In [19]:
#Creating data-frame 
X_ensemble = pd.DataFrame({'RF': rf_val_predictions,'ADA': ada_val_predictions,'SVM': svm_val_predictions})
X_ensemble_test = pd.DataFrame({'RF': rf_test_predictions, 'Ada': ada_test_predictions, 'SVM': svm_test_predictions})

#Score label
model_score = make_scorer(cost_function.cost_function, greater_is_better = True, needs_proba = True)

# Defining the parameter dictionary
Ensemble_param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7], 'min_samples_split': [5, 10, 15], 'min_samples_leaf': [5, 10, 15]}

# Building hyperparameters
Ensemble_grid_search = GridSearchCV(RandomForestClassifier(), Ensemble_param_grid, cv = 3, scoring = model_score, n_jobs = -1).fit(X_ensemble, Y_validation)

#dentifying the optimal model
ensemble_md = Ensemble_grid_search.best_estimator_

#Printing parameters
print('Best hyper-parameters:', Ensemble_grid_search.best_params_)

#Printing score
print('Best score:', Ensemble_grid_search.best_score_)

#Printing cutoff
print('Cutoff value is:', cost_function.cost_function_cutoff(Y_validation, ensemble_md.predict_proba(X_ensemble)[:, 1]))

Best hyper-parameters: {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
Best score: -500.0
Cutoff value is: 0.53


In [20]:
#Predicting likelyhood
best_predictions = ensemble_md.predict_proba(X_ensemble_test)[:, 1]
#exporting csv
likelihoods = pd.DataFrame({'Likelihoods': best_predictions})
likelihoods.to_csv('homework8_likelihoods.csv', index = False)