# Preprocessing and feature engineering

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
from numpy import inf

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df=pd.read_csv('/content/drive/My Drive/HW3/train.csv',sep='|')
test=pd.read_csv('/content/drive/My Drive/HW3/test.csv',sep='|')
test_true=pd.read_csv('/content/drive/My Drive/HW3/test_real.csv',sep='|')

In [4]:
df.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [5]:
df.info() #no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1879 entries, 0 to 1878
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   trustLevel                 1879 non-null   int64  
 1   totalScanTimeInSeconds     1879 non-null   int64  
 2   grandTotal                 1879 non-null   float64
 3   lineItemVoids              1879 non-null   int64  
 4   scansWithoutRegistration   1879 non-null   int64  
 5   quantityModifications      1879 non-null   int64  
 6   scannedLineItemsPerSecond  1879 non-null   float64
 7   valuePerSecond             1879 non-null   float64
 8   lineItemVoidsPerPosition   1879 non-null   float64
 9   fraud                      1879 non-null   int64  
dtypes: float64(4), int64(6)
memory usage: 146.9 KB


In [None]:
y=df['fraud']
X=df.drop(columns='fraud')

1. Number of items: This is an important feature because the customers could think, that if they have a lot of items in their shopping cart, it would be less obvious to not scan a few of these items.  


2. Total scans-no registration ratio: This could be an indication for fraud if there is an unsual ratio of attempted scans, which could be done to pretend to scan an item.

In [None]:
from numpy import NaN

In [None]:
X['number of items'] = X['totalScanTimeInSeconds']*X['scannedLineItemsPerSecond'] 

X['items scanned-void ratio'] = X['number of items'] / X['lineItemVoids']
X['items scanned-void ratio'] = X ['items scanned-void ratio'].apply(lambda x: 50 if x==inf else x)  # The largest number in this column, beside inf, is 30 so we relplace the inf with
                                                                                                   # a number, which is larger than 30, but also not too large to distort the data.

X['items scanned-no registration ratio'] = X['number of items'] / X['scansWithoutRegistration']
X['items scanned-no registration ratio'] = X['items scanned-no registration ratio'].apply(lambda x: 50 if x==inf else x) # same as above

X['grandTotal-no registration ratio'] =  X['grandTotal'] / X['scansWithoutRegistration'] 
X['grandTotal-no registration ratio'] = X['grandTotal-no registration ratio'].apply(lambda x: 0 if x=='NaN' else ( 100 if x==inf else x)) 
# the Na values are a in the rows, where zero was divided by zero, so setting these as zero seems reasonable 

In [None]:
test['number of items'] = test['totalScanTimeInSeconds']*test['scannedLineItemsPerSecond']

test['items scanned-void ratio'] = test['number of items'] / test['lineItemVoids']
test['items scanned-void ratio'] = test['items scanned-void ratio'].apply(lambda x: 50 if x==inf else x)

test['items scanned-no registration ratio'] = test['number of items'] / test['scansWithoutRegistration']
test['items scanned-no registration ratio'] = test['items scanned-no registration ratio'].apply(lambda x: 50 if x==inf else x)



test['grandTotal-no registration ratio'] = test['grandTotal'] / test['scansWithoutRegistration']
test['grandTotal-no registration ratio'] = test['grandTotal-no registration ratio'].apply(lambda x: 0 if x==NaN else ( 100 if x==inf else x))

In [None]:
from sklearn import preprocessing   # scaling the data for better results


min_max_scaler = preprocessing.MinMaxScaler()

x = X
x_scaled = min_max_scaler.fit_transform(x)
X = pd.DataFrame(x_scaled,columns=X.columns)

test_1 = test
test_scaled = min_max_scaler.fit_transform(test_1)
test = pd.DataFrame(test_scaled,columns=test.columns)

In [None]:
test['grandTotal-no registration ratio']=test['grandTotal-no registration ratio'].fillna(0)

# Training classifiers

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from skopt import BayesSearchCV

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.25)

In [None]:
rf = RandomForestClassifier(max_depth=4,n_estimators=375,max_features=7,criterion='entropy',class_weight='balanced',min_samples_leaf=4)

# I got these hyperparameters by manualy trying different values, gridsearch gave me values which lead to overfitting my training data and a low test score.

In [None]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=4, max_features=7,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=375,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)
print('RandomForest roc_auc test score:', roc_auc_score(y_test,y_pred))
print('RandomForest roc_auc train score:' ,roc_auc_score(y_train,y_pred_train))

RandomForest roc_auc test score: 0.937976437976438
RandomForest roc_auc train score: 0.964312546957175


In [None]:
X_svm=X.drop(columns = 'grandTotal-no registration ratio')        # the svm works better without this column 
test_svm=test.drop(columns = 'grandTotal-no registration ratio')

In [None]:
X_train2,X_test2,y_train2,y_test2 = train_test_split(X_svm,y,stratify=y,test_size=0.25)

In [None]:
svm=SVC(C = 55,gamma = 0.03, probability = True,class_weight = 'balanced')
svm.fit(X_train2,y_train2)

SVC(C=55, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
params={'gamma':[0.02,0.03,0.05,0.1,],'C':[45,50,55,60]}  # this is only the last itteration of using gridsearch, i narrowed down the parameters to these values

grid=GridSearchCV(svm,params,'roc_auc',cv=5)

In [None]:
grid.fit(X_train2,y_train2)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=55, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3, gamma=0.03,
                           kernel='rbf', max_iter=-1, probability=True,
                           random_state=None, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [45, 50, 55, 60],
                         'gamma': [0.02, 0.03, 0.05, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [None]:
grid.best_params_

{'C': 55, 'gamma': 0.1}

In [None]:
y_pred=svm.predict(X_test2)
y_pred_train=svm.predict(X_train2)
print('SVM roc-auc test score:',roc_auc_score(y_test2,y_pred))
print('SVM roc-auc train score:',roc_auc_score(y_train2,y_pred_train))

SVM roc-auc test score: 0.9898648648648648
SVM roc-auc train score: 0.9834710743801653


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import  DecisionTreeClassifier

In [None]:
ada=AdaBoostClassifier(learning_rate=1.1,n_estimators=550)

In [None]:
ada.fit(X_train,y_train)
y_pred_ada=ada.predict(X_test)
print(roc_auc_score(y_test,y_pred_ada))


0.9581600831600832


# Evaluation

In [None]:
svm.fit(X_svm,y)    # i used an ensemble of three different classifiers so that if one model makes a mistake,
                    # that prediction will will be outweighed by the other two  
rf.fit(X,y)
ada.fit(X,y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.1,
                   n_estimators=550, random_state=None)

In [None]:
svm_prob=svm.predict_proba(test_svm)
rf_proba=rf.predict_proba(test)
ada_proba=ada.predict_proba(test)

In [None]:
mean=(ada_proba+svm_prob+rf_proba)/3  

In [None]:
from numpy import around

In [None]:
mean_rounded=around(mean[:,1])

In [None]:
roc_auc_score(test_true,mean_rounded)

0.9782312685255157

In [None]:
from sklearn.metrics import confusion_matrix


In [None]:
confusion_matrix(test_true,mean_rounded)

array([[470275,   4119],
       [   827,  22900]])

According to the ROC-AUC score the model is quite good, but this is misleading. Looking at the confusion matrix we see, even though the model finds more than 97% of the fraudulent customers, we also accuse a bit less than 1% of the honest customers. Over time this could lead to a bad reputation of the store and a loss in customers. This could be fixed by lowering the decision boundary by a few percents, but this would increase the fals negatives and also lower the ROC AUC score. 