In [1]:
import warnings 
warnings.filterwarnings('ignore')

# for data analytics
import pandas as pd
import numpy as np
from collections import Counter

# for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# for data preparation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# imblean provides tools for us to deal with imbalanced class sizes
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

# example of random oversampling to balance the class distribution
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

from imblearn.over_sampling import ADASYN

# machine learning models
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# for evaluation of machine learning models
from sklearn import metrics

## Loading Files

In [2]:
X_train = pd.read_csv('../datasets/glove/train_data_imputed_FINAL.csv')
X_test = pd.read_csv('../datasets/glove/test_data_imputed_FINAL.csv')
y_train = pd.read_csv('../datasets/glove/y_train_FINAL.csv')
y_test = pd.read_csv('../datasets/glove/y_test_FINAL.csv')

In [3]:
X_train_wordvec = pd.read_csv('../datasets/word2vec/train_data_imputed_FINAL.csv')
X_test_wordvec = pd.read_csv('../datasets/word2vec/test_data_imputed_FINAL.csv')
y_train_wordvec = pd.read_csv('../datasets/word2vec/y_train_FINAL.csv')['fraudulent']
y_test_wordvec = pd.read_csv('../datasets/word2vec/y_test_FINAL.csv')['fraudulent']

In [4]:
def evaluate(y_train, y_test, y_train_pred, y_test_pred):
    train_results = {}
    train_results['Accuracy'] = metrics.accuracy_score(y_train, y_train_pred)
    train_results['Precision'] = metrics.precision_score(y_train, y_train_pred)
    train_results['Recall'] = metrics.recall_score(y_train, y_train_pred)
    train_results['F1'] = metrics.f1_score(y_train, y_train_pred)

    test_results = {}
    test_results['Accuracy'] = metrics.accuracy_score(y_test, y_test_pred)
    test_results['Precision'] = metrics.precision_score(y_test, y_test_pred)
    test_results['Recall'] = metrics.recall_score(y_test, y_test_pred)
    test_results['F1'] = metrics.f1_score(y_test, y_test_pred)

    # print('-----  TRAIN METRICS -----')
    # print(f'Accuracy: {metrics.accuracy_score(y_train, y_train_pred)}')
    # print(f'Precision: {metrics.precision_score(y_train, y_train_pred)}')
    # print(f'Recall: {metrics.recall_score(y_train, y_train_pred)}')
    # print(f'F1: {metrics.f1_score(y_train, y_train_pred)}')
    # print(f'Confusion Matrix:\n {metrics.confusion_matrix(y_train, y_train_pred)}')
    
    # print('-----  TEST METRICS -----')
    # print(f'Accuracy: {metrics.accuracy_score(y_test, y_test_pred)}')
    # print(f'Precision: {metrics.precision_score(y_test, y_test_pred)}')
    # print(f'Recall: {metrics.recall_score(y_test, y_test_pred)}')
    # print(f'F1: {metrics.f1_score(y_test, y_test_pred)}')
    # print(f'Confusion Matrix:\n {metrics.confusion_matrix(y_test, y_test_pred)}')

    return train_results, test_results

## Oversampling

In [5]:
print(len(y_train[y_train['fraudulent'] == 1]))
print(len(y_train[y_train['fraudulent'] == 0]))

223
11293


In [12]:
def SMOTE_ENN(X_train, y_train, n_neighbours, k_neighbours, sampling_strategy):
    # SMOTE ENN oversampling
    smote_only = SMOTE(random_state=42, sampling_strategy=sampling_strategy, k_neighbors = k_neighbours)
    ennObj = EditedNearestNeighbours(n_neighbors=n_neighbours)
    smote_enn = SMOTEENN(random_state=42, smote=smote_only , enn= ennObj)
    X_res_smoteENN, y_res_smoteENN = smote_enn.fit_resample(X_train, y_train)
    return X_res_smoteENN, y_res_smoteENN

In [13]:
def random_oversampler(X_train, y_train):
    oversample = RandomOverSampler(sampling_strategy='minority')
    X_over, y_over = oversample.fit_resample(X_train, y_train)

    return X_over, y_over

In [14]:
def adasyn(X_train, y_train, n_neighbors):
    ada = ADASYN(sampling_strategy = 'minority', n_neighbors = n_neighbors)
    X_resampled, y_resampled = ADASYN().fit_sample(X_train, y_train)
    return X_resampled, y_resampled

In [15]:
def log_reg(X_train,y_train, X_test, y_test):
    lr = LogisticRegression()
    lr_model = lr.fit(X_train, y_train)
    y_train_pred = lr_model.predict(X_train)
    y_test_pred = lr_model.predict(X_test)
    train_results, test_results = evaluate(y_train,y_test,y_train_pred,y_test_pred)
    return train_results, test_results
    

## Tune SMOTE ENN

In [16]:
n_neighbors = (1,10)
k_neighbors = [1,10]
sampling_strategy = [0.2,0.8]

#### Glove

In [10]:
nneighbors = []
kneighbors = []
samplingstrat = []
train_accuracy = []
train_precision = []
train_recall = []
train_f1 = []

test_accuracy = []
test_precision = []
test_recall = []
test_f1 = []

for i in list(range(n_neighbors[0], n_neighbors[1])):
    print(i)
    for k in list(range(k_neighbors[0], k_neighbors[1])):
        for j in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
            X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train, y_train, n_neighbours = i, k_neighbours = k, sampling_strategy = j)
            train_results, test_results = log_reg(X_train_SMOTE,y_train_SMOTE,X_test,y_test)
            nneighbors.append(i)
            kneighbors.append(k)
            samplingstrat.append(j)
            train_accuracy.append(train_results['Accuracy'])
            train_precision.append(train_results['Precision'])
            train_recall.append(train_results['Recall'])
            train_f1.append(train_results['F1'])

            test_accuracy.append(test_results['Accuracy'])
            test_precision.append(test_results['Precision'])
            test_recall.append(test_results['Recall'])
            test_f1.append(test_results['F1'])

1
2
3
4
5
6
7
8
9


In [11]:
smote_results = pd.DataFrame()
smote_results['nneighbors'] = nneighbors
smote_results['kneighbors'] = kneighbors
smote_results['samplingstrat'] = samplingstrat
smote_results['train_accuracy'] = train_accuracy
smote_results['train_precision'] = train_precision
smote_results['train_recall'] = train_recall
smote_results['train_f1'] = train_f1
smote_results['test_accuracy'] = test_accuracy
smote_results['test_precision'] = test_precision
smote_results['test_recall'] = test_recall
smote_results['test_f1'] = test_f1

In [14]:
smote_results[smote_results['test_f1'] == smote_results['test_f1'].max()]

Unnamed: 0,nneighbors,kneighbors,samplingstrat,train_accuracy,train_precision,train_recall,train_f1,test_accuracy,test_precision,test_recall,test_f1
162,3,1,0.1,0.938055,0.75102,0.488928,0.592275,0.972917,0.328125,0.375,0.35
180,3,3,0.1,0.938506,0.757453,0.495128,0.598822,0.972917,0.328125,0.375,0.35


#### Word2Vec

In [17]:
nneighbors = []
kneighbors = []
samplingstrat = []
train_accuracy = []
train_precision = []
train_recall = []
train_f1 = []

test_accuracy = []
test_precision = []
test_recall = []
test_f1 = []

for i in list(range(n_neighbors[0], n_neighbors[1])):
    print(i)
    for k in list(range(k_neighbors[0], k_neighbors[1])):
        for j in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
            X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train_wordvec, y_train_wordvec, n_neighbours = i, k_neighbours = k, sampling_strategy = j)
            train_results, test_results = log_reg(X_train_SMOTE,y_train_SMOTE,X_test_wordvec,y_test_wordvec)
            nneighbors.append(i)
            kneighbors.append(k)
            samplingstrat.append(j)
            train_accuracy.append(train_results['Accuracy'])
            train_precision.append(train_results['Precision'])
            train_recall.append(train_results['Recall'])
            train_f1.append(train_results['F1'])

            test_accuracy.append(test_results['Accuracy'])
            test_precision.append(test_results['Precision'])
            test_recall.append(test_results['Recall'])
            test_f1.append(test_results['F1'])

1
2
3
4
5
6
7
8
9


In [19]:
smote_results_wordvec = pd.DataFrame()
smote_results_wordvec['nneighbors'] = nneighbors
smote_results_wordvec['kneighbors'] = kneighbors
smote_results_wordvec['samplingstrat'] = samplingstrat
smote_results_wordvec['train_accuracy'] = train_accuracy
smote_results_wordvec['train_precision'] = train_precision
smote_results_wordvec['train_recall'] = train_recall
smote_results_wordvec['train_f1'] = train_f1
smote_results_wordvec['test_accuracy'] = test_accuracy
smote_results_wordvec['test_precision'] = test_precision
smote_results_wordvec['test_recall'] = test_recall
smote_results_wordvec['test_f1'] = test_f1

In [20]:
smote_results_wordvec[smote_results_wordvec['test_f1'] == smote_results_wordvec['test_f1'].max()]

Unnamed: 0,nneighbors,kneighbors,samplingstrat,train_accuracy,train_precision,train_recall,train_f1,test_accuracy,test_precision,test_recall,test_f1
288,4,6,0.1,0.96123,0.819149,0.750221,0.783172,0.976042,0.430108,0.714286,0.536913


## Random oversampler

#### Glove

In [16]:
X_train_oversampler, y_train_oversampler = random_oversampler(X_train, y_train)
train_results, test_results = log_reg(X_train_oversampler,y_train_oversampler,X_test,y_test)
print(train_results)
print(test_results)

{'Accuracy': 0.8747011423005402, 'Precision': 0.8689510855349202, 'Recall': 0.8824935800938635, 'F1': 0.8756699762762499}
{'Accuracy': 0.8517361111111111, 'Precision': 0.08685968819599109, 'Recall': 0.6964285714285714, 'F1': 0.15445544554455445}


#### Word2vec

In [34]:
X_train_oversampler, y_train_oversampler = random_oversampler(X_train_wordvec, y_train_wordvec)
train_results, test_results = log_reg(X_train_oversampler,y_train_oversampler,X_test_wordvec,y_test_wordvec)
print(train_results)
print(test_results)

{'Accuracy': 0.936730718143983, 'Precision': 0.9178952719877986, 'Recall': 0.9592668024439919, 'F1': 0.9381251353106733}
{'Accuracy': 0.9024305555555555, 'Precision': 0.14953271028037382, 'Recall': 0.8571428571428571, 'F1': 0.2546419098143236}


## Tune ADASYN

#### Glove

In [17]:
nneighbors = []
train_accuracy = []
train_precision = []
train_recall = []
train_f1 = []

test_accuracy = []
test_precision = []
test_recall = []
test_f1 = []

for i in list(range(n_neighbors[0], n_neighbors[1])):
    print(i)
    X_train_ADASYN, y_train_ADASYN = adasyn(X_train, y_train, n_neighbors = i)
    train_results, test_results = log_reg(X_train_ADASYN,y_train_ADASYN,X_test,y_test)
    nneighbors.append(i)
    kneighbors.append(k)
    samplingstrat.append(j)
    train_accuracy.append(train_results['Accuracy'])
    train_precision.append(train_results['Precision'])
    train_recall.append(train_results['Recall'])
    train_f1.append(train_results['F1'])

    test_accuracy.append(test_results['Accuracy'])
    test_precision.append(test_results['Precision'])
    test_recall.append(test_results['Recall'])
    test_f1.append(test_results['F1'])

1
2
3
4
5
6
7
8
9


In [18]:
adasyn_results = pd.DataFrame()
adasyn_results['nneighbors'] = nneighbors
adasyn_results['train_accuracy'] = train_accuracy
adasyn_results['train_precision'] = train_precision
adasyn_results['train_recall'] = train_recall
adasyn_results['train_f1'] = train_f1
adasyn_results['test_accuracy'] = test_accuracy
adasyn_results['test_precision'] = test_precision
adasyn_results['test_recall'] = test_recall
adasyn_results['test_f1'] = test_f1

In [21]:
adasyn_results[adasyn_results['test_f1'] == adasyn_results['test_f1'].max()]

Unnamed: 0,nneighbors,train_accuracy,train_precision,train_recall,train_f1,test_accuracy,test_precision,test_recall,test_f1
7,8,0.908749,0.875813,0.952672,0.912627,0.855903,0.091116,0.714286,0.161616


#### Word2vec

In [35]:
nneighbors = []
train_accuracy = []
train_precision = []
train_recall = []
train_f1 = []

test_accuracy = []
test_precision = []
test_recall = []
test_f1 = []

for i in list(range(n_neighbors[0], n_neighbors[1])):
    print(i)
    X_train_ADASYN, y_train_ADASYN = adasyn(X_train_wordvec, y_train_wordvec, n_neighbors = i)
    train_results, test_results = log_reg(X_train_ADASYN,y_train_ADASYN,X_test_wordvec,y_test_wordvec)
    nneighbors.append(i)
    kneighbors.append(k)
    samplingstrat.append(j)
    train_accuracy.append(train_results['Accuracy'])
    train_precision.append(train_results['Precision'])
    train_recall.append(train_results['Recall'])
    train_f1.append(train_results['F1'])

    test_accuracy.append(test_results['Accuracy'])
    test_precision.append(test_results['Precision'])
    test_recall.append(test_results['Recall'])
    test_f1.append(test_results['F1'])

1
2
3
4
5
6
7
8
9


In [36]:
adasyn_results_word2vec = pd.DataFrame()
adasyn_results_word2vec['nneighbors'] = nneighbors
adasyn_results_word2vec['train_accuracy'] = train_accuracy
adasyn_results_word2vec['train_precision'] = train_precision
adasyn_results_word2vec['train_recall'] = train_recall
adasyn_results_word2vec['train_f1'] = train_f1
adasyn_results_word2vec['test_accuracy'] = test_accuracy
adasyn_results_word2vec['test_precision'] = test_precision
adasyn_results_word2vec['test_recall'] = test_recall
adasyn_results_word2vec['test_f1'] = test_f1

In [37]:
adasyn_results_word2vec[adasyn_results_word2vec['test_f1'] == adasyn_results_word2vec['test_f1'].max()]

Unnamed: 0,nneighbors,train_accuracy,train_precision,train_recall,train_f1,test_accuracy,test_precision,test_recall,test_f1
8,9,0.950649,0.925427,0.98023,0.952041,0.909375,0.147766,0.767857,0.247839


## Dimensionality Reduction

In [39]:
from sklearn.decomposition import PCA
def pca(n_components,X_train,X_test):
    pca = PCA(n_components=n_components)

    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    return X_train, X_test

In [40]:
from sklearn.feature_selection import SelectKBest
def kbest(k,X_train,X_test,y_train,y_test):
    kbest = SelectKBest(k=k)
    X_train = kbest.fit_transform(X_train, y_train)
    X_test = kbest.transform(X_test)
    return X_train, X_test


#### Glove

In [52]:
## Best SMOTE 
results_pca = {}
for i in range(5,30):
    print(f'N COMPONENTS: {i}')
    X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train, y_train, n_neighbours = 3, k_neighbours = 1, sampling_strategy = 0.1)
    X_train_pca, X_test_pca = pca(i, X_train_SMOTE, X_test)
    results_pca[i] = log_reg(X_train_pca,y_train_SMOTE, X_test_pca, y_test)[1]['F1']

N COMPONENTS: 5
N COMPONENTS: 6
N COMPONENTS: 7
N COMPONENTS: 8
N COMPONENTS: 9
N COMPONENTS: 10
N COMPONENTS: 11
N COMPONENTS: 12
N COMPONENTS: 13
N COMPONENTS: 14
N COMPONENTS: 15
N COMPONENTS: 16
N COMPONENTS: 17
N COMPONENTS: 18
N COMPONENTS: 19
N COMPONENTS: 20
N COMPONENTS: 21
N COMPONENTS: 22
N COMPONENTS: 23
N COMPONENTS: 24
N COMPONENTS: 25
N COMPONENTS: 26
N COMPONENTS: 27
N COMPONENTS: 28
N COMPONENTS: 29


In [53]:
best_n = max(results_pca, key=results_pca.get)
results_pca[best_n]

0.1782178217821782

In [68]:
results_kbest = {}
for i in range(5,30):
    print(f'N COMPONENTS: {i}')
    X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train, y_train, n_neighbours = 3, k_neighbours = 1, sampling_strategy = 0.1)
    X_train_kbest, X_test_kbest = kbest(i, X_train_SMOTE, X_test,y_train_SMOTE,y_test)
    results_kbest[i] = log_reg(X_train_kbest,y_train_SMOTE, X_test_kbest, y_test)[1]['F1']

N COMPONENTS: 5
N COMPONENTS: 6
N COMPONENTS: 7
N COMPONENTS: 8
N COMPONENTS: 9
N COMPONENTS: 10
N COMPONENTS: 11
N COMPONENTS: 12
N COMPONENTS: 13
N COMPONENTS: 14
N COMPONENTS: 15
N COMPONENTS: 16
N COMPONENTS: 17
N COMPONENTS: 18
N COMPONENTS: 19
N COMPONENTS: 20
N COMPONENTS: 21
N COMPONENTS: 22
N COMPONENTS: 23
N COMPONENTS: 24
N COMPONENTS: 25
N COMPONENTS: 26
N COMPONENTS: 27
N COMPONENTS: 28
N COMPONENTS: 29


In [69]:
best_k = max(results_kbest, key=results_kbest.get)
results_kbest[best_k]

0.24742268041237114

In [70]:
best_k

29

#### Word2Vec

In [64]:
results_pca = {}
for i in range(5,30):
    print(f'N COMPONENTS: {i}')
    X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train_wordvec, y_train_wordvec, n_neighbours = 2, k_neighbours = 5, sampling_strategy = 0.1)
    X_train_pca, X_test_pca = pca(i, X_train_SMOTE, X_test_wordvec)
    results_pca[i] = log_reg(X_train_pca,y_train_SMOTE, X_test_pca, y_test_wordvec)[1]['F1']

N COMPONENTS: 5
N COMPONENTS: 6
N COMPONENTS: 7
N COMPONENTS: 8
N COMPONENTS: 9
N COMPONENTS: 10
N COMPONENTS: 11
N COMPONENTS: 12
N COMPONENTS: 13
N COMPONENTS: 14
N COMPONENTS: 15
N COMPONENTS: 16
N COMPONENTS: 17
N COMPONENTS: 18
N COMPONENTS: 19
N COMPONENTS: 20
N COMPONENTS: 21
N COMPONENTS: 22
N COMPONENTS: 23
N COMPONENTS: 24
N COMPONENTS: 25
N COMPONENTS: 26
N COMPONENTS: 27
N COMPONENTS: 28
N COMPONENTS: 29


In [65]:
best_n = max(results_pca, key=results_pca.get)
results_pca[best_n]

0.22857142857142854

In [66]:
results_kbest = {}
for i in range(5,30):
    print(f'N COMPONENTS: {i}')
    X_train_SMOTE, y_train_SMOTE = SMOTE_ENN(X_train_wordvec, y_train_wordvec, n_neighbours = 3, k_neighbours = 1, sampling_strategy = 0.1)
    X_train_kbest, X_test_kbest = kbest(i, X_train_SMOTE, X_test_wordvec,y_train_SMOTE,y_test_wordvec)
    results_kbest[i] = log_reg(X_train_kbest,y_train_SMOTE, X_test_kbest, y_test_wordvec)[1]['F1']

N COMPONENTS: 5
N COMPONENTS: 6
N COMPONENTS: 7
N COMPONENTS: 8
N COMPONENTS: 9
N COMPONENTS: 10
N COMPONENTS: 11
N COMPONENTS: 12
N COMPONENTS: 13
N COMPONENTS: 14
N COMPONENTS: 15
N COMPONENTS: 16
N COMPONENTS: 17
N COMPONENTS: 18
N COMPONENTS: 19
N COMPONENTS: 20
N COMPONENTS: 21
N COMPONENTS: 22
N COMPONENTS: 23
N COMPONENTS: 24
N COMPONENTS: 25
N COMPONENTS: 26
N COMPONENTS: 27
N COMPONENTS: 28
N COMPONENTS: 29


In [67]:
best_k = max(results_kbest, key=results_kbest.get)
results_kbest[best_k]

0.24175824175824176