In [1]:
# Imports
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Model Selection / Cross Validation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score, cross_validate, RepeatedStratifiedKFold

# Pipelines
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Imbalanced Dataset
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.combine import SMOTEENN

# Metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, auc, roc_auc_score

# Models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Other Techniques
from sklearn.decomposition import PCA
from imblearn.pipeline import make_pipeline, Pipeline
from scipy.stats import uniform


In [2]:
# Load Base dataframe
df = pd.read_csv('data/Base.csv')

# Separate out the fraud labels
y = df['fraud_bool']

# Keep features minus fraud label
X = df.drop('fraud_bool', axis=1)

In [3]:
# Drop 'device_fraud_count' from X_data as it only has a single unique value in the dataset
X = X.drop(labels='device_fraud_count', axis=1)

In [4]:
# One Hot Vector Encoding for Category Features
cat_columns = X.select_dtypes('object').columns
print(cat_columns)
print(X.shape)

X = pd.get_dummies(X, columns=cat_columns)

Index(['payment_type', 'employment_status', 'housing_status', 'source',
       'device_os'],
      dtype='object')
(1000000, 30)


In [7]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
 # define pipeline
 model = DecisionTreeClassifier()
 over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
 under = RandomUnderSampler(sampling_strategy=0.5)
 steps = [('over', over), ('under', under), ('model', model)]
 pipeline = Pipeline(steps=steps)
 # evaluate pipeline
 cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
 scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
 score = np.mean(scores)
 print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.611




> k=2, Mean ROC AUC: 0.613




> k=3, Mean ROC AUC: 0.614




> k=4, Mean ROC AUC: 0.615




> k=5, Mean ROC AUC: 0.616




> k=6, Mean ROC AUC: 0.618




> k=7, Mean ROC AUC: 0.617


In [8]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = RandomForestClassifier()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.868




> k=2, Mean ROC AUC: 0.869




> k=3, Mean ROC AUC: 0.869




> k=4, Mean ROC AUC: 0.868




> k=5, Mean ROC AUC: 0.869




> k=6, Mean ROC AUC: 0.868




> k=7, Mean ROC AUC: 0.868


In [27]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = XGBClassifier()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=4)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

TypeError: RepeatedStratifiedKFold.__init__() got an unexpected keyword argument 'shuffled'

In [14]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = LGBMClassifier(verbose=-1)
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=1)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.891
> k=2, Mean ROC AUC: 0.892
> k=3, Mean ROC AUC: 0.892
> k=4, Mean ROC AUC: 0.892
> k=5, Mean ROC AUC: 0.892
> k=6, Mean ROC AUC: 0.892
> k=7, Mean ROC AUC: 0.892


In [15]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = LGBMClassifier(verbose=-1)
    under = NearMiss(sampling_strategy=0.1, n_neighbors=k)
    steps = [('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=1)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.797
> k=2, Mean ROC AUC: 0.726
> k=3, Mean ROC AUC: 0.693
> k=4, Mean ROC AUC: 0.678
> k=5, Mean ROC AUC: 0.666
> k=6, Mean ROC AUC: 0.659
> k=7, Mean ROC AUC: 0.654


In [16]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = XGBClassifier()
    under = NearMiss(sampling_strategy=0.1, n_neighbors=k)
    steps = [('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=1)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.793
> k=2, Mean ROC AUC: 0.722
> k=3, Mean ROC AUC: 0.691
> k=4, Mean ROC AUC: 0.675
> k=5, Mean ROC AUC: 0.664
> k=6, Mean ROC AUC: 0.655
> k=7, Mean ROC AUC: 0.651


In [19]:
# values to evaluate
k_values = [1, 2, 3, 4, 5, 6, 7]
confusion_matrices = []
for k in k_values:
    # define pipeline
    model = LGBMClassifier(verbose=-1)
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_validate(pipeline, X, y, scoring=['balanced_accuracy', 'recall'], cv=cv, n_jobs=1)
    confusion_matrices.append(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.651
> k=2, Mean ROC AUC: 0.651
> k=3, Mean ROC AUC: 0.651
> k=4, Mean ROC AUC: 0.651
> k=5, Mean ROC AUC: 0.651
> k=6, Mean ROC AUC: 0.651
> k=7, Mean ROC AUC: 0.651


In [24]:
def make_classification_report(y_true, y_pred):
    print(classification_report(y_true, y_pred))
    print(roc_auc_score(y_true, y_pred))
    return roc_auc_score

In [25]:
k_values = [1, 2, 3, 4, 5, 6, 7]
roc_auc_score_lst = []
for k in k_values:
    print('Starting ', k)
    # define pipeline
    model = LGBMClassifier(verbose=-1)
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_validate(pipeline, X, y, scoring=make_scorer(make_classification_report), cv=cv, n_jobs=1)
    roc_auc_score_lst.append(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

Starting  1
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     98898
           1       0.19      0.23      0.21      1102

    accuracy                           0.98    100000
   macro avg       0.59      0.61      0.60    100000
weighted avg       0.98      0.98      0.98    100000

0.6092097803456523


Traceback (most recent call last):
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
           ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TypeError: unsupported operand type(s) for *: 'int' and 'function'



              precision    recall  f1-score   support

           0       0.99      0.99      0.99     98897
           1       0.21      0.24      0.22      1103

    accuracy                           0.98    100000
   macro avg       0.60      0.62      0.61    100000
weighted avg       0.98      0.98      0.98    100000

0.6154536899205857


Traceback (most recent call last):
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
           ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TypeError: unsupported operand type(s) for *: 'int' and 'function'



              precision    recall  f1-score   support

           0       0.99      0.99      0.99     98897
           1       0.21      0.25      0.23      1103

    accuracy                           0.98    100000
   macro avg       0.60      0.62      0.61    100000
weighted avg       0.98      0.98      0.98    100000

0.6206709461388124


Traceback (most recent call last):
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anthony/.pyenv/versions/3.12.1/envs/orie5741/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
           ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TypeError: unsupported operand type(s) for *: 'int' and 'function'



KeyboardInterrupt: 