In [1]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import roc_auc_score
import joblib
from dask.distributed import Client, progress
from dask_ml.model_selection import train_test_split
import dask.dataframe as dd
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

In [2]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:52331  Dashboard: http://127.0.0.1:52330/status,Cluster  Workers: 4  Cores: 8  Memory: 7.45 GiB


In [4]:
df = dd.read_csv('creditcard.csv', dtype={'Time': 'float64'})

## 1. use all of the variables except Class as your feature set. The Class variable will be your target variable.

In [5]:
# This is our feature set
X = df.drop("Class", axis=1)

# This is our target variable
Y = df["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Since our data can fit into memory
# we persist them to the RAM.
X_train.persist()
X_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=3
    int64
      ...
      ...
      ...
Name: Class, dtype: int64
Dask Name: split, 3 tasks

In [9]:
lr = LogisticRegression()

with joblib.parallel_backend('dask'):
    lr.fit(X_train.compute(), y_train.compute())
    
preds_train = lr.predict(X_train.values.compute())
preds_test = lr.predict(X_test.values.compute())

print(f'logistic regression train score:', roc_auc_score(preds_train, y_train.values.compute()))
print(f'logistic regression test score:', roc_auc_score(preds_test, y_test.values.compute()))

train score: 0.8505188745838435
test score: 0.8491808043342269


In [10]:
rfc = RandomForestClassifier()

with joblib.parallel_backend('dask'):
    rfc.fit(X_train.compute(), y_train.compute())
    
preds_train = rfc.predict(X_train.values.compute())
preds_test = rfc.predict(X_test.values.compute())

print("random forest training score: ", roc_auc_score(preds_train, y_train.values.compute()))
print("random forest test score: ", roc_auc_score(preds_test, y_test.values.compute()))

random forest training score:  1.0
random forest test score:  0.9873328612396417


In [11]:
gbc = GradientBoostingClassifier()

with joblib.parallel_backend('dask'):
    gbc.fit(X_train.compute(), y_train.compute())
    
preds_train = gbc.predict(X_train.values.compute())
preds_test = gbc.predict(X_test.values.compute())

print("gradient boosting tree training score: ", roc_auc_score(preds_train, y_train.values.compute()))
print("gradient boosting tree test score: ", roc_auc_score(preds_test, y_test.values.compute()))

gradient boosting tree training score:  0.9211210837960387
gradient boosting tree test score:  0.898435092063888


### 2. Compare the results of your models

Random forest performed the best with a 98.7% accuracy in the test score.

Logistic Regression performed the worst with an 84.5% accuracy

Gradient Boosting did a decent job with an 89.8% accuracy, however, it overfit worse.

However, notice that the class imbalance is huge between fraud and non-fraud transactions leading to a high accuracy score<br>
Using class imbalance techniques such as SMOTE could help with this issue.

In [13]:
client.close()