In [1]:
import pandas as pd
import numpy as np
import sklearn
from datasets import Fraud_Dataset
from data_pipeline import ETL_Pipeline


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

# Report the installed PyTorch build and whether this build can see a CUDA device.
print(f'PyTorch version= {torch.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')  # Need CUDA and GPU present

PyTorch version= 2.1.1
CUDA available= True


In [3]:
# CUDA Installation
print('CUDA Version')
!nvcc --version
print()

# CUDNN Installation
print(f'CUDNN Version: {torch.backends.cudnn.version()}')
print(f'Number of CUDA Devices: {torch.cuda.device_count()}')
print(f'Active CUDA Device: {torch.cuda.current_device()}')
print(f'Available devices: {torch.cuda.device_count()}, Name: {torch.cuda.get_device_name(0)}')
print(f'Current CUDA device: {torch.cuda.current_device()}')

CUDA Version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Nov__3_17:51:05_Pacific_Daylight_Time_2023
Cuda compilation tools, release 12.3, V12.3.103
Build cuda_12.3.r12.3/compiler.33492891_0

CUDNN Version: 8801
Number of CUDA Devices: 1
Active CUDA Device: 0
Available devices: 1, Name: NVIDIA GeForce RTX 2070 with Max-Q Design
Current CUDA device: 0


In [4]:
# Load the raw transactions CSV (path is relative to the notebook's working directory).
df = pd.read_csv('transactions.csv')

In [5]:
# Inspect the column dtypes of the raw frame before any transformation.
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
sex                       object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [6]:
# Instantiate the project's ETL helper (imported from data_pipeline at the top of the notebook).
etl = ETL_Pipeline()


In [7]:
# Point the pipeline at the same raw CSV that was loaded into `df` above.
# NOTE(review): presumably this reads the file a second time — confirm in data_pipeline.
etl.extract('transactions.csv')

In [8]:
# Run the pipeline's transform step.
# NOTE(review): the file read back below has only float64 columns, so this step
# presumably encodes and scales the raw columns — confirm in data_pipeline.
etl.transform()

In [9]:
# Final pipeline step; the same path is read back with pd.read_csv a few cells below.
# NOTE(review): presumably writes the transformed frame to this CSV — confirm in data_pipeline.
etl.load("transformed_transactions.csv")

In [10]:
# Row count of the raw frame, for comparison with the transformed frame's row count below.
len(df)

1852394

In [11]:
# Read the transformed CSV back in; every later cell works from `df_used`.
df_used = pd.read_csv('transformed_transactions.csv')

In [12]:
# Row count of the transformed frame (compare with len(df) above).
len(df_used)

1852394

In [38]:
# Inspect the column dtypes of the transformed frame.
df_used.dtypes

merchant        float64
category        float64
amt             float64
first           float64
last            float64
sex             float64
lat             float64
long            float64
city_pop        float64
job             float64
merch_lat       float64
merch_long      float64
is_fraud        float64
day_of_week     float64
day_of_month    float64
time            float64
generation      float64
dtype: object

In [14]:
# Display the transformed frame (pandas truncates the rendering to the first and last rows).
df_used

Unnamed: 0,merchant,category,amt,first,last,sex,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,day_of_week,day_of_month,time,generation
0,0.742775,0.615385,0.000137,0.463277,0.037113,0.0,0.343968,0.864638,0.001194,0.750000,0.350307,0.848603,0.0,0.833333,0.0,0.000000,0.8
1,0.348266,0.307692,0.003670,0.881356,0.331959,0.0,0.618450,0.485682,0.000043,0.868952,0.621490,0.486209,0.0,0.833333,0.0,0.000000,0.6
2,0.563584,0.000000,0.007569,0.327684,0.795876,1.0,0.474727,0.546553,0.001421,0.620968,0.497563,0.546698,0.0,0.833333,0.0,0.000000,0.4
3,0.520231,0.153846,0.001520,0.466102,0.964948,1.0,0.561509,0.548070,0.000659,0.665323,0.577666,0.542621,0.0,0.833333,0.0,0.000695,0.6
4,0.429191,0.692308,0.001415,0.957627,0.315464,1.0,0.394153,0.882190,0.000026,0.233871,0.405248,0.882857,0.0,0.833333,0.0,0.002085,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,0.732659,0.384615,0.001477,0.692090,0.665979,1.0,0.438562,0.755010,0.000171,0.961694,0.431481,0.755493,0.0,0.666667,1.0,1.000000,0.6
1852390,0.381503,0.538462,0.003829,0.508475,0.903093,1.0,0.193121,0.718694,0.009879,0.417339,0.219328,0.706824,0.0,0.666667,1.0,1.000000,1.0
1852391,0.716763,0.538462,0.002967,0.053672,0.501031,0.0,0.560781,0.478609,0.001260,0.618952,0.569911,0.470881,0.0,0.666667,1.0,1.000000,0.6
1852392,0.108382,1.000000,0.000241,0.338983,0.723711,1.0,0.527114,0.503704,0.000036,0.127016,0.524786,0.497296,0.0,0.666667,1.0,1.000000,0.4


In [15]:
# Target column first, then every remaining column as the feature matrix.
y = df_used['is_fraud']
X = df_used.drop(columns='is_fraud')

# Hand features and labels to the project dataset wrapper and run its stratified split.
fraud_dataset = Fraud_Dataset(X, y)
fraud_dataset.stratified_split()

In [16]:
# Fetch the held-out split produced by stratified_split() above.
test_x, test_y = fraud_dataset.get_testing_dataset()

In [39]:
# Fetch the training split produced by stratified_split() above.
train_x, train_y = fraud_dataset.get_training_dataset()

In [17]:
# Number of rows in the held-out feature set.
len(test_x)

185239

In [18]:
# Number of held-out labels; should equal len(test_x).
len(test_y)

185239

In [51]:
class CustomMLP_GPGPU(nn.Module):
    """Feed-forward classifier with two ReLU hidden layers, trained by mini-batch SGD.

    The layers are created inside `fit` (their sizes depend on the training
    data), so calling `.to(...)` on a freshly constructed instance moves
    nothing; the `device` argument decides where data and layers live.

    Args:
        n_hidden: Width of each of the two hidden layers.
        epochs: Number of passes over the training data.
        eta: SGD learning rate (momentum is fixed at 0.9).
        minibatch_size: Number of samples per gradient step.
        device: Target device (string or `torch.device`). When omitted, CUDA is
            used if PyTorch reports it as available, otherwise the CPU.
    """
    def __init__(self, n_hidden=30, epochs=100, eta=0.05, minibatch_size=50, device=None):
        super(CustomMLP_GPGPU, self).__init__()
        self.n_hidden = n_hidden  # size of the hidden layer
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.minibatch_size = minibatch_size  # size of training batch
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Stored on the instance so no method depends on a notebook-level global.
        self.device = torch.device(device)
        self.fc1, self.fc2, self.fc3 = None, None, None
        self.classes_ = None  # sorted unique training labels, set by fit()

    def _as_features(self, X):
        """Return X as a (n_samples, n_features) double tensor on `self.device`."""
        X = torch.flatten(torch.as_tensor(X), start_dim=1)
        return X.to(device=self.device, dtype=torch.double, non_blocking=True)

    def _forward(self, X, apply_softmax=False):
        """Run the network on X; returns logits, or probabilities if `apply_softmax`."""
        assert self.fc1 is not None, "fit() must be called before the network is used"

        # No-op when X already lives on the target device.
        X = nn.functional.relu(self.fc1(X.to(self.device)))
        X = nn.functional.relu(self.fc2(X))
        X = self.fc3(X)
        if apply_softmax:
            X = nn.functional.softmax(X, dim=1)
        return X

    def _reset(self):
        """Re-initialise the parameters of every existing linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.reset_parameters()

    def predict(self, X):
        """Return the predicted label for each row of X as a CPU tensor.

        X gets the same flatten/double conversion as the training data, and
        the forward pass runs without gradient tracking.
        """
        assert self.fc1 is not None, "fit() must be called before predict()"

        with torch.no_grad():
            net_out = self._forward(self._as_features(X), apply_softmax=True)
            _, indices = net_out.max(dim=1)

        # Map output positions back to the label values seen during fit().
        return self.classes_[indices].to('cpu')

    def fit(self, X_train, y_train):
        """Build fresh layers sized to the data and train them.

        Args:
            X_train: Features; everything after the first dimension is flattened.
            y_train: One label per sample; cast to integers. Labels need not be
                0..K-1 — they are mapped to contiguous indices internally.

        Raises:
            ValueError: If there are no samples or X/y lengths differ.
        """
        X_train = self._as_features(X_train)
        y_train = torch.as_tensor(y_train).to(self.device, non_blocking=True).long()

        n_samples = X_train.shape[0]
        if n_samples == 0:
            raise ValueError("fit() requires at least one training sample")
        if y_train.shape[0] != n_samples:
            raise ValueError(
                f"X_train has {n_samples} samples but y_train has {y_train.shape[0]}")

        # CrossEntropyLoss needs targets in [0, n_output); return_inverse yields
        # exactly those indices while classes_ remembers the original labels.
        self.classes_, y_idx = torch.unique(y_train, return_inverse=True)
        n_output = self.classes_.shape[0]
        n_features = X_train.shape[1]

        # New layers are freshly initialised, so no explicit reset is needed.
        self.fc1 = nn.Linear(n_features, self.n_hidden, dtype=torch.double).to(self.device)
        self.fc2 = nn.Linear(self.n_hidden, self.n_hidden, dtype=torch.double).to(self.device)
        self.fc3 = nn.Linear(self.n_hidden, n_output, dtype=torch.double).to(self.device)

        optimizer = optim.SGD(self.parameters(), lr=self.eta, momentum=0.9)
        loss_func = nn.CrossEntropyLoss()

        for epoch in range(self.epochs):
            # Shuffle by permuting indices instead of copying the whole dataset.
            order = torch.randperm(n_samples, device=self.device)

            # The last batch may be short; it is still used, so datasets smaller
            # than one minibatch are trained on too.
            for start_idx in range(0, n_samples, self.minibatch_size):
                batch = order[start_idx:start_idx + self.minibatch_size]

                optimizer.zero_grad()  # Reset the gradients
                y_pred = self._forward(X_train[batch])  # Forward pass
                loss = loss_func(y_pred, y_idx[batch])  # Compute the loss
                loss.backward()  # Backward pass
                optimizer.step()  # Update the parameters

In [52]:
def kfold_eval_docs(_clf, _Xdocs, _ydocs):
    """Score `_clf` with 10-fold stratified cross-validation.

    `_Xdocs` and `_ydocs` must support indexing with the integer index arrays
    that the splitter yields. The classifier is refit on each training fold and
    scored on the matching held-out fold.

    Returns:
        A NumPy array holding one accuracy value per fold.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import StratifiedKFold

    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    fold_scores = []
    for fit_rows, held_out_rows in splitter.split(_Xdocs, _ydocs):
        _clf.fit(_Xdocs[fit_rows], _ydocs[fit_rows])
        predicted = _clf.predict(_Xdocs[held_out_rows])
        fold_scores.append(accuracy_score(_ydocs[held_out_rows], predicted))
    return np.array(fold_scores)

In [53]:
# Convert the training split to tensors via its `.values` arrays.
# NOTE(review): assumes train_x / train_y are pandas objects — confirm in datasets.Fraud_Dataset.
x_train = torch.tensor(train_x.values)
y_train = torch.tensor(train_y.values)

In [54]:
# Sanity check: features and labels should have the same number of rows.
print(len(x_train), len(y_train))

1481915 1481915


In [55]:
# Prefer the first CUDA device; fall back to the CPU so the notebook still runs
# on a machine where PyTorch cannot see a GPU.
gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [56]:
%time

acc = kfold_eval_docs(CustomMLP_GPGPU(50, 500, 0.05, 300).to(gpu),
                      torch.tensor(x_train), torch.tensor(y_train))

print(f"PyTorch Feedforward GPGPU NN 10-fold CV accuracy= {np.mean(acc):.2f} {chr(177)}{np.std(acc):.3f}")

CPU times: total: 0 ns
Wall time: 0 ns


  torch.tensor(x_train), torch.tensor(y_train))


KeyboardInterrupt: 

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def kfold_eval_docs(_clf, _Xdocs, _ydocs, metrics=('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'sensitivity', 'specificity')):
    """Evaluate `_clf` with 10-fold stratified cross-validation on several metrics.

    The classifier is refit on each training fold and scored on the matching
    held-out fold. Each fold's scores are printed as they are produced, then
    aggregated over all folds.

    Args:
        _clf: Object with `fit(X, y)` and `predict(X)`.
        _Xdocs: Features, indexable with the integer index arrays the splitter yields.
        _ydocs: Labels, indexable the same way.
        metrics: Names of the metrics to compute; unrecognised names are ignored.

    Returns:
        dict mapping each computed metric name to a `(mean, std)` tuple taken
        over the folds.
    """
    # Imported here so the function does not rely on an import made in some
    # other cell or function scope.
    from sklearn.model_selection import StratifiedKFold

    per_fold = {}  # metric name -> list with one score per fold
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    for train_index, test_index in kf.split(_Xdocs, _ydocs):
        _clf.fit(_Xdocs[train_index], _ydocs[train_index])
        y_true = _ydocs[test_index]
        y_pred = _clf.predict(_Xdocs[test_index])

        scores = {}
        if 'accuracy' in metrics:
            scores['accuracy'] = accuracy_score(y_true, y_pred)
        if 'precision' in metrics:
            scores['precision'] = precision_score(y_true, y_pred, average='weighted')
        if 'recall' in metrics:
            scores['recall'] = recall_score(y_true, y_pred, average='weighted')
        if 'f1' in metrics:
            scores['f1'] = f1_score(y_true, y_pred, average='weighted')
        if 'roc_auc' in metrics:
            # NOTE(review): y_pred comes from `_clf.predict`; if that returns hard
            # labels (as CustomMLP_GPGPU.predict does), this is an AUC over a single
            # operating point rather than over ranked scores.
            scores['roc_auc'] = roc_auc_score(y_true, y_pred, average='weighted', multi_class='ovr')
        if 'sensitivity' in metrics or 'specificity' in metrics:
            # One confusion matrix serves both rates (rows = true, cols = predicted).
            cm = confusion_matrix(y_true, y_pred)
            is_binary = cm.shape == (2, 2)
            if 'sensitivity' in metrics:
                positives = cm[1, 0] + cm[1, 1] if is_binary else 0
                scores['sensitivity'] = cm[1, 1] / positives if positives else float('nan')
            if 'specificity' in metrics:
                negatives = cm[0, 0] + cm[0, 1] if is_binary else 0
                scores['specificity'] = cm[0, 0] / negatives if negatives else float('nan')
        print(scores)

        # Keep every fold's score instead of overwriting the previous fold.
        for name, value in scores.items():
            per_fold.setdefault(name, []).append(value)

    return {name: (float(np.mean(values)), float(np.std(values)))
            for name, values in per_fold.items()}

In [None]:
%time

results = kfold_eval_docs(CustomMLP_GPGPU(50, 100, 0.05, 50).to(gpu),
                          torch.tensor(x_train), torch.tensor(y_train))

for metric, (mean, std) in results.items():
    print(f"{metric}={mean:.2f} {chr(177)}{std:.3f}")

CPU times: total: 0 ns
Wall time: 0 ns


  torch.tensor(x_train), torch.tensor(y_train))


{'accuracy': 0.9968284387821205, 'precision': 0.9964931440082938, 'recall': 0.9968284387821205, 'f1': 0.9963756963696783, 'roc_auc': 0.7361852885168947, 'sensitivity': 0.4727979274611399, 'specificity': 0.9995726495726496}
{'accuracy': 0.9966732347225221, 'precision': 0.9964238106681867, 'recall': 0.9966732347225221, 'f1': 0.99602738892976, 'roc_auc': 0.7000277835770065, 'sensitivity': 0.40025906735751293, 'specificity': 0.9997964997964998}
