In [1]:
import pandas as pd
import numpy as np
import sklearn
from datasets import Fraud_Dataset
from data_pipeline import ETL_Pipeline


In [2]:
!pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121


Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
[0m

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

print(f'PyTorch version= {torch.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')  # Need CUDA and GPU present

PyTorch version= 2.3.0.dev20240219+cu121
CUDA available= True


In [4]:
# CUDA Installation
print('CUDA Version')
!nvcc --version
print()

# CUDNN Installation
print(f'CUDNN Version: {torch.backends.cudnn.version()}')
print(f'Number of CUDA Devices: {torch.cuda.device_count()}')
print(f'Active CUDA Device: {torch.cuda.current_device()}')
print(f'Available devices: {torch.cuda.device_count()}, Name: {torch.cuda.get_device_name(0)}')
print(f'Current CUDA device: {torch.cuda.current_device()}')

CUDA Version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0

CUDNN Version: 8902
Number of CUDA Devices: 1
Active CUDA Device: 0
Available devices: 1, Name: NVIDIA GeForce RTX 2070 with Max-Q Design
Current CUDA device: 0


In [5]:
df = pd.read_csv('transactions.csv')

In [6]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
sex                       object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [7]:
etl = ETL_Pipeline()


In [8]:
etl.extract('transactions.csv')

In [9]:
etl.transform()

In [10]:
etl.load("transformed_transactions.csv")

In [11]:
len(df)

1852394

In [12]:
df_used = pd.read_csv('transformed_transactions.csv')

In [13]:
from sklearn.model_selection import StratifiedKFold
# Label vector and feature matrix taken from the transformed frame.
y = df_used['is_fraud']
X = df_used.drop(columns='is_fraud')

# 10-fold stratified splitter; shuffling is seeded so the folds are repeatable.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Print the row positions that fall on the train / test side of each fold.
for fold_train_rows, fold_test_rows in kf.split(X, y):
    print(fold_train_rows, fold_test_rows)

[      0       1       2 ... 1852390 1852392 1852393] [     29      51      53 ... 1852386 1852389 1852391]
[      0       1       4 ... 1852390 1852391 1852392] [      2       3      12 ... 1852380 1852388 1852393]
[      0       1       2 ... 1852391 1852392 1852393] [      8      19      49 ... 1852376 1852383 1852384]
[      0       1       2 ... 1852391 1852392 1852393] [      4      20      35 ... 1852377 1852381 1852382]
[      0       1       2 ... 1852391 1852392 1852393] [     10      41      44 ... 1852371 1852372 1852378]
[      0       1       2 ... 1852391 1852392 1852393] [      5      18      25 ... 1852368 1852375 1852390]
[      0       2       3 ... 1852391 1852392 1852393] [      1       9      11 ... 1852349 1852350 1852379]
[      1       2       3 ... 1852391 1852392 1852393] [      0       6      31 ... 1852361 1852370 1852387]
[      0       1       2 ... 1852390 1852391 1852393] [     28      33      42 ... 1852345 1852359 1852392]
[      0       1       2 ...

In [14]:
len(df_used)

1852394

In [15]:
df_used.dtypes

merchant        float64
category        float64
amt             float64
first           float64
last            float64
sex             float64
lat             float64
long            float64
city_pop        float64
job             float64
merch_lat       float64
merch_long      float64
is_fraud        float64
day_of_week     float64
day_of_month    float64
time            float64
generation      float64
dtype: object

In [16]:
df_used

Unnamed: 0,merchant,category,amt,first,last,sex,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,day_of_week,day_of_month,time,generation
0,0.742775,0.615385,0.000137,0.463277,0.037113,0.0,0.343968,0.864638,0.001194,0.750000,0.350307,0.848603,0.0,0.833333,0.0,0.000000,0.8
1,0.348266,0.307692,0.003670,0.881356,0.331959,0.0,0.618450,0.485682,0.000043,0.868952,0.621490,0.486209,0.0,0.833333,0.0,0.000000,0.6
2,0.563584,0.000000,0.007569,0.327684,0.795876,1.0,0.474727,0.546553,0.001421,0.620968,0.497563,0.546698,0.0,0.833333,0.0,0.000000,0.4
3,0.520231,0.153846,0.001520,0.466102,0.964948,1.0,0.561509,0.548070,0.000659,0.665323,0.577666,0.542621,0.0,0.833333,0.0,0.000695,0.6
4,0.429191,0.692308,0.001415,0.957627,0.315464,1.0,0.394153,0.882190,0.000026,0.233871,0.405248,0.882857,0.0,0.833333,0.0,0.002085,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,0.732659,0.384615,0.001477,0.692090,0.665979,1.0,0.438562,0.755010,0.000171,0.961694,0.431481,0.755493,0.0,0.666667,1.0,1.000000,0.6
1852390,0.381503,0.538462,0.003829,0.508475,0.903093,1.0,0.193121,0.718694,0.009879,0.417339,0.219328,0.706824,0.0,0.666667,1.0,1.000000,1.0
1852391,0.716763,0.538462,0.002967,0.053672,0.501031,0.0,0.560781,0.478609,0.001260,0.618952,0.569911,0.470881,0.0,0.666667,1.0,1.000000,0.6
1852392,0.108382,1.000000,0.000241,0.338983,0.723711,1.0,0.527114,0.503704,0.000036,0.127016,0.524786,0.497296,0.0,0.666667,1.0,1.000000,0.4


In [17]:
X = df_used.drop('is_fraud', axis=1)
y = df_used['is_fraud']

fraud_dataset = Fraud_Dataset(X, y)
fraud_dataset.stratified_split()

In [18]:
len(X)

1852394

In [19]:
test_x, test_y = fraud_dataset.get_testing_dataset(1)

In [20]:
train_x, train_y = fraud_dataset.get_training_dataset(1)

In [21]:
validate_x, validate_y = fraud_dataset.get_validation_dataset(1)

In [22]:
len(train_x)

1481915

In [23]:
len(train_y)

1481915

In [24]:
len(validate_x)

185239

In [25]:
len(validate_y)

185239

In [26]:
len(test_x)

185240

In [27]:
len(test_y)

185240

In [28]:
len(fraud_dataset.split_data)

5

In [29]:
fraud_dataset.split_data[0]

(         merchant  category       amt     first      last  sex       lat  \
 246494   0.484104  1.000000  0.000223  0.711864  0.496907  0.0  0.526747   
 246500   0.627168  0.615385  0.028113  0.711864  0.496907  0.0  0.526747   
 246536   0.674855  0.923077  0.033672  0.403955  0.486598  0.0  0.389211   
 246552   0.289017  0.846154  0.029213  0.403955  0.486598  0.0  0.389211   
 246579   0.297688  0.615385  0.028303  0.403955  0.486598  0.0  0.389211   
 ...           ...       ...       ...       ...       ...  ...       ...   
 1852389  0.732659  0.384615  0.001477  0.692090  0.665979  1.0  0.438562   
 1852390  0.381503  0.538462  0.003829  0.508475  0.903093  1.0  0.193121   
 1852391  0.716763  0.538462  0.002967  0.053672  0.501031  0.0  0.560781   
 1852392  0.108382  1.000000  0.000241  0.338983  0.723711  1.0  0.527114   
 1852393  0.180636  0.000000  0.001283  0.827684  0.305155  1.0  0.335133   
 
              long  city_pop       job  merch_lat  merch_long  day_of_week

In [28]:
class CustomMLP_GPGPU(nn.Module):
    """Feed-forward classifier (two ReLU hidden layers) trained with minibatch SGD.

    The three linear layers are created inside ``fit`` because their sizes are
    derived from the training data (feature count and number of distinct labels).
    All weights and inputs are handled as float64.

    Args:
        n_hidden: Width of both hidden layers.
        epochs: Number of full passes over the training data.
        eta: SGD learning rate.
        minibatch_size: Number of rows per gradient step.
        device: Device to compute on (anything ``torch.device`` accepts). When
            omitted, ``cuda:0`` is used if CUDA is available, otherwise the CPU,
            so the class no longer depends on a notebook-level ``gpu`` variable.
    """
    def __init__(self, n_hidden=30, epochs=100, eta=0.05, minibatch_size=50, device=None):
        super(CustomMLP_GPGPU, self).__init__()
        self.n_hidden = n_hidden  # size of the hidden layers
        self.epochs = epochs  # number of passes over the training data
        self.eta = eta  # learning rate
        self.minibatch_size = minibatch_size  # rows per gradient step
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        # Placeholders until fit() knows the input / output sizes.
        self.fc1, self.fc2, self.fc3 = None, None, None

    def _prepare(self, X):
        """Flatten every sample to 1-D, cast to float64 and move to the compute device.

        Used by both ``fit`` and ``predict`` so that inference sees inputs in the
        same dtype/layout the layers were trained with.
        """
        X = torch.flatten(X, start_dim=1).to(torch.double)
        return X.to(self.device, non_blocking=True)

    def _forward(self, X, apply_softmax=False):
        """Run the network on ``X``; returns logits, or probabilities if ``apply_softmax``."""
        assert self.fc1 is not None, "fit() must be called before the network can be evaluated"

        X = nn.functional.relu(self.fc1(X.to(self.device)))
        X = nn.functional.relu(self.fc2(X))
        X = self.fc3(X)
        if apply_softmax:
            X = nn.functional.softmax(X, dim=1)
        return X

    def _reset(self):
        """Re-initialise the parameters of every linear layer currently registered."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                m.reset_parameters()

    def predict(self, X):
        """Return the index of the highest-scoring class for each row of ``X``.

        The result is a 1-D integer tensor on the CPU.
        """
        assert self.fc1 is not None, "fit() must be called before predict()"

        # Inference only: skip autograd bookkeeping so no graph is kept for the whole input.
        with torch.no_grad():
            net_out = self._forward(self._prepare(X), apply_softmax=True)
        _, indices = net_out.max(dim=1)

        # Carry the result back to CPU
        return indices.to('cpu')

    def fit(self, X_train, y_train):
        """Build fresh layers sized for the data and train them with SGD (momentum 0.9).

        Layers are re-created on every call, so nothing carries over from a
        previous ``fit`` (e.g. between cross-validation folds).

        Args:
            X_train: Tensor of samples; everything after the first dimension is flattened.
            y_train: Tensor of class labels, cast to int64. The output layer gets one
                unit per distinct label — assumes labels are 0..k-1, TODO confirm.
        """
        X_train = self._prepare(X_train)
        y_train = y_train.to(self.device, non_blocking=True).long()

        n_output = torch.unique(y_train).shape[0]
        n_features = X_train.shape[1]

        # Newly constructed layers start from fresh random weights.
        self.fc1 = nn.Linear(n_features, self.n_hidden, dtype=torch.double).to(self.device)
        self.fc2 = nn.Linear(self.n_hidden, self.n_hidden, dtype=torch.double).to(self.device)
        self.fc3 = nn.Linear(self.n_hidden, n_output, dtype=torch.double).to(self.device)

        optimizer = optim.SGD(self.parameters(), lr=self.eta, momentum=0.9)
        loss_func = nn.CrossEntropyLoss()

        n_samples = X_train.shape[0]
        for epoch in range(self.epochs):
            indices = torch.randperm(n_samples)  # new random order every epoch
            X_train, y_train = X_train[indices], y_train[indices]

            # Only full minibatches are used; a trailing partial batch is skipped for this epoch.
            for start_idx in range(0, n_samples - self.minibatch_size + 1, self.minibatch_size):
                end_idx = start_idx + self.minibatch_size
                X_batch, y_batch = X_train[start_idx:end_idx], y_train[start_idx:end_idx]

                optimizer.zero_grad()  # Reset the gradients
                y_pred = self._forward(X_batch)  # Forward pass (raw logits)
                loss = loss_func(y_pred, y_batch)  # Compute the loss
                loss.backward()  # Backward pass
                optimizer.step()  # Update the parameters

In [29]:
def kfold_eval_docs(_clf, _Xdocs, _ydocs):
    """Score ``_clf`` by accuracy over a seeded, shuffled 10-fold stratified split.

    ``_Xdocs`` and ``_ydocs`` must support indexing with an array of row positions.
    The classifier is re-fitted on every fold; the return value is a NumPy array
    holding one accuracy per fold.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import StratifiedKFold

    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    fold_scores = []
    for fit_rows, held_out_rows in splitter.split(_Xdocs, _ydocs):
        _clf.fit(_Xdocs[fit_rows], _ydocs[fit_rows])
        predicted = _clf.predict(_Xdocs[held_out_rows])
        fold_scores.append(accuracy_score(_ydocs[held_out_rows], predicted))
    return np.array(fold_scores)

In [30]:
x_train = torch.tensor(train_x.values)
y_train = torch.tensor(train_y.values)

In [31]:
print(len(x_train), len(y_train))

1481915 1481915


In [32]:
gpu = torch.device('cuda:0')

In [26]:
%time

acc = kfold_eval_docs(CustomMLP_GPGPU(50, 500, 0.05, 300).to(gpu),
                      torch.tensor(x_train), torch.tensor(y_train))

print(f"PyTorch Feedforward GPGPU NN 10-fold CV accuracy= {np.mean(acc):.2f} {chr(177)}{np.std(acc):.3f}")

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs


  torch.tensor(x_train), torch.tensor(y_train))

KeyboardInterrupt



In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def kfold_eval_docs(_clf, _Xdocs, _ydocs,
                    metrics=('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'sensitivity', 'specificity'),
                    n_splits=10, random_state=0):
    """Cross-validate ``_clf`` on a stratified k-fold split and summarise each metric.

    The classifier is re-fitted on every fold. Scores are collected per fold and
    reduced at the end, so the result reflects all folds rather than the last one.

    Args:
        _clf: Object exposing ``fit(X, y)`` and ``predict(X)``.
        _Xdocs: Samples; must support indexing with an array of row positions.
        _ydocs: Labels, indexable the same way. ``sensitivity`` / ``specificity``
            treat label 1 as the positive class and label 0 as the negative class.
        metrics: Names of the metrics to compute; unknown names are ignored.
        n_splits: Number of folds.
        random_state: Seed for the shuffled split.

    Returns:
        dict mapping each computed metric name to a ``(mean, std)`` tuple over the folds.
    """
    # Built here so the function does not depend on a `kf` object left in the
    # kernel by an earlier cell.
    from sklearn.model_selection import StratifiedKFold

    def _rate(hits, misses):
        # Guard the empty-class case instead of dividing by zero.
        total = hits + misses
        return float(hits / total) if total else float('nan')

    def _sensitivity(y_true, y_pred):
        # labels=[0, 1] keeps the matrix 2x2 even if a class is absent from a fold.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return _rate(cm[1, 1], cm[1, 0])

    def _specificity(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return _rate(cm[0, 0], cm[0, 1])

    # Lambdas defer the name lookup until a metric is actually requested.
    scorers = {
        'accuracy': lambda t, p: accuracy_score(t, p),
        'precision': lambda t, p: precision_score(t, p, average='weighted'),
        'recall': lambda t, p: recall_score(t, p, average='weighted'),
        'f1': lambda t, p: f1_score(t, p, average='weighted'),
        # NOTE(review): scored from `predict` output; if that is hard class labels
        # rather than probabilities the AUC is a single-threshold value — confirm intended.
        'roc_auc': lambda t, p: roc_auc_score(t, p, average='weighted', multi_class='ovr'),
        'sensitivity': _sensitivity,
        'specificity': _specificity,
    }
    selected = [name for name in scorers if name in metrics]
    per_fold = {name: [] for name in selected}

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_index, test_index in splitter.split(_Xdocs, _ydocs):
        _clf.fit(_Xdocs[train_index], _ydocs[train_index])
        y_true = _ydocs[test_index]
        y_pred = _clf.predict(_Xdocs[test_index])

        fold_scores = {name: scorers[name](y_true, y_pred) for name in selected}
        for name, value in fold_scores.items():
            per_fold[name].append(value)
        print(fold_scores)  # per-fold progress output

    return {name: (float(np.mean(values)), float(np.std(values)))
            for name, values in per_fold.items()}

In [None]:
%time

results = kfold_eval_docs(CustomMLP_GPGPU(50, 100, 0.05, 50).to(gpu),
                          torch.tensor(x_train), torch.tensor(y_train))

for metric, (mean, std) in results.items():
    print(f"{metric}={mean:.2f} {chr(177)}{std:.3f}")

In [1]:
from metrics import Metrics

In [2]:
y_pred = [1, 0, 1, 1, 1, 0]
y = [ 1, 1, 1, 1, 1, 0]
met = Metrics()
for i in range(5):
    met.run(y, y_pred, i)

In [1]:
from model import Fraud_Detector_Model

In [2]:
fd = Fraud_Detector_Model("transactions.csv")
fd.train(True)


KeyboardInterrupt



In [6]:
!pip install flask

Defaulting to user installation because normal site-packages is not writeable
Collecting flask
  Using cached flask-3.0.2-py3-none-any.whl.metadata (3.6 kB)
Collecting Werkzeug>=3.0.0 (from flask)
  Using cached werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.1.2-py3-none-any.whl.metadata (2.9 kB)
Collecting click>=8.1.3 (from flask)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting blinker>=1.6.2 (from flask)
  Using cached blinker-1.7.0-py3-none-any.whl.metadata (1.9 kB)
Using cached flask-3.0.2-py3-none-any.whl (101 kB)
Using cached blinker-1.7.0-py3-none-any.whl (13 kB)
Downloading click-8.1.7-py3-none-any.whl (97 kB)
   ---------------------------------------- 0.0/97.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/97.9 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/97.9 kB ? eta -:--:--
   ---------------- ----------------------- 41.0/97.9 kB 393.8



In [77]:
!python fraud_service.py

Training the model, this may take 15-30 minutes depending on data size, the server will start after
starting server...
 * Serving Flask app 'fraud_service'
 * Debug mode: off
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8786
 * Running on http://172.17.0.2:8786
[33mPress CTRL+C to quit[0m
172.17.0.1 - - [25/Feb/2024 17:35:19] "POST /predict HTTP/1.1" 200 -
172.17.0.1 - - [25/Feb/2024 19:11:01] "GET /crossvalidate HTTP/1.1" 200 -
172.17.0.1 - - [25/Feb/2024 19:11:14] "POST /predict HTTP/1.1" 200 -
^C
