In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('fraudTest.csv', index_col=0)
print(data.columns)
data.head()

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [3]:
data.drop(columns=["cc_num", "first", "last", "street",
    "trans_num", "trans_date_trans_time",
    "job", "merchant", "dob", "city", "zip"],
          inplace=True)
data.head()

Unnamed: 0,category,amt,gender,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,personal_care,2.86,M,SC,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,personal_care,29.84,F,UT,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,health_fitness,41.28,F,NY,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,misc_pos,60.05,M,FL,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,travel,3.19,M,MI,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

gender_le = LabelEncoder()
ohe = OneHotEncoder(drop='first', sparse_output=False)

data['gender'] = gender_le.fit_transform(data['gender'])

ohe_cols = ['category', 'state']

ohe_encoded = ohe.fit_transform(data[ohe_cols])

ohe_df = pd.DataFrame(ohe_encoded,
                      columns=ohe.get_feature_names_out(ohe_cols),
                      index = data.index)
df = pd.concat([data.drop(columns=ohe_cols) , ohe_df],
               axis = 1)

In [5]:
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [6]:
from sklearn.model_selection import train_test_split

scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=100)

num_cols = ['amt','lat','long','city_pop','unix_time','merch_lat','merch_long']

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.fit_transform(X_test[num_cols])

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [8]:
device = torch.device('cpu')

X_train = torch.tensor(X_train.values, dtype = torch.float32)
X_test = torch.tensor(X_test.values, dtype = torch.float32)
y_train = torch.tensor(y_train.values, dtype = torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test.values, dtype = torch.float32).unsqueeze(1)

In [9]:
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)
batch_size = 2048

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [10]:
h1, h2 = 128, 64


class FraudNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, h1),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.3),
            nn.Linear(h1, h2),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.3),
            nn.Linear(h2, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)
    
input_dim = X_train.shape[1]
model = FraudNet(input_dim=input_dim)

In [11]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)



In [12]:
num_epochs = 1
debug_batches = 2  # or 2, 3, whatever you want

for epoch in range(num_epochs):
    model.train()
    train_loss_sum = 0.0
    
    for batch_idx, (xb, yb) in enumerate(train_loader):
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        train_loss_sum += loss.item() * xb.size(0)

        print(loss.item(), xb.size(0), train_loss_sum / len(train_loader.dataset))
        
        if batch_idx + 1 >= debug_batches:  # +1 because enumerate starts at 0
            break
        ## rest of the code

## This variant allows yuo to debug the code in a more precise way


0.7553783655166626 2048 0.0034797613284105605
0.7349560260772705 2048 0.006865444152245122


In [13]:
epochs = 10
train_losses = []
val_losses = []

for epoch in range(epochs):
    model.train()
    train_loss_sum = 0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        epoch_loss = loss.item()*xb.size(0)
    
    epoch_train_loss = epoch_loss / len(train_loader)
    train_losses.append(epoch_train_loss)
    
    val_loss_sum = 0
    with torch.no_grad():
        for xb_val, yb_val in val_loader:
            pred_val = model(xb_val)
            loss_val = criterion(pred_val, yb_val)
            val_loss_sum += loss_val.item()*xb.size(0)

    epoch_val_loss = val_loss_sum / len(val_loader.dataset)
    val_losses.append(epoch_val_loss)

    print(f"Epoch {epoch+1} / {epochs} | Train: {epoch_train_loss: .6f} | Val: {epoch_val_loss: .6f}")

Epoch 1 / 10 | Train:  0.001979 | Val:  0.001431
Epoch 2 / 10 | Train:  0.043769 | Val:  0.001373
Epoch 3 / 10 | Train:  0.009249 | Val:  0.001298
Epoch 4 / 10 | Train:  0.004783 | Val:  0.001236
Epoch 5 / 10 | Train:  0.005396 | Val:  0.001206
Epoch 6 / 10 | Train:  0.013396 | Val:  0.001185
Epoch 7 / 10 | Train:  0.038166 | Val:  0.001167
Epoch 8 / 10 | Train:  0.001294 | Val:  0.001108
Epoch 9 / 10 | Train:  0.001776 | Val:  0.001063
Epoch 10 / 10 | Train:  0.010510 | Val:  0.001053
