In [31]:
import os
from pathlib import Path
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split

torch.manual_seed(42)
np.random.seed(42)

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [32]:
dataset_path = Path('../data_cache/cleanedFraudDataset.csv')

In [19]:
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,is_fraud,merchant_encoding,age,user_merchant_distance_km,day_of_week,hour,month,is_weekend,is_night,category_encoded,...,zip_encoded,job_freq,city_freq,gender_encoded,amt_log,city_pop_log,amt_bin_encoded,city_pop_bin_encoded,amt_outlier,city_pop_outlier
0,0,514,31,78.6,6,0,1,1,1,0.014526,...,0.0,0.002762,0.001579,0,1.786747,8.159375,0,0,0,0
1,0,241,41,30.2,6,0,1,1,1,0.013973,...,0.0,0.00393,0.002779,0,4.684259,5.010635,2,0,0,0
2,0,390,57,108.2,6,0,1,1,1,0.002435,...,0.0,0.00038,0.00038,1,5.39866,8.332068,2,0,1,0
3,0,360,52,95.7,6,0,1,1,1,0.004679,...,0.037975,0.00194,0.000377,1,3.828641,7.570443,1,0,0,0
4,0,297,33,77.6,6,0,1,1,1,0.003008,...,0.0,0.001571,0.001571,1,3.760269,4.60517,1,0,0,0


In [9]:
print(len(df.columns))
print(len(df))

21
1048575


## Splitting the Dataset

In [33]:
# Hyperparameters
batch_size = 32
epochs = 20

In [None]:
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

def create_data_loaders():
    train_X, temp_X, train_y, temp_y = train_test_split(X, y, 
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y)

    test_X, val_X, test_y, val_y = train_test_split(
        temp_X, temp_y,
        train_size=0.5,
        random_state=42,
        stratify=temp_y)

    print(f"train_X 70%: {len(train_X)}, val_X 15%: {len(val_X)}, test_X 15%: {len(test_X)}")
    
    # Convert to tensors - Fixed syntax
    # Binary Cross Entropy loss expects input shape of [batch_size, 1]
    X_train = torch.FloatTensor(train_X.values)
    y_train = torch.FloatTensor(train_y.values).unsqueeze(1)  # For BCELoss
    X_val = torch.FloatTensor(val_X.values)
    y_val = torch.FloatTensor(val_y.values).unsqueeze(1)
    X_test = torch.FloatTensor(test_X.values)
    y_test = torch.FloatTensor(test_y.values).unsqueeze(1)

    # Create datasets
    train_dataset = TensorDataset(X_train, y_train)
    validation_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, valid_loader, test_loader

train_loader, valid_loader, test_loader = create_data_loaders()


train_X 70%: 734002, val_X 15%: 157287, test_X 15%: 157286


## Modle building

In [None]:
class LogisticRegression(nn.Module):
  def __init__(self, input_dim):
    super().__init__()
    self.linear1 = nn.Linear(input_dim, 64)
    self.drop1 = nn.Dropout(0.3)
    self.linear2 = nn.Linear(64, 32)
    self.drop2 = nn.Dropout(0.3)
    self.linear3 = nn.Linear(32, 1)

  def forward(self, x):
    x = self.linear1(x)
    x = F.relu(x)
    x = self.drop1(x)

    x = self.linear2(x)
    x = F.relu(x)
    x = self.drop2(x)

    x = self.linear3(x)
    x = F.sigmoid(x)
    return x

In [None]:
logist_model = LogisticRegression(input_dim=20).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(logist_model.parameters(), lr=1e-2)

In [47]:
steps_per_epoch = len(train_loader)

for epoch in range(epochs):
    total_loss =  0.0
    for (inputs, label) in train_loader:
        inputs.to(device)
        label.to(device)
        
        outputs = logist_model(inputs)
        loss = criterion(outputs, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"[{epoch+1}]: loss: {(total_loss / steps_per_epoch):.3f}")
    
print('Training Finished')

[1]: loss: 0.573
[2]: loss: 0.573
[3]: loss: 0.573
[4]: loss: 0.573
[5]: loss: 0.573
[6]: loss: 0.573
[7]: loss: 0.573
[8]: loss: 0.573
[9]: loss: 0.573
[10]: loss: 0.573
[11]: loss: 0.573
[12]: loss: 0.573
[13]: loss: 0.573
[14]: loss: 0.573
[15]: loss: 0.573
[16]: loss: 0.573
[17]: loss: 0.573
[18]: loss: 0.573
[19]: loss: 0.573
[20]: loss: 0.573
Training Finished


In [53]:
# Evaluation
logist_model.eval()

correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for (batch_features, batch_labels) in test_loader:
        batch_features.to(device)
        batch_labels.to(device)
        
        outputs = logist_model(batch_features)
        
        predicted = (outputs > 0.5).float()
        
        total_predictions += batch_labels.size(0)
        correct_predictions += (predicted == batch_labels).sum().item()
        
    
print(f'Accuracy on test set {correct_predictions / total_predictions:.3f}')

Accuracy on test set 0.994
