In [2]:
import os
import pandas as pd

# Directory holding the CSV files to merge. The path is relative, so it is
# resolved against the notebook's working directory; replace it with your own.
directory_path = 'Hybrid_GCN_LSTM'

# Function to merge files with a specific prefix
def merge_csv_files(directory, prefix):
    """Read every file in ``directory`` whose name starts with ``prefix`` and stack the rows.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).
    prefix : str
        Case-sensitive file-name prefix used to select files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file in ``directory`` starts with ``prefix``.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the merged
    # row order identical from run to run.
    candidate_paths = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.startswith(prefix)
    ]
    # A sub-directory that happens to share the prefix cannot be read as CSV.
    file_paths = [path for path in candidate_paths if os.path.isfile(path)]

    if not file_paths:
        # pd.concat([]) would also raise ValueError, but without saying what was missing.
        raise ValueError(f"No files starting with {prefix!r} found in {directory!r}")

    dataframes = [pd.read_csv(path) for path in file_paths]
    return pd.concat(dataframes, ignore_index=True)

# Merge the files whose names start with 'Metrics' and with 'Fuzzy_Results'
metrics_df = merge_csv_files(directory_path, 'Metrics')
fuzzy_result_df = merge_csv_files(directory_path, 'Fuzzy_Results')

# Preview the first rows of each merged frame as a quick sanity check
print("Metrics DataFrame:", metrics_df.head(), sep="\n")
print("\nFuzzy Result DataFrame:", fuzzy_result_df.head(), sep="\n")


Metrics DataFrame:
   Precision   Recall       F1  AUC  KS_Statistic  KS_PValue
0    0.38386  0.38386  0.38386  0.5           NaN        NaN
1    0.58682  0.58682  0.58682  0.5           NaN        NaN
2    0.34556  0.34556  0.34556  0.5           NaN        NaN
3    0.34322  0.34322  0.34322  0.5           NaN        NaN
4    0.47920  0.47920  0.47920  0.5           NaN        NaN

Fuzzy Result DataFrame:
             Time_step  Label Transaction_Id           Sender_Account  \
0  2024-01-21 06:24:00      0     T-90966-08                      NaN   
1  2024-01-21 06:24:00      0    T-930664-06                      NaN   
2  2024-01-21 06:24:00      1    T-197980-00  DIGITAL-MONEY-197559-00   
3  2024-01-21 06:24:00      1    T-197980-00  DIGITAL-MONEY-197559-00   
4  2024-01-21 06:28:00      1    T-295862-00  DIGITAL-MONEY-295444-00   

  Sender_Institution Sender_Country  USD_Amount        Bene_Account  \
0                NaN            NaN     9500.58   CHECKING-90551-08   
1        

In [14]:
import os
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve,confusion_matrix
from scipy.stats import ks_2samp


def calculate_statistics(df):
    """Print and return the mean of every numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise. Non-numeric columns are ignored.

    Returns
    -------
    pandas.Series
        Column means indexed by column name.
    """
    # numeric_only=True: without it, recent pandas raises on text columns
    # instead of skipping them.
    averages = df.mean(numeric_only=True)
    print("Averages of each column:")
    print(averages)
    return averages
    
def evaluate_model(y_true, y_scores, threshold=0.5):
    """Print a confusion matrix, F1, precision, recall, AUC and KS statistic.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_scores : array-like
        Scores for the positive class. A score counts as a positive prediction
        only when it is strictly greater than ``threshold``.
    threshold : float, default 0.5
        Cut-off applied to ``y_scores`` for the thresholded metrics.

    Returns
    -------
    None
        Results are printed only.
    """
    import numpy as np  # local import: this cell does not import numpy at module level

    # Coerce to arrays so plain Python lists work as well as pandas Series.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores, dtype=float)

    # Binary predictions from the scores
    y_pred = (y_scores > threshold).astype(int)
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    # zero_division=0 keeps the value at 0.0 when nothing is predicted positive
    # but does not emit an undefined-metric warning for it.
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # AUC is threshold-free, so it is computed from the raw scores
    auc = roc_auc_score(y_true, y_scores)

    # KS statistic taken as the largest gap between TPR and FPR along the ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    ks_statistic = (tpr - fpr).max()

    # Print results
    print("F1 Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("AUC:", auc)
    print("KS Statistic:", ks_statistic)



# Column-wise averages of the merged metrics table
calculate_statistics(metrics_df)

# Evaluate the merged fuzzy results: 'Label' is the ground truth and
# 'Predictions' is passed as the score column
evaluate_model(y_true=fuzzy_result_df['Label'], y_scores=fuzzy_result_df['Predictions'])


Averages of each column:
Precision       0.477106
Recall          0.477106
F1              0.477106
AUC             0.500000
KS_Statistic         NaN
KS_PValue            NaN
dtype: float64
Confusion Matrix:
[[193913      0]
 [179766      0]]


  _warn_prf(average, modifier, msg_start, len(result))


F1 Score: 0.0
Precision: 0.0
Recall: 0.0
AUC: 0.5692677568014398
KS Statistic: 0.14003083533604044


In [7]:
import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Read the input table from disk
train_df = pd.read_csv('Thesis/train_with_fuzzy_results2.csv')

class EdgeGCN_LSTM(nn.Module):
    """Edge scorer: one GCN layer over the nodes, then an LSTM over per-edge features.

    For every edge the sender embedding, the receiver embedding and the edge
    attributes are concatenated; the resulting rows are fed to the LSTM as a
    single sequence and projected by a linear layer.
    """

    def __init__(self, hidden_channels, lstm_hidden_channels, out_channels, dropout_rate):
        super().__init__()
        # Node encoder: a single graph convolution over 1-dimensional node inputs
        self.conv1 = GCNConv(1, hidden_channels)
        # LSTM input per edge = sender embedding + receiver embedding + 3 edge attributes
        edge_feature_dim = hidden_channels * 2 + 3
        self.lstm = nn.LSTM(
            input_size=edge_feature_dim,
            hidden_size=lstm_hidden_channels,
            batch_first=True,
        )
        self.lin = nn.Linear(lstm_hidden_channels, out_channels)
        self.dropout_rate = dropout_rate

    def forward(self, x, edge_index, edge_attr):
        """Return one flattened output value per edge (for ``out_channels=1``)."""
        # GCN -> ReLU -> dropout (dropout is active only in training mode)
        node_emb = F.relu(self.conv1(x, edge_index))
        node_emb = F.dropout(node_emb, p=self.dropout_rate, training=self.training)

        # Gather the embeddings of both endpoints of every edge
        src, dst = edge_index[0], edge_index[1]
        per_edge = torch.cat([node_emb[src], node_emb[dst], edge_attr], dim=1)

        # batch_first=True: add a batch axis of size 1 so all edges form one sequence
        seq_out, _ = self.lstm(per_edge.unsqueeze(0))

        # Drop the batch axis again, project, and flatten
        projected = self.lin(seq_out.squeeze(0))
        return projected.view(-1)


class GraphDataProcessor:
    """Turn a transaction table into a single torch_geometric ``Data`` graph.

    Every row becomes one edge from ``Sender_Customer_Id`` to
    ``Bene_Customer_Id``; the nodes are the unique IDs found in those two columns.
    """

    def __init__(self, df):
        # Stored as given; undersample_df() rebinds self.df to a new frame.
        self.df = df

    def undersample_df(self):
        """Down-sample the non-fraud rows to the fraud count and shuffle the result.

        Replaces ``self.df``. Rows whose ``Label`` is neither 0 nor 1 are dropped.
        """
        fraud_df = self.df[self.df['Label'] == 1]
        non_fraud_df = self.df[self.df['Label'] == 0]
        print(f"Initial fraud cases: {len(fraud_df)}, non-fraud cases: {len(non_fraud_df)}")

        # Only the non-fraud class is ever reduced. If it is already the smaller
        # (or equal) class it is kept whole, so the result may stay imbalanced.
        if len(fraud_df) < len(non_fraud_df):
            balanced_df = non_fraud_df.sample(len(fraud_df), random_state=42)
        else:
            balanced_df = non_fraud_df

        # Seeded shuffle: without a seed the relative order of rows changes from
        # run to run, which changes the edge order that the model sees.
        self.df = pd.concat([fraud_df, balanced_df]).sample(frac=1, random_state=42)
        print(f"Balanced dataset: {len(self.df)} records")

    def prepare_graph_data(self):
        """Undersample, sort, and build the graph.

        Returns
        -------
        torch_geometric.data.Data
            ``x``: all-zero node features, one column per node.
            ``edge_index``: 2 x E tensor of sender/receiver node indices.
            ``edge_attr``: E x 3 tensor (encoded transaction type, standardised
            USD amount, risk score).
            ``y``: E integer edge labels.

        Raises
        ------
        ValueError
            If the table contains no customer IDs.
        """
        self.undersample_df()
        self.df['Time_step'] = pd.to_datetime(self.df['Time_step'])
        # Order edges by sender, then chronologically
        self.df = self.df.sort_values(by=['Sender_Customer_Id', 'Time_step'])
        self.df['Label'] = pd.to_numeric(self.df['Label'], errors='coerce').fillna(0).astype(int)

        all_ids = pd.concat([self.df['Sender_Customer_Id'], self.df['Bene_Customer_Id']]).unique()
        if len(all_ids) == 0:
            raise ValueError("No unique IDs found in the dataset")

        # Map every customer ID to a contiguous node index
        id_map = {customer_id: idx for idx, customer_id in enumerate(all_ids)}
        sender_idx = self.df['Sender_Customer_Id'].map(id_map).values
        receiver_idx = self.df['Bene_Customer_Id'].map(id_map).values
        # Stack into one ndarray first: building a tensor from a list of arrays
        # is slow and triggers a warning in recent PyTorch.
        edge_index = torch.tensor(np.stack([sender_idx, receiver_idx]), dtype=torch.long)

        # Nodes carry no information of their own; a constant zero feature is used
        node_features = torch.zeros((len(all_ids), 1))

        # NOTE(review): the encoder and scaler are fitted on this instance's rows
        # only, so two processors (e.g. train and validation) can end up with
        # different encodings/scales. Confirm whether that is intended.
        transaction_type_encoded = torch.tensor(
            LabelEncoder().fit_transform(self.df['Transaction_Type']), dtype=torch.float
        ).view(-1, 1)
        usd_amount = torch.tensor(
            StandardScaler().fit_transform(self.df[['USD_Amount']]), dtype=torch.float
        ).view(-1, 1)
        risk_score = torch.tensor(self.df['risk_score'].values, dtype=torch.float).view(-1, 1)

        edge_attr = torch.cat([transaction_type_encoded, usd_amount, risk_score], dim=1)
        edge_labels = torch.tensor(self.df['Label'].values, dtype=torch.long)

        return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)


# Stratified 75/25 split on 'Label'. Note that train_df is rebound to the
# training partition from here on.
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42, stratify=train_df['Label'])

# One processor per partition
train_processor, val_processor = GraphDataProcessor(train_df), GraphDataProcessor(val_df)

# Each partition is converted into a single graph
train_data = train_processor.prepare_graph_data()
val_data = val_processor.prepare_graph_data()

# Each loader wraps a one-element list, so it yields exactly one batch
train_loader = DataLoader([train_data], shuffle=True, batch_size=32)
val_loader = DataLoader([val_data], shuffle=False, batch_size=32)

def train(model, device, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The model is switched to training mode. For each batch the gradients are
    reset, the loss between the model output and ``batch.y`` (cast to float)
    is back-propagated, and the optimizer takes one step.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        predictions = model(batch.x, batch.edge_index, batch.edge_attr)
        targets = batch.y.float()
        batch_loss = criterion(predictions, targets)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    # Mean over the number of batches the loader reports
    return sum(batch_losses) / len(loader)

def evaluate(model, device, loader, criterion, threshold=0.4):
    """Score ``model`` on ``loader`` without updating it.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(x, edge_index, edge_attr)``; its output is treated as logits.
    device : torch.device
        Device each batch is moved to.
    loader : iterable
        Yields batches exposing ``x``, ``edge_index``, ``edge_attr`` and ``y``.
    criterion : callable
        Loss applied to ``(logits, y.float())``.
    threshold : float, default 0.4
        A sigmoid probability strictly above this value counts as a positive prediction.

    Returns
    -------
    tuple
        ``(mean_loss, f1, precision, recall, auc, ks_statistic)``.
    """
    model.eval()
    y_true, y_pred, y_scores = [], [], []
    total_loss = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            output = model(data.x, data.edge_index, data.edge_attr)
            loss = criterion(output, data.y.float())
            total_loss += loss.item()

            # Logits -> probabilities -> thresholded class predictions
            probs = torch.sigmoid(output).cpu().numpy()
            preds = (probs > threshold).astype(int)

            y_scores.extend(probs)
            y_pred.extend(preds)
            y_true.extend(data.y.cpu().numpy())

    # zero_division=0 keeps the value at 0.0 when no positives are predicted
    # but does not emit an undefined-metric warning for it.
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_scores)
    # KS statistic taken as the largest gap between TPR and FPR along the ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    ks_statistic = (tpr - fpr).max()

    return total_loss / len(loader), f1, precision, recall, auc, ks_statistic
def objective(trial):
    """Optuna objective: train one configuration and return its best validation F1.

    Samples a learning rate, the GCN and LSTM hidden sizes and a dropout rate,
    trains for 10 epochs, and evaluates on the validation loader after every epoch.

    Side effects: updates the module-level ``best_f1`` and
    ``best_model_checkpoint`` whenever an epoch beats the best F1 seen so far
    across all trials.

    Returns
    -------
    float
        The highest validation F1 reached by this trial. This is the same
        criterion that decides what gets checkpointed, so the study's best
        trial and the saved checkpoint refer to the same model.
    """
    global best_f1, best_model_checkpoint

    # Suggest hyperparameters (suggest_float replaces the deprecated
    # suggest_loguniform / suggest_uniform)
    lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64])
    lstm_hidden_channels = trial.suggest_categorical('lstm_hidden_channels', [16, 32, 64])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.7)

    # Model initialization
    model = EdgeGCN_LSTM(
        hidden_channels=hidden_channels,
        lstm_hidden_channels=lstm_hidden_channels,
        out_channels=1,
        dropout_rate=dropout_rate,
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    trial_best_f1 = 0.0

    # Training and validation
    for epoch in range(10):
        train(model, device, train_loader, optimizer, criterion)
        _, f1, precision, recall, auc, ks_statistic = evaluate(model, device, val_loader, criterion)
        trial_best_f1 = max(trial_best_f1, float(f1))

        # Check if the current model is the best one so far; snapshot it if true
        if f1 > best_f1:
            best_f1 = f1
            best_model_checkpoint = {
                # state_dict() hands back the live parameter tensors, which later
                # epochs overwrite in place. Clone them so the snapshot keeps the
                # weights of this epoch.
                'state_dict': {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                },
                'hyperparameters': {
                    'lr': lr,
                    'hidden_channels': hidden_channels,
                    'lstm_hidden_channels': lstm_hidden_channels,
                    'dropout_rate': dropout_rate
                },
                # Plain Python floats keep the saved file free of NumPy scalar objects
                'metrics': {
                    'f1': float(f1),
                    'precision': float(precision),
                    'recall': float(recall),
                    'auc': float(auc),
                    'ks_statistic': float(ks_statistic)
                }
            }

    return trial_best_f1

# Trackers that objective() updates through `global`
best_f1 = 0
best_model_checkpoint = None

# Create a study object and run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Save the best model at the end of all trials to the desired local path
best_model_path = "Thesis/gsm_lstm_model.pth"
if best_model_checkpoint is not None:
    torch.save(best_model_checkpoint, best_model_path)
    print(f"Best model saved at: {best_model_path}")

# Output the results of the best trial
print("Best trial:")
trial = study.best_trial
print(f" Value (F1 Score): {trial.value}")
print(" Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

if best_model_checkpoint is not None:
    # Reload only what this run just wrote. Loading unconditionally would fail
    # when nothing was saved, or report metrics from a stale file of an earlier run.
    checkpoint = torch.load(best_model_path, map_location=device)

    # Access and print the validation metrics stored in the checkpoint
    metrics = checkpoint['metrics']
    print("Validation set metrics:")
    print(f"    F1 Score: {metrics['f1']}")
    print(f"    Precision: {metrics['precision']}")
    print(f"    Recall: {metrics['recall']}")
    print(f"    AUC: {metrics['auc']}")
    print(f"    KS Statistic: {metrics['ks_statistic']}")
else:
    print("No checkpoint was saved: no epoch reached a validation F1 above 0.")


Initial fraud cases: 46507, non-fraud cases: 274326
Balanced dataset: 93014 records
Initial fraud cases: 15503, non-fraud cases: 91442
Balanced dataset: 31006 records


[32m[I 2024-08-22 09:58:36,584][0m A new study created in memory with name: no-name-5a495783-7782-4228-bb80-babef13a25b6[0m
[33m[W 2024-08-22 09:58:36,589][0m Trial 0 failed because of the following error: AttributeError("module 'torch.nn.parameter' has no attribute 'UninitializedParameter'",)[0m
Traceback (most recent call last):
  File "/home/echristi/.local/lib/python3.6/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-7-5cfbcb8d1b3c>", line 154, in objective
    model = EdgeGCN_LSTM(hidden_channels=hidden_channels, lstm_hidden_channels=lstm_hidden_channels, out_channels=1, dropout_rate=dropout_rate).to(device)
  File "<ipython-input-7-5cfbcb8d1b3c>", line 23, in __init__
    self.conv1 = GCNConv(1, hidden_channels)
  File "/home/echristi/.local/lib/python3.6/site-packages/torch_geometric/nn/conv/gcn_conv.py", line 140, in __init__
    weight_initializer='glorot')
  File "/home/echristi/.local/lib/python

AttributeError: module 'torch.nn.parameter' has no attribute 'UninitializedParameter'

In [5]:
# Show column dtypes and non-null counts of train_df.
# NOTE: the modelling cell rebinds train_df to the training partition of its
# train/validation split, so what is shown here depends on which cells ran first.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320833 entries, 269273 to 82844
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Time_step                     320833 non-null  object 
 1   Label                         320833 non-null  int64  
 2   Transaction_Id                320833 non-null  object 
 3   Sender_Account                257106 non-null  object 
 4   Sender_Institution            257106 non-null  object 
 5   Sender_Country                257106 non-null  object 
 6   USD_Amount                    320833 non-null  float64
 7   Bene_Account                  320833 non-null  object 
 8   Bene_Institution              320833 non-null  object 
 9   Bene_Country                  320833 non-null  object 
 10  Transaction_Type              320833 non-null  object 
 11  Sender_Is_Pep                 320833 non-null  int64  
 12  Sender_Customer_Id            320833 non