In [None]:
import glob
import os
import shutil  # for moving files

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

class EdgeGCN_LSTM(nn.Module):
    """Edge-level scorer: two GCN layers followed by an LSTM over the edge list.

    Node embeddings are produced by two GCNConv + BatchNorm + ReLU + dropout
    stages. For every edge, the embeddings of its two endpoints are concatenated
    with that edge's 3 attribute columns, the resulting rows are fed to the LSTM
    as a single sequence, and a two-layer MLP head maps each LSTM output to
    ``out_channels`` values.

    Args:
        hidden_channels: Width of both GCN layers.
        lstm_hidden_channels: Hidden size of the LSTM (and input width of the head).
        out_channels: Number of values produced per edge.
        dropout_rate: Dropout probability used after each GCN stage and between
            stacked LSTM layers.
        num_layers: Number of stacked LSTM layers.
        l2_lambda: Stored on the instance only; it is not used inside this class.
    """

    def __init__(self, hidden_channels, lstm_hidden_channels, out_channels, dropout_rate, num_layers, l2_lambda):
        super().__init__()

        # Attribute names double as state_dict keys, so they must stay stable
        # for previously saved checkpoints to load.
        self.conv1 = GCNConv(1, hidden_channels)
        self.bn1 = nn.BatchNorm1d(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = nn.BatchNorm1d(hidden_channels)

        # Sender embedding + receiver embedding + 3 edge attributes.
        edge_feature_dim = 2 * hidden_channels + 3
        self.lstm = nn.LSTM(
            edge_feature_dim,
            lstm_hidden_channels,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout_rate,
        )

        head_width = lstm_hidden_channels // 2
        self.lin1 = nn.Linear(lstm_hidden_channels, head_width)
        self.lin2 = nn.Linear(head_width, out_channels)

        self.dropout_rate = dropout_rate
        self.l2_lambda = l2_lambda

    def forward(self, x, edge_index, edge_attr):
        """Return a flat tensor of per-edge outputs.

        Args:
            x: Node feature matrix with one column per node.
            edge_index: 2-row index tensor; row 0 holds senders, row 1 receivers.
            edge_attr: Per-edge attribute matrix with 3 columns.

        Returns:
            1-D tensor obtained by flattening the head output for all edges.
        """
        node_emb = x
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            node_emb = F.relu(norm(conv(node_emb, edge_index)))
            node_emb = F.dropout(node_emb, p=self.dropout_rate, training=self.training)

        senders, receivers = edge_index[0], edge_index[1]
        per_edge = torch.cat([node_emb[senders], node_emb[receivers], edge_attr], dim=1)

        # The whole edge list is treated as one sequence (batch of size 1).
        seq_out, _ = self.lstm(per_edge.unsqueeze(0))
        head_hidden = F.relu(self.lin1(seq_out.squeeze(0)))
        return self.lin2(head_hidden).view(-1)

# Function to load the most recent CSV file from 'Result' directory
def load_most_recent_csv(directory):
    """Return the path of the most recently changed ``*.csv`` file in ``directory``.

    "Most recent" is decided by ``os.path.getctime``: creation time on Windows,
    last metadata change time on Unix.

    Args:
        directory: Folder to search (non-recursive).

    Returns:
        Path (as returned by ``glob``) of the CSV file with the largest ctime.

    Raises:
        ValueError: If the directory contains no ``*.csv`` files.
    """
    # Escape the directory part so characters such as '[' or '*' in its name
    # are matched literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(directory), '*.csv')
    csv_files = glob.glob(pattern)
    if not csv_files:
        raise ValueError(f"No CSV files found in directory '{directory}'")
    return max(csv_files, key=os.path.getctime)

# Load the model
# The constructor arguments define the layer shapes, so they have to agree with
# the checkpoint being loaded (load_state_dict is strict by default).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EdgeGCN_LSTM(hidden_channels=32, lstm_hidden_channels=64, out_channels=1, dropout_rate=0.3, num_layers=2, l2_lambda=0.001)
# map_location remaps tensors that were saved from a GPU, so the checkpoint
# also loads when this notebook falls back to the CPU.
model.load_state_dict(torch.load('gcn_lstm_model.pth', map_location=device))
model.to(device)
# Inference mode: disables dropout and makes BatchNorm use its running statistics.
model.eval()

# Process and predict function
def process_and_predict(filename, output_directory):
    """Score every row of a transaction CSV and archive the file.

    The CSV is read, its rows are turned into a graph (customers are nodes,
    each row is one sender -> beneficiary edge), the module-level ``model`` is
    run on that graph, and the sigmoid of its per-edge output is written to a
    new ``Predictions`` column. The file is overwritten in place and then moved
    into ``output_directory``.

    Note that the ``Transaction_Type``, ``USD_Amount`` and ``risk_score`` columns
    in the saved file hold the encoded / standardised values, not the raw ones.

    Args:
        filename: Path of the CSV to process. Must contain the columns
            ``Transaction_Type``, ``USD_Amount``, ``risk_score``,
            ``Sender_Customer_Id`` and ``Bene_Customer_Id``.
        output_directory: Folder the processed file is moved to; created if
            it does not exist.
    """
    # Load data
    df = pd.read_csv(filename)

    # NOTE(review): the encoder and scalers are fitted on this file alone, so the
    # encoding/scaling may differ from whatever was used at training time —
    # confirm, and persist/reuse the training-time transformers if it does.
    df['Transaction_Type'] = LabelEncoder().fit_transform(df['Transaction_Type'])
    # fit_transform returns an (n, 1) array; flatten it before assigning to a column.
    df['USD_Amount'] = StandardScaler().fit_transform(df[['USD_Amount']]).ravel()
    df['risk_score'] = StandardScaler().fit_transform(df[['risk_score']]).ravel()

    # Prepare the graph data: map every distinct customer id to a node index.
    all_ids = pd.concat([df['Sender_Customer_Id'], df['Bene_Customer_Id']]).unique()
    id_map = {customer_id: idx for idx, customer_id in enumerate(all_ids)}
    edge_index = torch.tensor(
        np.vstack([
            df['Sender_Customer_Id'].map(id_map).values,
            df['Bene_Customer_Id'].map(id_map).values
        ]), dtype=torch.long)
    # Nodes carry no information of their own: a single all-zero feature each.
    node_features = torch.zeros((len(all_ids), 1))
    edge_attr = torch.tensor(df[['Transaction_Type', 'USD_Amount', 'risk_score']].values, dtype=torch.float)

    # Predict. The file is one graph, so the model is called on it directly;
    # edges are in row order, hence output i belongs to CSV row i.
    with torch.no_grad():
        output = model(
            node_features.to(device),
            edge_index.to(device),
            edge_attr.to(device),
        )
        # Keep the score of EVERY edge (previously only the first one was kept,
        # which made the column assignment below fail for multi-row files).
        predictions = torch.sigmoid(output).cpu().numpy()

    # Save predictions back to CSV
    df['Predictions'] = predictions
    df.to_csv(filename, index=False)

    # Move the file to the output directory
    os.makedirs(output_directory, exist_ok=True)  # Ensure the directory exists
    shutil.move(filename, os.path.join(output_directory, os.path.basename(filename)))

# Main execution
# Picks the newest CSV in the input folder, scores it, and archives it.
directory = 'Result'  # folder scanned for incoming CSV files
output_directory = 'Result_Done'  # folder processed files are moved into
latest_file = load_most_recent_csv(directory)
process_and_predict(latest_file, output_directory)
# latest_file still holds the pre-move path; only its base name is reported.
print(f"File {os.path.basename(latest_file)} processed and moved to {output_directory}")


In [1]:
!jupyter nbconvert --to script 2.ML_GRU_LSTM.ipynb


[NbConvertApp] Converting notebook 2.ML_GRU_LSTM.ipynb to script
[NbConvertApp] Writing 5534 bytes to 2.ML_GRU_LSTM.py


In [5]:
import os

def count_files_in_directory(directory):
    """Count the regular files directly inside ``directory``.

    Sub-directories (and anything else ``os.path.isfile`` rejects) are not
    counted, and the search does not descend into them.

    Args:
        directory: Path of the folder to inspect.

    Returns:
        Number of entries in ``directory`` that are files.
    """
    return sum(
        1
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )

# Specify the directory you want to check
# Reports how many files sit directly in the folder (sub-folders are not counted).
directory_path = 'Result/'
number_of_files = count_files_in_directory(directory_path)
print(f"There are {number_of_files} files in the directory '{directory_path}'.")


There are 189 files in the directory 'Result/'.


In [4]:
import pandas as pd

# Function to calculate the number of batches
def calculate_batches(filename, batch_size=100):
    """Return how many batches of ``batch_size`` rows a CSV file splits into.

    The file is read in full with pandas; the last batch may be partial.

    Args:
        filename: Path of the CSV file.
        batch_size: Number of rows per batch (default 100).

    Returns:
        Number of batches, i.e. the row count divided by ``batch_size``,
        rounded up.
    """
    row_count = pd.read_csv(filename).shape[0]
    # Adding batch_size - 1 before the floor division rounds the quotient up.
    padded_rows = row_count + batch_size - 1
    return padded_rows // batch_size

# Example usage
# Uses the default batch size of 100 rows.
filename = 'Thesis/test.csv'
num_batches = calculate_batches(filename)
print(f"Number of batches: {num_batches}")


Number of batches: 3787
