## GNN for fraud detection:
Creating a multigraph for fraud detection using transaction data and applying a Graph Neural Network (GNN) on the edge list can be done in the following steps:

1. Prepare the transaction data: Collect and organize the transaction data into a format that can be used to create the edges of the multigraph. For example, each transaction could be represented as a tuple (node1, node2, attributes), where node1 and node2 represent the sender and receiver of the transaction, and attributes is a dictionary containing properties such as the amount, timestamp, and transaction type.

2. Create the multigraph: Use the transaction data to create a multigraph using the NetworkX library. The add_edge() method can be used to add edges to the multigraph, where each edge represents a transaction.

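3. Extract the edge list and its features: Call the multigraph's edges() method with data=True (and keys=True to tell parallel edges between the same pair of nodes apart) to obtain every edge together with its attribute dictionary. The edge list and these attributes are the input to the GNN.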

4. Apply a GNN on the edge list: Use a GNN library such as PyTorch Geometric, Deep Graph Library (DGL) or Spektral to apply a GNN on the edge list. The GNN will learn representations of the edges in the multigraph and use them to classify the edges as fraudulent or non-fraudulent.

5. Evaluation: To evaluate the performance of the GNN, you can split the data into train and test sets, and use the test set to evaluate the accuracy, precision, recall, and F1-score of the model.

In [1]:
import os
# Move the working directory one level up. The output of this cell shows the
# resulting directory; presumably this is the project root so that relative
# config/artifact paths resolve - confirm before running from another location.
# NOTE: re-running this cell moves up one more level each time.
os.chdir("../")
# IPython magic: display the current working directory.
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
# Entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class GraphConstructionConfig:
    """Immutable settings for the graph-construction stage."""

    # Directory that receives the stage's auxiliary outputs
    # (graph_structure.txt and node_mapped_data.csv).
    root_dir: Path
    # CSV file that GraphConstructor.construct_graph reads with pandas.
    transformed_data_path: Path
    # Destination passed to torch.save for the finished graph object.
    graph_data_path: Path

In [3]:
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories

In [4]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """
        Read the config, params and schema YAML files and create the
        artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])

    def get_graph_construction_config(self) -> GraphConstructionConfig:
        """
        Build the configuration for the graph-construction stage.

        Creates the stage's root directory as a side effect.

        Returns:
            GraphConstructionConfig whose fields are ``pathlib.Path`` objects,
            matching the type annotations on the dataclass (the YAML values
            themselves are passed through ``Path`` here).
        """
        # Trace output kept from the original notebook.
        print("get_graph_construction_config method called")
        config = self.config.graph_construction
        create_directories([config.root_dir])

        return GraphConstructionConfig(
            root_dir=Path(config.root_dir),
            transformed_data_path=Path(config.transformed_data_path),
            graph_data_path=Path(config.graph_data_path),
        )


In [5]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
from Credit_Card_Fraud_Detection import logger
import os
from sklearn.model_selection import train_test_split

In [6]:

class GraphConstructor:
    def __init__(self, config):
        """
        Store the stage configuration.

        Args:
            config: Object exposing ``transformed_data_path``,
                ``graph_data_path`` and ``root_dir``; these are the attributes
                read by ``construct_graph``.
        """
        self.config = config

    def create_node_ids(self, df):
        """
        Derive the integer node columns from the raw identifier columns.

        ``transaction_unique``, ``customer_id`` and ``merchant_id`` are cast to
        ``int`` and stored as ``transaction_node``, ``customer_node`` and
        ``merchant_node``; the three source columns are then dropped.

        Args:
            df: DataFrame containing the three identifier columns.

        Returns:
            The same DataFrame object; it is modified in place.
        """
        node_sources = {
            "transaction_node": "transaction_unique",
            "customer_node": "customer_id",
            "merchant_node": "merchant_id",
        }
        for node_column, source_column in node_sources.items():
            df[node_column] = df[source_column].astype(int)

        # The raw identifiers are redundant once the node columns exist.
        df.drop(columns=["customer_id", "merchant_id", "transaction_unique"], inplace=True)

        transaction_nodes = df["transaction_node"]
        logger.info(f"New max transaction_node in df: {transaction_nodes.max()}")
        logger.info(f"Total transaction nodes: {len(transaction_nodes.unique())}")
        logger.info("Node indices created successfully.")

        return df

    def create_edge_indices(self, df):
        """
        Build the two forward edge-index tensors, one edge per DataFrame row.

        Args:
            df: DataFrame with ``customer_node``, ``transaction_node`` and
                ``merchant_node`` columns.

        Returns:
            Tuple ``(customer_to_transaction, transaction_to_merchant)`` of
            ``torch.long`` tensors with two rows each: row 0 holds the source
            column's values and row 1 the target column's values.
        """
        def edge_index(source_column, target_column):
            # Rows of the frame are edges; transpose to (2, number_of_rows).
            endpoints = df[[source_column, target_column]].values
            return torch.tensor(endpoints.T, dtype=torch.long)

        customer_transaction = edge_index("customer_node", "transaction_node")
        transaction_merchant = edge_index("transaction_node", "merchant_node")

        logger.info(f"Customer-to-Transaction edges shape: {customer_transaction.shape}")
        logger.info(f"Transaction-to-Merchant edges shape: {transaction_merchant.shape}")

        return customer_transaction, transaction_merchant

    def create_node_features(self, df):
        """
        Build the float32 feature matrices for the three node types.

        Customer and merchant features are the per-node mean of their feature
        columns. Row ``i`` of each matrix belongs to the ``i``-th node in order
        of first appearance in ``df``, i.e. the same order as
        ``df[column].unique()``. Transaction features are taken row by row from
        ``df`` without aggregation.

        The aggregation is one ``groupby`` per node type instead of one boolean
        mask over the whole frame per unique node, which was quadratic in
        practice on large inputs.

        Args:
            df: DataFrame with ``customer_node``, ``merchant_node`` and all
                feature columns listed below.

        Returns:
            Tuple ``(customer_features, merchant_features,
            transaction_features)`` of ``torch.float32`` tensors.
        """
        customer_features_list = ["customer_avg_amt", "customer_min_amt", "customer_amt_std"]
        merchant_features_list = ["merchant_avg_amt", "merchant_min_amt", "merchant_amt_std"]
        transaction_features_list = [
            "high_amt", "amt_ratio_merchant", "sqrt_amt", "amt", "amt_diff_customer_avg",
            "hour_cos", "amt_per_city_pop", "merchant_category_fraud_risk"
        ]

        def per_node_mean(node_column, feature_columns):
            # sort=False keeps groups in order of first appearance, matching
            # Series.unique(); dropna=False keeps one row per unique key.
            means = df.groupby(node_column, sort=False, dropna=False)[feature_columns].mean()
            return torch.tensor(means.to_numpy(), dtype=torch.float32)

        customer_features = per_node_mean("customer_node", customer_features_list)
        merchant_features = per_node_mean("merchant_node", merchant_features_list)
        transaction_features = torch.tensor(df[transaction_features_list].values, dtype=torch.float32)

        logger.info("Node features created correctly.")

        # Debugging: report (but do not raise on) NaN and Inf values.
        checked = (
            ("customer_features", customer_features),
            ("merchant_features", merchant_features),
            ("transaction_features", transaction_features),
        )
        for label, features in checked:
            if not torch.isfinite(features).all():
                logger.error(f"NaN or Inf values found in {label}.")

        return customer_features, merchant_features, transaction_features

    def create_transaction_labels(self, df):
        """
        Create one label per unique transaction node.

        For every distinct ``transaction_node`` the ``is_fraud`` value of its
        first row is used. Labels are ordered by ascending ``transaction_node``
        (the same order a default ``groupby`` produces).

        Implemented as a stable sort followed by ``drop_duplicates`` rather
        than a Python loop over every group, which is very slow when each
        transaction is its own group.

        Args:
            df: DataFrame with ``transaction_node`` and ``is_fraud`` columns.

        Returns:
            ``torch.float32`` tensor of shape ``(num_unique_transactions, 1)``.
        """
        first_rows = (
            df[["transaction_node", "is_fraud"]]
            .dropna(subset=["transaction_node"])  # groupby ignores NaN keys too
            .sort_values("transaction_node", kind="stable")  # stable: keeps first row first
            .drop_duplicates(subset="transaction_node", keep="first")
        )

        y = torch.tensor(first_rows["is_fraud"].tolist(), dtype=torch.float32).view(-1, 1)
        return y

    def train_test_split_nodes(self, y):
        """
        Split the transaction nodes into train and test sets.

        Node positions ``0 .. len(y) - 1`` are split 80/20 with
        ``random_state=42``, stratified on the label values.

        Args:
            y: Label tensor; ``y.squeeze()`` is used as the stratification
                target.

        Returns:
            Tuple ``(train_mask, test_mask)`` of boolean tensors of length
            ``len(y)``, each ``True`` at the positions assigned to that split.
        """
        total_nodes = len(y)
        train_indices, test_indices = train_test_split(
            torch.arange(total_nodes),
            test_size=0.2,
            random_state=42,
            stratify=y.squeeze().numpy()
        )

        def mask_for(selected):
            # Start all-False, then switch on the chosen positions.
            mask = torch.zeros(total_nodes, dtype=torch.bool)
            mask[selected] = True
            return mask

        train_mask = mask_for(train_indices)
        test_mask = mask_for(test_indices)

        logger.info("Train-test split applied.")

        return train_mask, test_mask

    def describe_data_structure(self, data, filepath):
        """
        Write a plain-text summary of the graph object to ``filepath``.

        For each node type the shape and dtype of whichever of ``x``, ``y``,
        ``train_mask``, ``test_mask`` and ``n_id`` are present is listed; for
        each edge type the shape and dtype of ``edge_index`` is listed.

        Args:
            data: Graph object exposing ``node_types``, ``edge_types`` and
                item access by node/edge type.
            filepath: Destination file; it is overwritten.
        """
        reported_attributes = ("x", "y", "train_mask", "test_mask", "n_id")

        with open(filepath, 'w') as out:
            out.write("Data Object Structure:\n")

            for node_type in data.node_types:
                out.write(f"  Node type: {node_type}\n")
                store = data[node_type]
                for attribute in reported_attributes:
                    if not hasattr(store, attribute):
                        continue
                    value = getattr(store, attribute)
                    out.write(f"    {attribute}: {value.shape}, dtype={value.dtype}\n")

            for edge_type in data.edge_types:
                edge_index = data[edge_type].edge_index
                out.write(f"  Edge type: {edge_type}\n")
                out.write(f"    edge_index: {edge_index.shape}, dtype={edge_index.dtype}\n")

        logger.info(f"Data structure description saved to: {filepath}")

    def construct_graph(self):
        """
        Build, sanity-check and persist the heterogeneous transaction graph.

        Steps:
            1. Read the transformed CSV and derive node columns, edges, node
               features, labels and train/test masks.
            2. Remap customer and merchant edge endpoints from raw node ids to
               positional indices (order of first appearance), because the
               customer/merchant feature matrices and ``n_id`` are laid out in
               that order. Previously only merchants were remapped, so
               customer endpoints could point at the wrong feature row unless
               the raw ids already happened to be positional.
            3. Assemble the ``HeteroData`` object with forward and reverse
               edge types.
            4. Log diagnostics (bounds, dtypes, id ranges, shapes).
            5. Save the graph to ``config.graph_data_path`` and write
               ``graph_structure.txt`` and ``node_mapped_data.csv`` into
               ``config.root_dir``.

        Transaction endpoints are left as raw ``transaction_node`` values;
        an error is logged if they do not equal the row position, since
        transaction features are stored in row order.

        Returns:
            The constructed ``HeteroData`` object.
        """
        df = pd.read_csv(self.config.transformed_data_path)

        df = self.create_node_ids(df)
        customer_to_transaction_edges, transaction_to_merchant_edges = self.create_edge_indices(df)
        customer_features, merchant_features, transaction_features = self.create_node_features(df)
        y = self.create_transaction_labels(df)
        transaction_train_mask, transaction_test_mask = self.train_test_split_nodes(y)

        # Positional index of every row's customer / merchant. pd.factorize
        # numbers values in order of first appearance, which is the order of
        # Series.unique() used for the feature rows and for n_id below.
        customer_positions, _ = pd.factorize(df["customer_node"])
        merchant_positions, _ = pd.factorize(df["merchant_node"])
        customer_to_transaction_edges[0] = torch.as_tensor(customer_positions, dtype=torch.long)
        transaction_to_merchant_edges[1] = torch.as_tensor(merchant_positions, dtype=torch.long)

        data = HeteroData()
        data["customer"].x = customer_features
        data["merchant"].x = merchant_features
        data["transaction"].x = transaction_features
        data["customer", "transacts", "transaction"].edge_index = customer_to_transaction_edges
        data["transaction", "occurs_at", "merchant"].edge_index = transaction_to_merchant_edges
        # Reverse edge types are built from the already remapped forward edges.
        data["transaction", "transacted_by", "customer"].edge_index = customer_to_transaction_edges.flip(0)
        data["merchant", "related_to", "transaction"].edge_index = transaction_to_merchant_edges.flip(0)
        data["transaction"].y = y
        data["transaction"].train_mask = transaction_train_mask
        data["transaction"].test_mask = transaction_test_mask

        # Assign n_id attributes: raw node id for each positional index.
        data["customer"].n_id = torch.tensor(df["customer_node"].unique())
        data["merchant"].n_id = torch.tensor(df["merchant_node"].unique())
        data["transaction"].n_id = torch.tensor(df["transaction_node"].unique())

        # Debugging: transaction features are stored in row order, so the raw
        # transaction ids used in the edges must equal the row positions.
        transaction_endpoints = customer_to_transaction_edges[1]
        if not torch.equal(transaction_endpoints, torch.arange(transaction_endpoints.numel())):
            logger.error(
                "transaction_node values do not equal row positions; "
                "transaction features/labels may be misaligned with the edges."
            )

        # Debugging: Check edge indices and data types
        for edge_type, edge_index in data.edge_index_dict.items():
            src_node_type, _, dst_node_type = edge_type
            num_src_nodes = data[src_node_type].x.size(0)
            num_dst_nodes = data[dst_node_type].x.size(0)

            out_of_bounds = (
                (edge_index < 0).any()
                or (edge_index[0] >= num_src_nodes).any()
                or (edge_index[1] >= num_dst_nodes).any()
            )
            if out_of_bounds:
                logger.error(f"Edge index out of bounds for edge type {edge_type}")
            if edge_index.dtype != torch.long:
                logger.error(f"Edge index dtype is not torch.long for edge type {edge_type}")

        for node_type in data.node_types:
            if data[node_type].x.dtype != torch.float32:
                logger.error(f"{node_type} features dtype is not torch.float32")

        # Debugging: Check min/max node IDs and n_id values
        logger.info(f"Min customer_node: {df['customer_node'].min()}, Max customer_node: {df['customer_node'].max()}")
        logger.info(f"Min merchant_node: {df['merchant_node'].min()}, Max merchant_node: {df['merchant_node'].max()}")
        logger.info(f"Min transaction_node: {df['transaction_node'].min()}, Max transaction_node: {df['transaction_node'].max()}")

        logger.info(f"Customer n_id: {data['customer'].n_id.min()} to {data['customer'].n_id.max()}")
        logger.info(f"Merchant n_id: {data['merchant'].n_id.min()} to {data['merchant'].n_id.max()}")
        logger.info(f"Transaction n_id: {data['transaction'].n_id.min()} to {data['transaction'].n_id.max()}")

        # Debugging: Check feature and edge_index shapes
        for node_type in data.node_types:
            if hasattr(data[node_type], 'x'):
                logger.info(f"{node_type} features shape: {data[node_type].x.shape}")
        for edge_type in data.edge_types:
            logger.info(f"{edge_type} edge_index shape: {data[edge_type].edge_index.shape}")

        torch.save(data, self.config.graph_data_path)
        logger.info(f"Graph data saved to: {self.config.graph_data_path}")

        # Save graph structure description
        structure_save_path = os.path.join(self.config.root_dir, "graph_structure.txt")
        self.describe_data_structure(data, structure_save_path)

        # Save updated DataFrame (still holds the raw node ids, not positions)
        node_mapped_data_path = os.path.join(self.config.root_dir, "node_mapped_data.csv")
        df.to_csv(node_mapped_data_path, index=False)
        logger.info(f"Updated data frame saved to: {node_mapped_data_path}")

        return data

In [7]:
# Pipeline Execution
# Load the configuration, build the graph-construction config, run the
# constructor, and log success. Any failure is logged with its traceback and
# then re-raised so the notebook cell still fails visibly.

try:
    config = ConfigurationManager()
    graph_construction_config = config.get_graph_construction_config()
    graph_constructor = GraphConstructor(config=graph_construction_config)
    data = graph_constructor.construct_graph()  # reads the CSV, builds and saves the graph

    if data is not None:
        logger.info("Graph construction completed successfully.")

except Exception as e:
    logger.exception("An error occurred during graph construction.")
    raise e

[2025-03-24 09:32:01,495: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-24 09:32:01,497: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-24 09:32:01,499: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-24 09:32:01,500: INFO: common: created directory at: artifacts]
get_graph_construction_config method called
[2025-03-24 09:32:01,502: INFO: common: created directory at: artifacts/graph_construction]
[2025-03-24 09:32:02,857: INFO: 2814552714: New max transaction_node in df: 1295933]
[2025-03-24 09:32:02,883: INFO: 2814552714: Total transaction nodes: 1295934]
[2025-03-24 09:32:02,884: INFO: 2814552714: Node indices created successfully.]
[2025-03-24 09:32:02,919: INFO: 2814552714: Customer-to-Transaction edges shape: torch.Size([2, 1295934])]
[2025-03-24 09:32:02,919: INFO: 2814552714: Transaction-to-Merchant edges shape: torch.Size([2, 1295934])]
[2025-03-24 09:32:06,205: INFO: 2814552714: Node features created correctly