In [1]:
import os
# Move the working directory one level up so that the relative paths used by later
# cells resolve from there (presumably the project root, per the %pwd output below).
# NOTE(review): this is a relative chdir and therefore not idempotent - running this
# cell a second time in the same kernel moves up another level. Restart the kernel
# before re-running the notebook.
os.chdir("../")
# IPython magic: display the resulting working directory as the cell output.
%pwd

'd:\\Final-Year-Project\\Credit-Card-Fraud-Detection-Using-GNN'

In [2]:
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import torch
from torch_geometric.data import HeteroData

# Importing constants and utility functions
from Credit_Card_Fraud_Detection.constants import *
from Credit_Card_Fraud_Detection.utils.common import read_yaml, create_directories
from Credit_Card_Fraud_Detection import logger

In [3]:
# ====================================================
# ENTITY: GraphConstructionConfig
# ====================================================

@dataclass(frozen=True)
class GraphConstructionConfig:
    """Immutable set of filesystem locations used by the graph-construction stage.

    Instances are frozen: fields are set once at construction time and cannot
    be reassigned afterwards.

    Attributes:
        root_dir: Directory that holds this stage's outputs.
        transformed_data_path: Location of the transformed input data.
        graph_data_path: Location where the constructed graph is stored.
    """

    # Output directory of the stage.
    root_dir: Path
    # Input: transformed data that the graph is built from.
    transformed_data_path: Path
    # Output: serialized graph object.
    graph_data_path: Path




In [4]:
# ====================================================
# CONFIGURATION MANAGER
# ====================================================

class ConfigurationManager:
    """Loads the project's YAML configuration and builds per-stage config objects.

    On construction the config, params and schema YAML files are read and the
    artifacts root directory named in the config is created.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure the artifacts root directory exists
        create_directories([self.config.artifacts_root])

    def get_graph_construction_config(self) -> GraphConstructionConfig:
        """Build the graph-construction config and create its root directory.

        Returns:
            GraphConstructionConfig: The stage's paths, taken from the
            ``graph_construction`` section of the main config and converted to
            ``Path`` objects so they match the dataclass annotations.
        """
        # Debug trace goes through the project logger (not print) so it follows
        # the same format and level filtering as every other message.
        logger.debug("get_graph_construction_config method called")

        config = self.config.graph_construction
        create_directories([config.root_dir])

        return GraphConstructionConfig(
            root_dir=Path(config.root_dir),
            transformed_data_path=Path(config.transformed_data_path),
            graph_data_path=Path(config.graph_data_path),
        )


In [6]:
# ====================================================
# COMPONENT: Graph Construction
# ====================================================

class GraphConstructor:
    """
    Builds a heterogeneous customer / transaction / merchant graph from the
    transformed transaction table.

    Index contract (shared by every method of this class):
      * customer node ``i`` is the ``i``-th distinct ``customer_node`` value in
        order of first appearance,
      * merchant node ``i`` is the ``i``-th distinct ``merchant_node`` value in
        order of first appearance,
      * transaction node ``i`` is row ``i`` of the table.
    Feature rows, labels, ``n_id`` vectors and edge endpoints all follow this
    numbering, so an edge endpoint can be used directly as a row index into the
    matching feature tensor.
    """

    # Columns aggregated (mean per entity) into customer / merchant node features,
    # and columns copied row-by-row into transaction node features.
    CUSTOMER_FEATURES = ("customer_avg_amt", "customer_min_amt", "customer_amt_std")
    MERCHANT_FEATURES = ("merchant_avg_amt", "merchant_min_amt", "merchant_amt_std")
    TRANSACTION_FEATURES = (
        "high_amt", "amt_ratio_merchant", "sqrt_amt", "amt", "amt_diff_customer_avg",
        "hour_cos", "amt_per_city_pop",
    )

    def __init__(self, config):
        """
        Store the stage configuration.

        Args:
            config: Object exposing ``transformed_data_path``, ``graph_data_path``
                and ``root_dir`` (read by ``construct_graph``).
        """
        self.config = config
        logger.info("GraphConstructor initialized.")

    def create_node_ids(self, df):
        """
        Derive integer node-ID columns from the raw ID columns.

        ``transaction_unique``, ``customer_id`` and ``merchant_id`` are cast to
        ``int`` and exposed as ``transaction_node``, ``customer_node`` and
        ``merchant_node``; the three source columns are removed from the result.
        The input frame is left untouched, so calling this twice on the same
        frame is safe.

        Args:
            df (pd.DataFrame): Transaction table containing the raw ID columns.

        Returns:
            pd.DataFrame: New frame with the node-ID columns appended and the
            raw ID columns dropped.
        """
        logger.info("Assigning node IDs...")
        node_df = df.assign(
            transaction_node=df["transaction_unique"].astype(int),
            customer_node=df["customer_id"].astype(int),
            merchant_node=df["merchant_id"].astype(int),
        ).drop(columns=["customer_id", "merchant_id", "transaction_unique"])
        logger.info("Node IDs assigned successfully.")
        return node_df

    def create_edge_indices(self, df):
        """
        Build the customer->transaction and transaction->merchant edge lists.

        Endpoints are positional node indices (see the class docstring), not the
        raw ID values: raw IDs need not be contiguous or ordered like the feature
        rows, so using them directly would attach edges to the wrong rows or
        point past the end of the feature tensors.

        Args:
            df (pd.DataFrame): Frame with ``customer_node`` and ``merchant_node``
                columns; one row per transaction.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Two ``(2, len(df))`` ``torch.long``
            tensors: (customer index, transaction index) and
            (transaction index, merchant index).

        Raises:
            Exception: Any error raised while building the tensors is logged and
            re-raised unchanged.
        """
        logger.info("Creating edge indices...")

        try:
            # factorize numbers distinct values 0..n-1 in order of first appearance,
            # which is the same order Series.unique() uses for the feature rows.
            customer_codes, _ = pd.factorize(df["customer_node"])
            merchant_codes, _ = pd.factorize(df["merchant_node"])

            customer_index = torch.as_tensor(customer_codes, dtype=torch.long)
            merchant_index = torch.as_tensor(merchant_codes, dtype=torch.long)
            # Transaction features and labels are row-aligned, so row position is the node index.
            transaction_index = torch.arange(len(df), dtype=torch.long)

            customer_to_transaction_edges = torch.stack([customer_index, transaction_index])
            transaction_to_merchant_edges = torch.stack([transaction_index, merchant_index])
            logger.info("Edges created successfully.")
            return customer_to_transaction_edges, transaction_to_merchant_edges

        except Exception as e:
            logger.error(f"Error creating edge indices: {e}")  # Log error if edge creation fails
            raise

    def create_node_features(self, df):
        """
        Build the feature matrix of each node type.

        Customer and merchant features are the per-entity mean of their feature
        columns, with one row per distinct entity in order of first appearance.
        Transaction features are the feature columns of every row, in row order.

        Args:
            df (pd.DataFrame): Frame with the node-ID columns and all feature columns.

        Returns:
            tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ``float32`` tensors
            for customers, merchants and transactions (in that order).
        """
        logger.info("Generating node features...")

        customer_features = self._mean_features_per_entity(df, "customer_node", self.CUSTOMER_FEATURES)
        merchant_features = self._mean_features_per_entity(df, "merchant_node", self.MERCHANT_FEATURES)
        transaction_features = torch.tensor(
            df[list(self.TRANSACTION_FEATURES)].to_numpy(), dtype=torch.float32
        )

        logger.info("Node features generated successfully.")
        return customer_features, merchant_features, transaction_features

    @staticmethod
    def _mean_features_per_entity(df, node_column, feature_columns):
        """Mean of ``feature_columns`` per distinct ``node_column`` value, rows in first-appearance order."""
        first_appearance_order = df[node_column].unique()
        per_entity = (
            df.groupby(node_column)[list(feature_columns)]
            .mean()
            .reindex(first_appearance_order)  # groupby sorts by key; restore node order
        )
        return torch.tensor(per_entity.to_numpy(), dtype=torch.float32)

    def create_transaction_labels(self, df):
        """
        Build the fraud label column for transaction nodes.

        Args:
            df (pd.DataFrame): Frame with an ``is_fraud`` column.

        Returns:
            torch.Tensor: ``float32`` tensor of shape ``(len(df), 1)``, row-aligned
            with the transaction features.
        """
        logger.info("Creating transaction labels...")
        labels = torch.tensor(df["is_fraud"].values, dtype=torch.float32).view(-1, 1)
        logger.info("Transaction labels created successfully.")
        return labels

    def construct_graph(self):
        """
        Build the heterogeneous graph and persist it.

        Reads the CSV at ``config.transformed_data_path``, derives node IDs,
        edges, features and labels, assembles a ``HeteroData`` object with both
        forward and reverse edge types, saves it with ``torch.save`` to
        ``config.graph_data_path`` and writes the node-ID-augmented table to
        ``<config.root_dir>/node_mapped_data.csv``.

        Each node type carries an ``n_id`` vector holding the original ID value
        of every node, in node-index order.

        Returns:
            HeteroData: Constructed heterogeneous graph.
        """
        logger.info("Starting graph construction...")

        df = pd.read_csv(self.config.transformed_data_path)

        df = self.create_node_ids(df)
        customer_to_transaction_edges, transaction_to_merchant_edges = self.create_edge_indices(df)
        customer_features, merchant_features, transaction_features = self.create_node_features(df)
        y = self.create_transaction_labels(df)

        data = HeteroData()

        data["customer"].x = customer_features
        data["merchant"].x = merchant_features
        data["transaction"].x = transaction_features

        data["customer", "transacts", "transaction"].edge_index = customer_to_transaction_edges
        data["transaction", "occurs_at", "merchant"].edge_index = transaction_to_merchant_edges

        # Reverse edge types so information can flow in both directions.
        data["transaction", "transacted_by", "customer"].edge_index = customer_to_transaction_edges.flip(0)
        data["merchant", "related_to", "transaction"].edge_index = transaction_to_merchant_edges.flip(0)

        data["transaction"].y = y

        # Original ID of every node, in node-index order (same order as the feature rows).
        data["customer"].n_id = torch.tensor(df["customer_node"].unique(), dtype=torch.long)
        data["merchant"].n_id = torch.tensor(df["merchant_node"].unique(), dtype=torch.long)
        # One entry per row: identical to unique() when transaction IDs are unique,
        # and still aligned with x / y if they are not.
        data["transaction"].n_id = torch.tensor(df["transaction_node"].to_numpy(), dtype=torch.long)

        torch.save(data, self.config.graph_data_path)
        logger.info(f"Graph construction complete. Data saved at {self.config.graph_data_path}")

        node_mapped_data_path = os.path.join(self.config.root_dir, "node_mapped_data.csv")
        df.to_csv(node_mapped_data_path, index=False)
        logger.info(f"Updated data frame saved to: {node_mapped_data_path}")

        return data

In [7]:
# ===========================================
# GRAPH CONSTRUCTION PIPELINE
# ===========================================
# Builds the stage configuration, runs the graph constructor and reports the outcome.
# Any failure is logged with its traceback and re-raised so the cell visibly fails.
try:
    logger.info("Initializing configuration and graph constructor.")
    config = ConfigurationManager().get_graph_construction_config()
    graph_constructor = GraphConstructor(config)

    logger.info("Starting graph construction.")
    data = graph_constructor.construct_graph()

    # Compare against None explicitly: the else branch is about a missing result,
    # and truthiness of a container-like object may depend on its contents.
    if data is not None:
        logger.info("Graph construction successful. Data saved at: %s", config.graph_data_path)
    else:
        logger.warning("Graph construction returned None.")

except Exception as e:
    logger.exception("Error during graph construction: %s", str(e))
    raise


[2025-03-26 11:23:32,173: INFO: 65691384: Initializing configuration and graph constructor.]
[2025-03-26 11:23:32,175: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-26 11:23:32,181: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-26 11:23:32,182: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-26 11:23:32,184: INFO: common: created directory at: artifacts]
get_graph_construction_config method called
[2025-03-26 11:23:32,185: INFO: common: created directory at: artifacts/graph_construction]
[2025-03-26 11:23:32,186: INFO: 2254373509: GraphConstructor initialized.]
[2025-03-26 11:23:32,187: INFO: 65691384: Starting graph construction.]
[2025-03-26 11:23:32,188: INFO: 2254373509: Starting graph construction...]
[2025-03-26 11:23:33,486: INFO: 2254373509: Assigning node IDs...]
[2025-03-26 11:23:33,547: INFO: 2254373509: Node IDs assigned successfully.]
[2025-03-26 11:23:33,548: INFO: 2254373509: Creating edge indices...]
[2