In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch_geometric
from torch import nn
from torch_geometric.data import Data

from CryptoFraudDetection.utils.enums import LoggerMode
from CryptoFraudDetection.utils.logger import Logger

# Pick the best available accelerator: CUDA first, then Apple-silicon MPS, then CPU.
# `torch.backends.mps.is_available()` is the documented MPS availability check;
# the `torch.mps.is_available()` spelling is not present in every PyTorch release.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# Notebook-wide logger. Level is WARNING, so the debug()/info() calls made by the
# training helpers are presumably filtered out -- TODO confirm against Logger.
logger_ = Logger(
    name="graph_attention_network",
    level=LoggerMode.WARNING,
    log_dir="../logs",
)


In [2]:
# Reset both names first so a failed load never leaves a stale frame from a
# previous run of this cell in the kernel.
df_reddit = None
embedding_exists = None

# A missing parquet file is fatal for the rest of the notebook, so the
# FileNotFoundError raised by pandas is simply allowed to propagate.
df_reddit = pd.read_parquet(
    "../data/processed/reddit_posts_embedded.parquet",
    columns=[
        "id",
        "parent_id",
        "author",
        "score",
        "search_query",
        "subreddit",
        "test",
        "fraud",
        "embedding",
    ],
)
embedding_exists = True


In [3]:
# Display the first row to confirm the selected columns were loaded.
df_reddit.head(1)

Unnamed: 0,id,parent_id,author,score,search_query,subreddit,test,fraud,embedding
0,yxu5tv,,magus-21,1597,Safe Moon,r/CryptoCurrency,True,True,"[-0.4334820508956909, -0.4628458321094513, -0...."


In [4]:
# Min-max scale the post score into [0, 1]. The bounds are taken over the whole
# frame (train and test rows alike).
score_min = df_reddit["score"].min()
score_max = df_reddit["score"].max()
df_reddit["normalized_score"] = df_reddit["score"].sub(score_min).div(score_max - score_min)

# Summary statistics of the scaled column.
df_reddit["normalized_score"].describe()

count    502081.0
mean     0.052482
std      0.003728
min           0.0
25%      0.052263
50%      0.052285
75%      0.052328
max           1.0
Name: normalized_score, dtype: Float64

In [5]:
def _score_and_embedding(row):
    """Return one feature vector: the normalized score followed by the embedding."""
    score_part = np.array([row["normalized_score"]])
    embedding_part = np.array(row["embedding"])
    return np.concatenate([score_part, embedding_part], axis=0)


# Build the per-post feature vector, then drop the two columns it was built from.
df_reddit["features"] = df_reddit.apply(_score_and_embedding, axis=1)
df_reddit = df_reddit.drop(columns=["embedding", "normalized_score"])

df_reddit.head(1)

Unnamed: 0,id,parent_id,author,score,search_query,subreddit,test,fraud,features
0,yxu5tv,,magus-21,1597,Safe Moon,r/CryptoCurrency,True,True,"[0.08661773252685279, -0.4334820508956909, -0...."


In [6]:
# Split on the boolean "test" flag: flagged rows are held out, the rest are used for fitting.
df_test = df_reddit.loc[df_reddit["test"]]
df_train = df_reddit.loc[~df_reddit["test"]]

In [7]:
# This approach has been scrapped, since it creates hundreds of millions of edges and fills up my whole memory
# The idea was to create edges between all nodes that share the same author, subreddit or search query

# -----------------------------------------------------------------------------------------------------------

# def createedges_from_group(df, group_feature):
#     """Create edges by connecting all nodes within the same group without duplicates."""
#     edges = []
#     unique_groups = sorted(df[group_feature].unique())  # Sort groups for ordered processing
#
#     for group in tqdm(unique_groups, desc=f"Processing {group_feature}"):
#         # Filter IDs for the current group
#         group_ids = sorted(df[df[group_feature] == group]["id"].to_numpy())  # Sort node IDs
#
#         # Process connections for each node
#         while group_ids:  # Keep processing until all nodes in this group are handled
#             current_node = group_ids.pop(0)  # Take the first node and "pop" it from the list
#             for target_node in group_ids:  # Connect it with all remaining nodes
#                 edges.append((current_node, target_node))
#     return edges
#
#
# # Same-author, same-subreddit, same-search_query edges (bidirectional)
# edges_same_author = createedges_from_group(df_reddit, "author")
# edges_same_subreddit = createedges_from_group(df_reddit, "subreddit")
# edges_same_query = createedges_from_group(df_reddit, "search_query")


In [8]:
class GAT(nn.Module):
    """
    Thin wrapper around ``torch_geometric.nn.models.GAT`` with a sigmoid head.

    The wrapped model is built with ``v2=True``. ``forward`` squashes the wrapped
    model's output through a sigmoid, so callers receive values in (0, 1) rather
    than raw logits.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers) -> None:
        """
        Build the underlying PyG GAT model.

        Parameters
        ----------
        in_channels : int
            Size of each input node feature vector.
        hidden_channels : int
            Hidden size forwarded to the PyG model.
        out_channels : int
            Output size per node forwarded to the PyG model.
        num_layers : int
            Number of layers forwarded to the PyG model.

        """
        super().__init__()

        gat_config = {
            "in_channels": in_channels,
            "hidden_channels": hidden_channels,
            "out_channels": out_channels,
            "num_layers": num_layers,
            "v2": True,
        }
        self.gat = torch_geometric.nn.models.GAT(**gat_config)

    def forward(self, x, edge_index):
        """Run the wrapped GAT on ``(x, edge_index)`` and return its sigmoid."""
        raw_output = self.gat(x, edge_index)
        return raw_output.sigmoid()


class RedditDataset(Data):
    """
    Graph built from a DataFrame of Reddit posts.

    The frame is expected to carry the columns ``id``, ``parent_id``,
    ``features`` and ``fraud``. Nothing is converted until ``process()`` is
    called; until then ``x``, ``y`` and ``edge_index`` are ``None``.
    """

    def __init__(self, data, device) -> None:
        """Store the source frame and target device; tensors are created by ``process()``."""
        super().__init__()
        self.data = data
        self.x = None
        self.y = None
        self.edge_index = None
        self.device = device

    def process(self) -> None:
        """
        Convert the stored frame into node features, labels and edges.

        Post ids are replaced by their row position. An edge is created for every
        row whose ``parent_id`` refers to a post present in the frame; rows whose
        parent is missing or unknown contribute no edge.
        """
        # Work on a re-indexed copy so that row position == node index.
        frame = self.data.reset_index(drop=True)
        self.data = frame

        # Hash -> row position lookup (for duplicated ids the last row wins).
        id_mapping = {post_id: position for position, post_id in enumerate(frame["id"].values)}

        # Swap hashes for node indices; -1 marks "no parent inside this frame".
        frame["id"] = frame["id"].map(id_mapping).astype(int)
        frame["parent_id"] = frame["parent_id"].map(id_mapping).fillna(-1).astype(int)

        # One (post, parent) pair per row that has a known parent.
        has_parent = frame["parent_id"] != -1
        edges = frame.loc[has_parent, ["id", "parent_id"]].to_numpy()

        # Move everything onto the target device.
        feature_matrix = np.array(frame["features"].to_list())
        labels = frame["fraud"].to_list()
        self.x = torch.tensor(feature_matrix, dtype=torch.float).to(self.device)
        self.y = torch.tensor(labels, dtype=torch.float).to(self.device)
        # Transposed so that row 0 holds the post index and row 1 its parent index.
        self.edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous().to(self.device)

    def __len__(self) -> int:
        """Return the number of nodes (rows of ``x``); requires ``process()`` to have run."""
        return self.x.size(0)


In [9]:
def prepare_datasets(df, coin, device, balance_labels, logger_):
    """
    Prepare training and validation datasets for a leave-one-coin-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts with at least the ``search_query`` and ``fraud`` columns, plus
        whatever ``RedditDataset.process`` reads.
    coin : str
        Value of ``search_query`` to hold out for validation.
    device : torch.device
        Device handed to both datasets.
    balance_labels : bool
        When true, the larger label class of the fitting data is randomly
        downsampled (``random_state=42``) to the size of the smaller one.
    logger_ : Logger
        Logger used for debug output.

    Returns
    -------
    tuple
        ``(fit_dataset, val_dataset)``, both already processed.

    """
    logger_.debug(f"Preparing datasets for coin: {coin}")
    fit_data = df[df["search_query"] != coin]

    if balance_labels:
        fraud_rows = fit_data[fit_data["fraud"] == 1]
        legit_rows = fit_data[fit_data["fraud"] == 0]

        # Always shrink the larger class. Sampling the non-fraud rows
        # unconditionally would raise ValueError as soon as fraud rows are the
        # majority (cannot sample more rows than exist without replacement).
        if len(legit_rows) >= len(fraud_rows):
            legit_rows = legit_rows.sample(n=len(fraud_rows), random_state=42)
        else:
            fraud_rows = fraud_rows.sample(n=len(legit_rows), random_state=42)

        fit_data = pd.concat([fraud_rows, legit_rows])

    val_data = df[df["search_query"] == coin]

    fit_dataset = RedditDataset(fit_data, device)
    fit_dataset.process()

    val_dataset = RedditDataset(val_data, device)
    val_dataset.process()

    return fit_dataset, val_dataset


def prepare_model(device, in_channels, hidden_channels, out_channels, num_layers, lr, logger_):
    """
    Initialize the GAT model, loss function, and optimizer.

    Parameters
    ----------
    device : torch.device
        Device the model is moved to.
    in_channels, hidden_channels, out_channels, num_layers : int
        Architecture parameters forwarded to ``GAT``.
    lr : float
        Learning rate for the Adam optimizer.
    logger_ : Logger
        Logger used for debug output.

    Returns
    -------
    tuple
        ``(model, criterion, optimizer)``.

    """
    logger_.debug(
        "Initializing model with parameters: "
        f"in_channels={in_channels}, hidden_channels={hidden_channels}, "
        f"out_channels={out_channels}, num_layers={num_layers}, lr={lr}"
    )
    model = GAT(
        in_channels=in_channels,
        hidden_channels=hidden_channels,
        out_channels=out_channels,
        num_layers=num_layers,
    ).to(device)

    # GAT.forward already applies a sigmoid, so the loss must consume
    # probabilities. BCEWithLogitsLoss would apply a second sigmoid on top,
    # squeezing every prediction into (0.5, 0.73) and distorting the gradients.
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    return model, criterion, optimizer


def train_model(model, dataset, criterion, optimizer, logger_):
    """
    Run one full-graph optimization step.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(x, edge_index)``.
    dataset : object
        Provides ``x``, ``edge_index`` and ``y`` tensors.
    criterion : callable
        Loss called as ``criterion(predictions, targets)``.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model parameters.
    logger_ : Logger
        Logger used for debug output.

    Returns
    -------
    torch.Tensor
        Scalar training loss, detached from the autograd graph.

    """
    model.train()
    optimizer.zero_grad()

    # Drop only the trailing channel axis. A bare squeeze() would also remove the
    # node axis for a single-node graph and break the shape match with dataset.y.
    out = model(dataset.x, dataset.edge_index).squeeze(-1)
    loss = criterion(out, dataset.y)
    loss.backward()
    optimizer.step()

    logger_.debug(f"Training loss: {loss.item()}")
    # Callers only read the value, so hand back a tensor without autograd history.
    return loss.detach()


def validate_model(model, dataset, criterion, logger_):
    """
    Evaluate the model on the validation dataset.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(x, edge_index)``; its output is thresholded at
        0.5, i.e. it is treated as a probability.
    dataset : object
        Provides ``x``, ``edge_index`` and ``y`` tensors.
    criterion : callable
        Loss called as ``criterion(predictions, targets)``.
    logger_ : Logger
        Logger used for debug output.

    Returns
    -------
    tuple
        ``(val_loss, accuracy)`` where ``val_loss`` is a scalar tensor and
        ``accuracy`` is a NumPy float (fraction of nodes classified correctly).

    """
    model.eval()

    with torch.no_grad():
        # Drop only the trailing channel axis so a single-node graph keeps its
        # node axis and still lines up with dataset.y.
        out = model(dataset.x, dataset.edge_index).squeeze(-1)
        val_loss = criterion(out, dataset.y)

    predictions = out.cpu().numpy()
    targets = dataset.y.cpu().numpy()

    accuracy = ((predictions >= 0.5) * 1 == targets).mean()

    logger_.debug(f"Validation loss: {val_loss.item()}, Validation accuracy: {accuracy}")
    return val_loss, accuracy


def train_for_coin(
    coin,
    df,
    device,
    training_loops,
    in_channels,
    hidden_channels,
    out_channels,
    num_layers,
    lr,
    balance_labels,
    logger_,
):
    """
    Train and validate the model, leaving one coin out.

    All rows whose ``search_query`` differs from ``coin`` are used for fitting;
    the rows of ``coin`` are used for validation after every training step.

    Parameters
    ----------
    coin : str
        Coin (``search_query`` value) held out for validation.
    df : pandas.DataFrame
        Input dataset.
    device : torch.device
        Device for training.
    training_loops : int
        Number of train/validate iterations; must be at least 1.
    in_channels, hidden_channels, out_channels, num_layers : int
        Architecture parameters for the GAT model.
    lr : float
        Learning rate for the optimizer.
    balance_labels : bool
        Whether to balance the labels in the fitting dataset.
    logger_ : Logger
        Logger used for debug output.

    Returns
    -------
    Validation accuracy measured after the final training loop.

    Raises
    ------
    ValueError
        If ``training_loops`` is smaller than 1 (there would be no accuracy to return).

    """
    # Fail fast with a clear message, before any data is moved to the device.
    if training_loops < 1:
        msg = f"training_loops must be at least 1, got {training_loops}"
        raise ValueError(msg)

    logger_.debug(f"Training model - leaving out {coin}")

    # Prepare data
    fit_dataset, val_dataset = prepare_datasets(df, coin, device, balance_labels, logger_)

    # Prepare model
    model, criterion, optimizer = prepare_model(
        device,
        in_channels,
        hidden_channels,
        out_channels,
        num_layers,
        lr,
        logger_,
    )

    accuracy = None
    for i in range(training_loops):
        logger_.debug(f"Training loop {i + 1}/{training_loops}")

        # Training
        train_loss = train_model(model, fit_dataset, criterion, optimizer, logger_)

        # Validation
        val_loss, accuracy = validate_model(model, val_dataset, criterion, logger_)
        logger_.debug(
            f"End of loop {i + 1}: Training loss: {train_loss.item()}, Validation loss: {val_loss.item()}, Validation accuracy: {accuracy}"
        )

    return accuracy


def main(
    df,
    device,
    training_loops,
    in_channels,
    hidden_channels,
    out_channels,
    num_layers,
    lr,
    balance_labels,
    logger_,
):
    """
    Run leave-one-coin-out training for every coin in the dataset.

    Each distinct ``search_query`` value is held out once; a fresh model is
    trained on the remaining rows via ``train_for_coin``.

    Parameters
    ----------
        df (DataFrame): Input dataset; must contain a ``search_query`` column.
        device (torch.device): Device for training.
        training_loops (int): Number of training loops.
        in_channels (int): Input channels for the GAT model.
        hidden_channels (int): Number of hidden channels in the GAT model.
        out_channels (int): Number of output channels for the GAT model.
        num_layers (int): Number of layers in the GAT model.
        lr (float): Learning rate for the optimizer.
        balance_labels (bool): Whether to balance the labels in the fitting dataset.
        logger_ (logging.Logger): Logger object for logging.

    Returns
    -------
        dict: Maps each coin name to the validation accuracy obtained when that
        coin was held out, in order of first appearance in ``df``.

    """
    # Everything except the held-out coin is identical across runs.
    shared_args = (
        df,
        device,
        training_loops,
        in_channels,
        hidden_channels,
        out_channels,
        num_layers,
        lr,
        balance_labels,
        logger_,
    )

    return {
        held_out: train_for_coin(held_out, *shared_args).item()
        for held_out in df["search_query"].unique()
    }


In [10]:
# Smoke test: one short leave-one-coin-out pass over the training split to make
# sure the whole pipeline runs end to end. The returned accuracies are discarded.
# in_channels=769 presumably is the score plus a 768-dim embedding -- TODO confirm.
_ = main(
    df=df_train,
    device=device,
    logger_=logger_,
    in_channels=769,
    hidden_channels=256,
    out_channels=1,
    num_layers=10,
    training_loops=10,
    lr=0.0005,
    balance_labels=True,
)
# Release cached GPU memory before the long-running search below.
torch.cuda.empty_cache()
print("Success")

Success


In [11]:
def initialize_json_file(file_path):
    """
    Create the JSON results file with an empty list if it does not exist yet.

    Missing parent directories are created as well. An existing file is left
    untouched.

    Parameters
    ----------
    file_path : pathlib.Path or str
        Location of the JSON file.

    """
    file_path = Path(file_path)
    if file_path.exists():
        return

    # A fresh checkout may not have the target directory yet.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with file_path.open("w") as f:
        json.dump([], f)  # Start with an empty list


def append_to_json(file_path, data):
    """
    Append a new entry to the JSON list stored in ``file_path``.

    The updated list is written to a sibling temporary file which then replaces
    the original. This way the file never keeps stale trailing bytes from a
    longer previous version, and an interrupted write cannot leave a
    half-written results file behind.

    Parameters
    ----------
    file_path : pathlib.Path or str
        Existing JSON file containing a list.
    data : object
        JSON-serializable entry to append.

    """
    file_path = Path(file_path)

    with file_path.open("r") as f:
        existing_data = json.load(f)  # Load existing data
    existing_data.append(data)  # Add the new entry

    # Write next to the target so the final rename stays on the same filesystem.
    tmp_path = file_path.with_name(file_path.name + ".tmp")
    with tmp_path.open("w") as f:
        json.dump(existing_data, f, indent=4)
    tmp_path.replace(file_path)


def perform_hyperparameter_search(searches, file_path, logger_, df_train, device, main, seed=None):
    """
    Perform random hyperparameter searches and log results.

    For every search a random configuration is drawn, ``main`` is run with it,
    and the configuration together with the returned accuracies is appended to
    the JSON file. A CUDA out-of-memory error skips the current search; the CUDA
    cache is emptied after every search either way.

    Parameters
    ----------
    searches : int
        Number of random configurations to try.
    file_path : pathlib.Path
        JSON file the results are appended to.
    logger_ : Logger
        Logger used for progress and error messages.
    df_train : pandas.DataFrame
        Training data forwarded to ``main``.
    device : torch.device
        Device forwarded to ``main``.
    main : callable
        Training entry point; must return a JSON-serializable accuracies mapping.
    seed : int, optional
        Seed for the sampler. ``None`` (default) draws fresh entropy, so the
        sampled configurations differ between runs; pass an integer to make the
        sequence of configurations reproducible.

    """
    rng = np.random.default_rng(seed)

    for i in range(searches):
        try:
            # Randomly sample hyperparameters. `.item()` converts NumPy scalars to
            # plain Python values so the dict can be serialized to JSON.
            params = {
                "training_loops": rng.choice([500, 1000, 2000, 4000]).item(),
                "hidden_channels": rng.choice([8, 16, 32, 64, 128, 256]).item(),
                # Upper bound is exclusive: draws from 1..9.
                "num_layers": rng.integers(1, 10).item(),
                "lr": rng.choice([0.0001, 0.0005, 0.001, 0.005]).item(),
                "balance_labels": rng.choice([True, False]).item(),
            }
            logger_.info(f"Starting search {i + 1}/{searches}")
            logger_.info(f"Hyperparameters: {params}")

            # Run the main function to get accuracies
            accuracies = main(
                df_train,
                device,
                training_loops=params["training_loops"],
                in_channels=769,
                hidden_channels=params["hidden_channels"],
                out_channels=1,
                num_layers=params["num_layers"],
                lr=params["lr"],
                balance_labels=params["balance_labels"],
                logger_=logger_,
            )

            # Combine parameters and accuracies
            result = {**params, "accuracies": accuracies}

            # Append result to the JSON file
            append_to_json(file_path, result)

        except torch.cuda.OutOfMemoryError as e:  # noqa: PERF203
            logger_.error(f"Out of memory error: {e}, skipping search {i + 1}/{searches}")
        finally:
            torch.cuda.empty_cache()


# Main execution: results are appended to the report file one search at a time,
# so searches finished before an interruption remain on disk.
file_path = Path("../reports") / "hyperparameter_search_gat.json"
initialize_json_file(file_path)

perform_hyperparameter_search(
    main=main,
    device=device,
    df_train=df_train,
    logger_=logger_,
    file_path=file_path,
    searches=1000,
)


2025-01-01 00:30:28,804 - graph_attention_network - ERROR - Out of memory error: CUDA out of memory. Tried to allocate 478.00 MiB. GPU 0 has a total capacity of 11.60 GiB of which 52.50 MiB is free. Including non-PyTorch memory, this process has 11.46 GiB memory in use. Of the allocated memory 9.77 GiB is allocated by PyTorch, and 1.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables), skipping search 37/1000


2025-01-01 05:39:15,316 - graph_attention_network - ERROR - Out of memory error: CUDA out of memory. Tried to allocate 478.00 MiB. GPU 0 has a total capacity of 11.60 GiB of which 52.50 MiB is free. Including non-PyTorch memory, this process has 11.46 GiB memory in use. Of the allocated memory 9.77 GiB is allocated by PyTorch, and 1.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables), skipping search 72/1000


KeyboardInterrupt: 