### loading the data 


In [21]:
import torch

# Check if CUDA (GPU support) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)
print("CUDA available:", torch.cuda.is_available())


Using device: cuda
CUDA available: True


In [4]:
!pip install torch_geometric
# !pip install torch
!pip install networkx
# !pip install torch-geometric

Collecting torch_geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Collecting aiohttp (from torch_geometric)
  Downloading aiohttp-3.12.13-cp310-cp310-win_amd64.whl.metadata (7.9 kB)
Collecting fsspec (from torch_geometric)
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting pyparsing (from torch_geometric)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting tqdm (from torch_geometric)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp->torch_geometric)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->torch_geometric)
  Using cached aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp->torch_geometric)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting attrs>=17.3.0 (from aiohttp->torch_geometric)
  Using cac

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.5.1 requires sympy==1.13.1, but you have sympy 1.13.3 which is incompatible.




In [13]:
import networkx as nx
from torch_geometric.utils import from_networkx

# Load the .graphml file
G_nx = nx.read_graphml("all_documents_newww.graphml")

# Optional: Convert node attributes to float tensors (if needed)
for node_id in G_nx.nodes:
    attrs = G_nx.nodes[node_id]
    for k, v in attrs.items():
        try:
            G_nx.nodes[node_id][k] = float(v)
        except:
            pass  # Skip non-numeric attributes

# Convert to PyTorch Geometric format
from torch_geometric.data import Data

data = from_networkx(G_nx)

# Now data is ready to be used with GAT
print(data)


Data(edge_index=[2, 0], Text=[480], ValueType=[480], EndsWithColon=[480], left_spacing=[480], right_spacing=[480], IsHorizontalNeighbourKey=[480], IsVerticalNeighbourKey=[480], Label=[480], num_nodes=480)


In [14]:
import torch
num_nodes = data.num_nodes
data.x = torch.eye(num_nodes)  # One-hot features
print(data.x)


tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]])


In [15]:
data.x.shape

torch.Size([480, 480])

In [None]:
import networkx as nx
import torch
from torch_geometric.utils import from_networkx

# Step 1: Load .graphml file
G = nx.read_graphml("all_documents_newww.graphml")

# Optional: convert node attributes to float (if needed)
for node in G.nodes:
    for key, val in G.nodes[node].items():
        try:
            G.nodes[node][key] = float(val)
        except:
            pass  # Skip non-numeric attributes

# Step 2: Convert to PyTorch Geometric Data
data = from_networkx(G)

# If node features are missing, create identity or random features
if not hasattr(data, 'x'):
    num_nodes = data.num_nodes
    data.x = torch.eye(num_nodes)  # one-hot as fallback
    # Or use: data.x = torch.rand(num_nodes, feature_dim)
# print(data.x)
# Step 3: Save to .pt file
torch.save(data, "graph_data.pt")
print("Saved as graph_data.pt")


None
Saved as graph_data.pt


In [15]:
import torch
from torch_geometric.data import Data
data = torch.load("graph_data.pt",weights_only=False)



In [35]:
# print(data)
print(data.edge_index)
# there is no edge index in the data

tensor([], size=(2, 0), dtype=torch.int64)


In [None]:
data.ValueType[0] # alphanumeric, # numeric etc.

'[0, 0, 0, 0, 0, 0, 0, 1, 0]'

### model testing

In [28]:
import json
import torch
import matplotlib.pyplot as plt
import pandas as pd
from torch.nn import CrossEntropyLoss
from torch_geometric.loader import DataLoader
from torch_geometric.nn.models import GAT
import os
from torch_geometric.data import Data
from torch_geometric.data.data import DataEdgeAttr, DataTensorAttr
from torch_geometric.data.storage import GlobalStorage
import torch.serialization

In [29]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())

2.5.1
12.1
True


In [52]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.5.1+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_scatter-2.1.2%2Bpt25cu121-cp310-cp310-win_amd64.whl (3.5 MB)
     ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
     ---------------------------------------- 3.5/3.5 MB 52.4 MB/s eta 0:00:00
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt25cu121


In [None]:
# # ✅ NodeFormer-style Graph Transformer for Node-Level Classification

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch_geometric.nn import knn_graph
# from torch_scatter import scatter_mean

# class NodeFormerLayer(nn.Module):
#     def __init__(self, in_dim, out_dim, k=16):
#         super().__init__()
#         self.k = k
#         self.attn_proj = nn.Linear(in_dim, out_dim)
#         self.val_proj = nn.Linear(in_dim, out_dim)
#         self.out_proj = nn.Linear(out_dim, out_dim)

#     def forward(self, x, batch):
#         # x: [N, F]  -- node features
#         # batch: [N] -- batch IDs

#         edge_index = knn_graph(x, self.k, batch=batch, loop=False)
#         row, col = edge_index

#         # Attention score between i and j
#         q = self.attn_proj(x)  # [N, D]
#         v = self.val_proj(x)

#         attn_score = (q[row] * q[col]).sum(dim=-1) / (q.size(-1) ** 0.5)  # [E]
#         attn_score = F.softmax(attn_score, dim=0)

#         # Weighted aggregation
#         out = attn_score.unsqueeze(-1) * v[col]  # [E, D]
#         out = scatter_mean(out, row, dim=0, dim_size=x.size(0))  # [N, D]

#         return self.out_proj(out) + x  # Residual


# class NodeFormer(nn.Module):
#     def __init__(self, in_dim, hidden_dim, out_dim, num_layers=2, k=16):
#         super().__init__()
#         self.input_proj = nn.Linear(in_dim, hidden_dim)
#         self.layers = nn.ModuleList([
#             NodeFormerLayer(hidden_dim, hidden_dim, k=k)
#             for _ in range(num_layers)
#         ])
#         self.classifier = nn.Linear(hidden_dim, out_dim)

#     def forward(self, x, batch):
#         x = self.input_proj(x)
#         for layer in self.layers:
#             x = layer(x, batch)
#         return self.classifier(x)


# # # Example usage:
# # if __name__ == '__main__':
# #     from torch_geometric.datasets import Planetoid
# #     from torch_geometric.loader import DataLoader
# #     from torch_geometric.utils import to_dense_batch
    
# #     dataset = Planetoid(root="./data", name="Cora")
# #     data = dataset[0]

# #     model = NodeFormer(in_dim=dataset.num_node_features, hidden_dim=64, out_dim=dataset.num_classes)
# #     optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# #     criterion = nn.CrossEntropyLoss()

# #     model.train()
# #     for epoch in range(100):
# #         optimizer.zero_grad()
# #         out = model(data.x, batch=torch.zeros_like(data.y))
# #         loss = criterion(out[data.train_mask], data.y[data.train_mask])
# #         loss.backward()
# #         optimizer.step()
# #         print(f"Epoch {epoch} | Loss: {loss.item():.4f}")


In [30]:

def smooth_curve(data, weight=0.9):
    smoothed = []
    last = data[0]
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed


def train_single_config(config, train_loader, val_loader, in_channels, num_classes, run_name, model_dir, results_dir, plots_dir):
    model = GAT(
        in_channels=in_channels,
        hidden_channels=config['hidden_channels'],
        num_layers=config['num_layers'],
        out_channels=num_classes,
        dropout=config['dropout'],
        heads=config['heads'],
        v2=True,
        edge_dim=1,
        jk='lstm'
    )
    # model = NodeFormer(in_dim=in_channels, hidden_dim=64, out_dim=4)

    all_labels = torch.cat([data.y for data in train_loader.dataset])
    class_counts = torch.bincount(all_labels, minlength=num_classes)
    class_weights = 1.0 / (class_counts.float() + 1e-6)
    class_weights = class_weights / class_weights.sum()

    criterion = CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

    training_loss, validation_loss, validation_acc = [], [], []

    for epoch in range(500):
        model.train()
        total_loss = 0
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
            # model = NodeFormer(in_dim=data.x, hidden_dim=64, out_dim=4)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        training_loss.append(avg_train_loss)

        model.eval()
        val_total_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for data in val_loader:
                out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
                loss = criterion(out, data.y)
                val_total_loss += loss.item()
                pred = out.argmax(dim=1)
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)

        avg_val_loss = val_total_loss / len(val_loader)
        val_accuracy = correct / total
        validation_loss.append(avg_val_loss)
        validation_acc.append(val_accuracy)

        scheduler.step(avg_val_loss)

        print(f"Epoch {epoch + 1:03d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(plots_dir, exist_ok=True)

    model_path = os.path.join(model_dir, f"{run_name}.pth")
    csv_path = os.path.join(results_dir, f"{run_name}.csv")
    plot_path = os.path.join(plots_dir, f"{run_name}.png")

    torch.save(model.state_dict(), model_path)

    df = pd.DataFrame({
        'Epoch': list(range(1, len(training_loss)+1)),
        'TrainLoss': training_loss,
        'ValLoss': validation_loss,
        'ValAcc': validation_acc
    })
    df.to_csv(csv_path, index=False)

    plt.figure()
    plt.plot(smooth_curve(training_loss), label='Train')
    plt.plot(validation_loss, label='Val')
    plt.title(run_name)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(plot_path)
    plt.close()




In [9]:
!pip install optuna

Collecting optuna
  Using cached optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.41-cp310-cp310-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.3-cp310-cp310-win_amd64.whl.metadata (4.2 kB)
Using cached optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading alembic-1.16.3-py3-none-any.whl (246 kB)
Downloading sqlalchemy-2.0.41-cp310-cp310-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------- ----- 1.8/2.1 MB 10.0 MB/s eta 0:00:01
   ---------------------------------------- 2.

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [57]:
data_list = torch.load("datacheckpoint_training_(15).pt", map_location='cuda', weights_only=False)
data_list

[Data(x=[103, 18], edge_index=[2, 79], edge_attr=[79, 1], y=[103]),
 Data(x=[111, 18], edge_index=[2, 84], edge_attr=[84, 1], y=[111]),
 Data(x=[35, 18], edge_index=[2, 25], edge_attr=[25, 1], y=[35]),
 Data(x=[76, 18], edge_index=[2, 56], edge_attr=[56, 1], y=[76]),
 Data(x=[256, 18], edge_index=[2, 194], edge_attr=[194, 1], y=[256]),
 Data(x=[48, 18], edge_index=[2, 36], edge_attr=[36, 1], y=[48]),
 Data(x=[92, 18], edge_index=[2, 70], edge_attr=[70, 1], y=[92]),
 Data(x=[138, 18], edge_index=[2, 106], edge_attr=[106, 1], y=[138]),
 Data(x=[40, 18], edge_index=[2, 30], edge_attr=[30, 1], y=[40]),
 Data(x=[81, 18], edge_index=[2, 60], edge_attr=[60, 1], y=[81]),
 Data(x=[113, 18], edge_index=[2, 86], edge_attr=[86, 1], y=[113]),
 Data(x=[185, 18], edge_index=[2, 143], edge_attr=[143, 1], y=[185]),
 Data(x=[85, 18], edge_index=[2, 64], edge_attr=[64, 1], y=[85]),
 Data(x=[73, 18], edge_index=[2, 56], edge_attr=[56, 1], y=[73]),
 Data(x=[54, 18], edge_index=[2, 41], edge_attr=[41, 1], y

In [51]:
len(data_list[0].x)

103

In [None]:
import torch
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from torch.nn import CrossEntropyLoss
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

# from your_model_file import GAT  # Replace with actual import
# from your_utils import smooth_curve  # Replace if defined elsewhere

def smooth_curve(data, weight=0.9):
    smoothed = []
    last = data[0]
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed



# Load data 
with torch.serialization.safe_globals([Data]):
    data_list = torch.load("training_data\Datacheckpoint_latest_22", map_location='cuda', weights_only=False)

labels = json.load(open("label_encoding.json"))
batch_size = 1

train_split = int(len(data_list) * 0.8)
train_data = data_list[:train_split]
val_data = data_list[train_split:]

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

in_channels = data_list[0].x.size(1)
# in_channels =18
num_classes = len(labels)

model_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\models\\model"
results_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\results"
plots_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\plots"
os.makedirs(model_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)
os.makedirs(plots_dir, exist_ok=True)

def objective(trial):
    config = {
        'hidden_channels': trial.suggest_categorical('hidden_channels', [64, 128, 256]),
        'num_layers': trial.suggest_int('num_layers', 1, 3),
        'heads': trial.suggest_categorical('heads', [1, 2, 4, 8]),
        'dropout': trial.suggest_float('dropout', 0.0, 0.5),
        'hidden_dim': trial.suggest_categorical('hidden_dim', [64, 128, 256])
    }

    model = GAT(
        in_channels=in_channels,
        hidden_channels=config['hidden_channels'],
        num_layers=config['num_layers'],
        out_channels=num_classes,
        dropout=config['dropout'],
        heads=config['heads'],
        v2=True,
        edge_dim=1,
        jk='lstm'
    ).to(device)  # 🚀 Move model to GPU
    # model = NodeFormer(in_dim=in_channels, hidden_dim=64, out_dim=4).to(device)

    all_labels = torch.cat([data.y for data in train_loader.dataset])
    class_counts = torch.bincount(all_labels, minlength=num_classes)
    class_weights = 1.0 / (class_counts.float() + 1e-6)
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)  # 🎯 Move weights to GPU

    criterion = CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

    best_val_acc = 0
    training_loss, validation_loss, validation_acc, training_acc = [], [], [], []
    best_model_state_path = None
    best_model_full_path = None

    # Early stopping parameters
    patience = 30  # Number of epochs to wait for improvement
    min_delta = 1e-4  # Minimum change to qualify as improvement
    wait = 0
    best_val_loss = float('inf')

    for epoch in range(500):
        model.train()
        total_loss = 0
        correct_train = 0
        total_train = 0
        for data in train_loader:
            data = data.to(device)  #  Move batch to GPU
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred_train = out.argmax(dim=1)
            correct_train += (pred_train == data.y).sum().item()
            total_train += data.y.size(0)

        avg_train_loss = total_loss / len(train_loader)
        training_loss.append(avg_train_loss)
        train_acc = correct_train / total_train if total_train > 0 else 0
        training_acc.append(train_acc)

        model.eval()
        val_loss = 0
        correct, total = 0, 0
        with torch.no_grad():
            for data in val_loader:
                data = data.to(device)  # 🚀 Move validation data to GPU
                out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
                loss = criterion(out, data.y)
                val_loss += loss.item()
                pred = out.argmax(dim=1)
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)

        avg_val_loss = val_loss / len(val_loader)
        val_acc = correct / total if total > 0 else 0
        validation_loss.append(avg_val_loss)
        validation_acc.append(val_acc)

        scheduler.step(avg_val_loss)
        print(f"Epoch {epoch + 1:03d} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")
        trial.report(val_acc, epoch)

        # Early stopping logic
        if avg_val_loss < best_val_loss - min_delta:
            best_val_loss = avg_val_loss
            wait = 0
            # Save best model at this point
            run_name = f"BestTrial_H{config['hidden_channels']}_L{config['num_layers']}_HD{config['heads']}_DO{int(config['dropout']*10)}"
            best_model_state_path = os.path.join(model_dir, f"{run_name}_best_state_dict.pth")
            best_model_full_path = os.path.join(model_dir, f"{run_name}_best_full.pt")
            torch.save(model.state_dict(), best_model_state_path)
            torch.save(model, best_model_full_path)
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1}.")
                break

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Save the best model's path for later use
    if best_model_state_path is not None:
        trial.set_user_attr("best_model_state_path", best_model_state_path)
    if best_model_full_path is not None:
        trial.set_user_attr("best_model_full_path", best_model_full_path)

    # Save only the final model loss curve plot
    run_name = f"BestTrial_H{config['hidden_channels']}_L{config['num_layers']}_HD{config['heads']}_DO{int(config['dropout']*10)}"
    plt.figure()
    plt.plot(training_loss, label='Train Loss')
    plt.plot(validation_loss, label='Validation Loss')
    plt.title(run_name)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(plots_dir, f"{run_name}_final_loss_curve.png"))
    plt.close()

    # Save loss/acc history to CSV (optional, keep for analysis)
    df = pd.DataFrame({
        'Epoch': list(range(1, len(training_loss)+1)),
        'TrainLoss': training_loss,
        'ValLoss': validation_loss,
        'TrainAcc': training_acc,
        'ValAcc': validation_acc
    })
    df.to_csv(os.path.join(results_dir, f"{run_name}.csv"), index=False)

    return max(validation_acc) if validation_acc else 0


In [25]:


# with torch.serialization.safe_globals([Data, DataEdgeAttr, DataTensorAttr, GlobalStorage]):
#     data_list = torch.load(f"DatacheckpointNew_Training.pt", map_location='cpu')

# labels = json.load(open("label_encoding.json"))
# batch_size = 1

# train_split = int(len(data_list) * 0.8)
# train_data = data_list[:train_split]
# val_data = data_list[train_split:]

# train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_data, batch_size=batch_size)

# in_channels = data_list[0].x.size(1)
# num_classes = len(labels)
# # num_classes = 4

#     # 🔧 Use only one configuration here:
# config = {
#         'hidden_channels':256,
#         'num_layers': 2,
#         'heads':8,
#         'dropout': 0.2
#     }

# run_name = f"SingleRun_H{config['hidden_channels']}_L{config['num_layers']}_HD{config['heads']}_DO{int(config['dropout']*10)}_Updated"

# model_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\models"
# results_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\results"
# plots_dir = "C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\plots"

# print(f"\n🚀 Starting {run_name}")
# train_single_config(config, train_loader, val_loader, in_channels, num_classes, run_name, model_dir, results_dir, plots_dir)


In [32]:
# # ✅ NodeFormer-style Graph Transformer for Node-Level Classification

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch_geometric.nn import knn_graph
# from torch_scatter import scatter_mean

# class NodeFormerLayer(nn.Module):
#     def __init__(self, in_dim, out_dim, k=16):
#         super().__init__()
#         self.k = k
#         self.attn_proj = nn.Linear(in_dim, out_dim)
#         self.val_proj = nn.Linear(in_dim, out_dim)
#         self.out_proj = nn.Linear(out_dim, out_dim)

#     def forward(self, x, batch):
#         # x: [N, F]  -- node features
#         # batch: [N] -- batch IDs

#         edge_index = knn_graph(x, self.k, batch=batch, loop=False)
#         row, col = edge_index

#         # Attention score between i and j
#         q = self.attn_proj(x)  # [N, D]
#         v = self.val_proj(x)

#         attn_score = (q[row] * q[col]).sum(dim=-1) / (q.size(-1) ** 0.5)  # [E]
#         attn_score = F.softmax(attn_score, dim=0)

#         # Weighted aggregation
#         out = attn_score.unsqueeze(-1) * v[col]  # [E, D]
#         out = scatter_mean(out, row, dim=0, dim_size=x.size(0))  # [N, D]

#         return self.out_proj(out) + x  # Residual


# class NodeFormer(nn.Module):
#     def __init__(self, in_dim, hidden_dim, out_dim, num_layers=2, k=16):
#         super().__init__()
#         self.input_proj = nn.Linear(in_dim, hidden_dim)
#         self.layers = nn.ModuleList([
#             NodeFormerLayer(hidden_dim, hidden_dim, k=k)
#             for _ in range(num_layers)
#         ])
#         self.classifier = nn.Linear(hidden_dim, out_dim)

#     def forward(self, x, batch):
#         x = self.input_proj(x)
#         for layer in self.layers:
#             x = layer(x, batch)
#         return self.classifier(x)


# # # Example usage:
# # if __name__ == '__main__':
# #     from torch_geometric.datasets import Planetoid
# #     from torch_geometric.loader import DataLoader
# #     from torch_geometric.utils import to_dense_batch
    
# #     dataset = Planetoid(root="./data", name="Cora")
# #     data = dataset[0]

# #     model = NodeFormer(in_dim=dataset.num_node_features, hidden_dim=64, out_dim=dataset.num_classes)
# #     optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# #     criterion = nn.CrossEntropyLoss()

# #     model.train()
# #     for epoch in range(100):
# #         optimizer.zero_grad()
# #         out = model(data.x, batch=torch.zeros_like(data.y))
# #         loss = criterion(out[data.train_mask], data.y[data.train_mask])
# #         loss.backward()
# #         optimizer.step()
# #         print(f"Epoch {epoch} | Loss: {loss.item():.4f}")


In [6]:
print(type(data_list))

<class 'list'>


In [70]:
# !pip uninstall -y torch-cluster
!pip install torch-cluster -f https://data.pyg.org/whl/torch-2.5.1+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu121.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu121/torch_cluster-1.6.3%2Bpt25cu121-cp310-cp310-win_amd64.whl (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
     ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
     ------------------- -------------------- 0.8/1.6 MB 2.2 MB/s eta 0:00:01
     ---------------------------------------- 1.6/1.6 MB 2.5 MB/s eta 0:00:00
Installing collected packages: torch-cluster
Successfully installed torch-cluster-1.6.3+pt25cu121


In [None]:
import pandas as pd
import os
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best Trial:")
# print("  Accuracy:", study.best_trial.value)
print("  Params:")
for k, v in study.best_trial.params.items():
    print(f"    {k}: {v}")

# === No need to retrain: Best model is already saved during Optuna search ===
best_trial = study.best_trial
print("Best model state_dict saved at:", best_trial.user_attrs["best_model_state_path"])
print("Best full model saved at:", best_trial.user_attrs["best_model_full_path"])

# Print best train and validation accuracy from CSV
run_name = f"BestTrial_H{best_trial.params['hidden_channels']}_L{best_trial.params['num_layers']}_HD{best_trial.params['heads']}_DO{int(best_trial.params['dropout']*10)}"
csv_path = os.path.join("results", f"{run_name}.csv")
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    best_train_acc = df['TrainAcc'].max() if 'TrainAcc' in df else None
    best_val_acc = df['ValAcc'].max() if 'ValAcc' in df else None
    print(f"Best Training Accuracy: {best_train_acc:.4f}" if best_train_acc is not None else "Best Training Accuracy: N/A")
    print(f"Best Validation Accuracy: {best_val_acc:.4f}" if best_val_acc is not None else "Best Validation Accuracy: N/A")
else:
    print(f"Could not find CSV file for best trial at {csv_path}")


[I 2025-07-14 15:16:02,943] A new study created in memory with name: no-name-a3ae3b11-ff0b-4341-9e0c-fb2bbd46cbab


Epoch 001 | Train Loss: 1.3842 | Val Loss: 1.3947
Epoch 002 | Train Loss: 1.3772 | Val Loss: 1.3921
Epoch 002 | Train Loss: 1.3772 | Val Loss: 1.3921
Epoch 003 | Train Loss: 1.3711 | Val Loss: 1.3875
Epoch 003 | Train Loss: 1.3711 | Val Loss: 1.3875
Epoch 004 | Train Loss: 1.3643 | Val Loss: 1.3850
Epoch 004 | Train Loss: 1.3643 | Val Loss: 1.3850
Epoch 005 | Train Loss: 1.3542 | Val Loss: 1.3814
Epoch 005 | Train Loss: 1.3542 | Val Loss: 1.3814
Epoch 006 | Train Loss: 1.3413 | Val Loss: 1.3739
Epoch 006 | Train Loss: 1.3413 | Val Loss: 1.3739
Epoch 007 | Train Loss: 1.3223 | Val Loss: 1.3667
Epoch 007 | Train Loss: 1.3223 | Val Loss: 1.3667
Epoch 008 | Train Loss: 1.2884 | Val Loss: 1.3547
Epoch 008 | Train Loss: 1.2884 | Val Loss: 1.3547
Epoch 009 | Train Loss: 1.2500 | Val Loss: 1.3411
Epoch 009 | Train Loss: 1.2500 | Val Loss: 1.3411
Epoch 010 | Train Loss: 1.2040 | Val Loss: 1.3252
Epoch 010 | Train Loss: 1.2040 | Val Loss: 1.3252
Epoch 011 | Train Loss: 1.1631 | Val Loss: 1.3104


[W 2025-07-14 15:16:14,717] Trial 0 failed with parameters: {'hidden_channels': 64, 'num_layers': 3, 'heads': 4, 'dropout': 0.06745096027957437, 'hidden_dim': 256} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\ENVGAT\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_28788\1925102268.py", line 99, in objective
    out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
  File "c:\Users\User\anaconda3\envs\ENVGAT\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "c:\Users\User\anaconda3\envs\ENVGAT\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "c:\Users\User\anaconda3\envs\ENVGAT\lib\site-packages\torch_geometric\nn\models\basic_gnn.py", line 254

Epoch 031 | Train Loss: 0.6383 | Val Loss: 1.0771


KeyboardInterrupt: 

In [36]:
# # Save best params to JSON
# best_params_path = os.path.join(model_dir, "best_params.json")
# with open(best_params_path, "w") as f:
#     json.dump(study.best_trial.params, f, indent=4)

# # ----------------- FINAL MODEL TRAINING ----------------------

# # Build model with best params
# best_params = study.best_trial.params

# final_model = GAT(
#     in_channels=in_channels,
#     hidden_channels=best_params['hidden_channels'],
#     num_layers=best_params['num_layers'],
#     out_channels=num_classes,
#     dropout=best_params['dropout'],
#     heads=best_params['heads'],
#     v2=True,
#     edge_dim=1,
#     jk='lstm'
# ).to(device)
# # final_model = NodeFormer(
# #     in_dim=in_channels,
# #     hidden_dim=best_params['hidden_dim'],
# #     out_dim=num_classes,
# #     num_layers=best_params['num_layers'],
# #     k=best_params['k']
# # ).to(device)
# # Loss and optimizer setup
# all_labels = torch.cat([data.y for data in train_loader.dataset])
# class_counts = torch.bincount(all_labels, minlength=num_classes)
# class_weights = 1.0 / (class_counts.float() + 1e-6)
# class_weights = class_weights / class_weights.sum()
# class_weights = class_weights.to(device)

# criterion = CrossEntropyLoss(weight=class_weights)
# optimizer = torch.optim.Adam(final_model.parameters(), lr=0.0005, weight_decay=5e-4)

# # Train final model
# train_losses = []
# val_losses = []
# best_val_loss = float('inf')
# best_model_path = os.path.join(model_dir, "GAT_full_model_best_001.pt")
# for epoch in range(500):
#     final_model.train()
#     train_loss = 0
#     for data in train_loader:
#         data = data.to(device)
#         optimizer.zero_grad()
#         out = final_model(data.x, data.edge_index, edge_weight=data.edge_attr)
#         # out = final_model(data.x, batch=None)
#         loss = criterion(out, data.y)
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()
#     avg_train_loss = train_loss / len(train_loader)
#     train_losses.append(avg_train_loss)

#     # Validation loss
#     final_model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for data in val_loader:
#             data = data.to(device)
#             out = final_model(data.x, data.edge_index, edge_weight=data.edge_attr)
#             loss = criterion(out, data.y)
#             val_loss += loss.item()
#     avg_val_loss = val_loss / len(val_loader)
#     val_losses.append(avg_val_loss)

#     # Save best model (lowest val loss)
#     if avg_val_loss < best_val_loss:
#         best_val_loss = avg_val_loss
#         torch.save(final_model.state_dict(), best_model_path)

#     print(f"[FINAL TRAIN] Epoch {epoch+1:03d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# # ----------------- SAVE FULL MODEL ----------------------

# full_model_path = os.path.join(model_dir, "GAT_full_model_2.pt")
# torch.save(final_model, full_model_path)
# print(f"✅ Full model saved to {full_model_path}")
# print(f"✅ Best model (lowest val loss) saved to {best_model_path}")

# # ----------------- PLOT TRAIN/VAL LOSS ----------------------
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8,5))
# plt.plot(train_losses, label='Train Loss')
# plt.plot(val_losses, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Final Model Training and Validation Loss')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig(os.path.join(plots_dir, "final_train_val_loss.png"))
# plt.show()

In [37]:
# data.x

In [46]:
import torch
import json
from torch_geometric.loader import DataLoader

# Load the saved full model
model = torch.load("C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\models\\BestTrial_H128_L3_HD1_DO3_best_full.pt")
model.eval()

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load label decoder (to map index -> class name)
label_mapping = json.load(open("label_encoding.json"))
index_to_label = {v: k for k, v in label_mapping.items()}  # reverse mapping

# Load the data you want to predict on
data_list = torch.load("C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\data\\background_verification\\datacheckpoint_1.pt", map_location='cuda', weights_only=False)
test_loader = DataLoader(data_list, batch_size=1, shuffle=False)

# Predict on each sample
predictions = []
true_labels = []
model.eval()
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, edge_weight=data.edge_attr)
        pred = out.argmax(dim=1).cpu().numpy()
        labels = [index_to_label[int(p)] for p in pred]
        predictions.extend(labels)
        # Collect true labels if available
        if hasattr(data, 'y'):
            true_labels.extend([index_to_label[int(y)] for y in data.y.cpu().numpy()])

# Only print comparison between true and predicted labels if available
if true_labels:
    print("Comparison of True vs Predicted Labels:")
    for i, (true_label, pred_label) in enumerate(zip(true_labels, predictions)):
        print(f"Sample {i+1}: True label = {true_label} | Predicted label = {pred_label}")
else:
    print("No true labels found in test data. Cannot compare.")

Comparison of True vs Predicted Labels:
Sample 1: True label = KEY | Predicted label = NON_RELATED
Sample 2: True label = NON_RELATED | Predicted label = VALUE
Sample 3: True label = NON_RELATED | Predicted label = NON_RELATED
Sample 4: True label = KEY | Predicted label = NON_RELATED
Sample 5: True label = NON_RELATED | Predicted label = NON_RELATED
Sample 6: True label = NON_RELATED | Predicted label = VALUE
Sample 7: True label = KEY | Predicted label = KEY
Sample 8: True label = NON_RELATED | Predicted label = VALUE
Sample 9: True label = NON_RELATED | Predicted label = OTHER_KEY
Sample 10: True label = NON_RELATED | Predicted label = OTHER_KEY
Sample 11: True label = KEY | Predicted label = NON_RELATED
Sample 12: True label = NON_RELATED | Predicted label = VALUE
Sample 13: True label = NON_RELATED | Predicted label = NON_RELATED
Sample 14: True label = NON_RELATED | Predicted label = OTHER_KEY
Sample 15: True label = KEY | Predicted label = KEY
Sample 16: True label = NON_RELATED

  model = torch.load("C:\\Users\\User\\OneDrive\\Desktop\\GAT-model testing\\GAT-test\\models\\BestTrial_H128_L3_HD1_DO3_best_full.pt")


In [37]:
data_list = torch.load("datacheckpoint_01 (1).pt", map_location='cuda', weights_only=False)
data_list[0]

Data(x=[19, 18], edge_index=[2, 13], edge_attr=[13, 1], y=[19])

In [None]:
from sklearn.metrics import accuracy_score

# Compute accuracy if true labels are available
if true_labels:
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Test Classification Accuracy: {accuracy:.4f}")
else:
    print("No true labels found in test data. Accuracy cannot be computed.")