# Test of GAT
- use DGL
- test some model

In [1]:
import dgl
import json
import torch
import torch as th
from tqdm import tqdm
import torch.nn as nn
from dgl.nn import GraphConv, GATConv
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup

- check the GPU and assign the GPU by the best memory usage

In [2]:
import subprocess
import torch

def get_free_gpu():
    try:
        # Run nvidia-smi command to get GPU details
        _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
        command = "nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader"
        memory_free_info = _output_to_list(subprocess.check_output(command.split())) 
        memory_free_values = [int(x) for i, x in enumerate(memory_free_info)]
        
        # Get the GPU with the maximum free memory
        best_gpu_id = memory_free_values.index(max(memory_free_values))
        return best_gpu_id
    except:
        # If any exception occurs, default to GPU 0 (this handles cases where nvidia-smi isn't installed)
        return 0

if torch.cuda.is_available():
    # Get the best GPU ID based on free memory and set it
    best_gpu_id = get_free_gpu()
    device = torch.device(f"cuda:{best_gpu_id}")
else:
    device = torch.device("cpu")

print(device)


cuda:1


## Fix the seed

In [3]:
import numpy as np
import torch
import random

#fix seed
def same_seeds(seed = 8787):
    torch.manual_seed(seed)
    # random.seed(seed) 
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  
    np.random.seed(seed)  
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

## Data Loader

In [4]:
class GraphDataset(Dataset):
    def __init__(self, data_list, device):
        self.data_list = data_list
        self.device = device

    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, idx):
        data = self.data_list[idx]

        g = dgl.graph((th.tensor(data["edge_index"][0]), th.tensor(data["edge_index"][1])), num_nodes=data["num_nodes"]).to(self.device)

        g.ndata['feat'] = th.tensor(data["node_feat"]).to(self.device)
        g.edata['feat'] = th.tensor(data["edge_attr"]).to(self.device)  # Add edge features to graph

        return g, th.tensor(data["label"]).to(self.device)


def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph, torch.tensor(labels)


In [9]:
datasets = ['train', 'valid', 'test']
dataloaders = {}

for dataset_name in tqdm(datasets):
#     file_path = f"../data/training_data/repeated_{dataset_name}.jsonl"
#     file_path = f"../data/test_10(500times)/repeated_{dataset_name}.jsonl"
    file_path = f"../../data_processing/dgl/data/test/repeated_{dataset_name}.jsonl"
    
    print(file_path)
    with open(file_path) as f:
#         data_list = [json.loads(line) for line in f]
        data_list = [json.loads(line) for line in tqdm(f, position=0, leave=True)]
    
    dataset = GraphDataset(data_list, device)
    dataloaders[dataset_name] = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collate)
    
print("Done!")

  0%|          | 0/3 [00:00<?, ?it/s]

../../data_processing/dgl/data/test/repeated_train.jsonl


130000it [00:29, 4419.04it/s]
 33%|███▎      | 1/3 [00:29<00:58, 29.42s/it]

../../data_processing/dgl/data/test/repeated_valid.jsonl


26it [00:00, 3963.36it/s]


../../data_processing/dgl/data/test/repeated_test.jsonl


26it [00:00, 2263.66it/s]
100%|██████████| 3/3 [00:29<00:00,  9.82s/it]

Done!





### Model
- Try teh model with 3 layers

In [16]:
class GAT(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, dropout_prob=0.25):
        super(GAT, self).__init__()
        
        # do not check the zero in_degree since we have all the complete graph
        self.layer1 = GATConv(in_dim, hidden_dim, num_heads=num_heads, activation=F.relu, allow_zero_in_degree=True)
        self.layer2 = GATConv(hidden_dim * num_heads, hidden_dim, num_heads=num_heads, allow_zero_in_degree=True)
        self.layer3 = GATConv(hidden_dim * num_heads, out_dim, num_heads=num_heads, allow_zero_in_degree=True)

#         self.layer2 = GATConv(hidden_dim * num_heads, hidden_dim, num_heads=1, allow_zero_in_degree=True)
#         self.layer3 = GATConv(hidden_dim * num_heads, out_dim, num_heads=1, allow_zero_in_degree=True)
         
        # Adding Batch Normalization after each GAT layer
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim * num_heads)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim * num_heads)
#         self.batchnorm3 = nn.BatchNorm1d(out_dim) # there's no need to use BN3
        
        # Adding Dropout for regularization
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, g, h):
        # Layer 1
        h1 = self.layer1(g, h)
        h1 = h1.view(h1.shape[0], -1)
        h1 = F.relu(h1)
        h1 = self.dropout(h1)
        
        # Layer 2
        h2 = self.layer2(g, h1)
        h2 = h2.view(h2.shape[0], -1)
        h2 = F.relu(h2)
        h2 = self.dropout(h2)

        # Layer 3
        h3 = self.layer3(g, h2).squeeze(1)
        h3 = self.dropout(h3)
        
        '''
        問題出現在 h3 = self.layer3(g, h2).squeeze(1)。
        在這裡，你應該得到一個形狀為 [N, num_heads, out_dim] 的tensor，但你使用了 squeeze(1)，
        如果 num_heads 是 1，你會得到 [N, out_dim]，這樣是沒問題的。
        但如果 num_heads 不是 1，那麼squeeze操作不會更改tensor的形狀，結果仍然是 [N, num_heads, out_dim]。
        因此，對這個tensor使用 batch normalization 會導致維度不匹配。
        '''
        # output layer so not need the BN
        # 不使用BN: GAT本身已經有注意力機制，所以BN不一定是必需的，尤其是在輸出層。
        # h3 = self.batchnorm3(h3)
        


        # Aggregate
        g.ndata['h_out'] = h3
        h_agg = dgl.mean_nodes(g, feat='h_out')
        return h_agg

    

- Model Forward  

In [17]:
def model_fn(data, model, criterion, device, count=1):
    """Forward a batch through the model."""
    batched_g, labels = data
#     print(batch_g)
    batched_g = batched_g.to(device)
    
    labels = labels.to(device)
    logits = model(batched_g, batched_g.ndata['feat'].float()) # for GAT
    logits = logits.mean(dim=1)
#     print(logits)
    
    loss = criterion(logits, labels)
#     print(batched_g.ndata['feat'].dtype)
#     print("Logits shape:", logits.shape)  # Expected: (batch_size, 168)
#     print("Labels shape:", labels.shape)  # Expected: (batch_size)

    # Get the class id with the highest probability.
    preds = logits.argmax(1)
    
    # Compute accuracy.
    accuracy = torch.mean((preds == labels).float())

#     return loss, accuracy
    return loss, accuracy, preds

### Training

- Fix the seed and save the model.state_dict that contains the initial weight

In [11]:
seed = 8787
same_seeds(seed)

model = GAT(in_dim=50, hidden_dim=16, out_dim=168, num_heads=8)
torch.save(model.state_dict(), 'model_initial/initial_weight.pth')

In [12]:
model.layer1.fc.weight

Parameter containing:
tensor([[-0.1806, -0.0598,  0.0091,  ...,  0.0719,  0.2496,  0.0873],
        [ 0.1694, -0.0015, -0.0139,  ...,  0.0147,  0.0892,  0.0146],
        [ 0.0969, -0.0595, -0.0115,  ..., -0.0474,  0.0529, -0.0565],
        ...,
        [-0.0433, -0.2248,  0.3002,  ...,  0.0850,  0.1621,  0.0422],
        [ 0.2097, -0.2492,  0.0612,  ..., -0.0041,  0.0365, -0.1483],
        [ 0.0971, -0.2221,  0.1652,  ..., -0.1312, -0.2610,  0.0077]],
       requires_grad=True)

- Check if model really load the model_dict

In [13]:
model = GAT(in_dim=50, hidden_dim=16, out_dim=168, num_heads=8)
model.load_state_dict(torch.load('model_initial/initial_weight.pth'))
model.layer1.fc.weight

Parameter containing:
tensor([[-0.1806, -0.0598,  0.0091,  ...,  0.0719,  0.2496,  0.0873],
        [ 0.1694, -0.0015, -0.0139,  ...,  0.0147,  0.0892,  0.0146],
        [ 0.0969, -0.0595, -0.0115,  ..., -0.0474,  0.0529, -0.0565],
        ...,
        [-0.0433, -0.2248,  0.3002,  ...,  0.0850,  0.1621,  0.0422],
        [ 0.2097, -0.2492,  0.0612,  ..., -0.0041,  0.0365, -0.1483],
        [ 0.0971, -0.2221,  0.1652,  ..., -0.1312, -0.2610,  0.0077]],
       requires_grad=True)

- 26 APs same as above x 5000 times and batch size = 4, model 2

In [18]:
seed = 8787
same_seeds(seed)

model = GAT(in_dim=50, hidden_dim=16, out_dim=168, num_heads=8)
# in_dim means the dimension of the node_feat(50 dim, since the 50-dim embedding)
# out_dim means the # of the categories -> 168 for out tasks
# model.load_state_dict(torch.load('model_initial/initial_weight.pth'))

model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

criterion = nn.CrossEntropyLoss()
total_steps = 18

# save the best model
best_val_loss = float('inf')
patience = 3  # Number of epochs with no improvement after which training will be stopped.
waiting = 0  # The number of epochs with no improvement so far.


# Training Part
for epoch in tqdm(range(total_steps)):
    # Train
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0
    
    count = 0 
    
    for data in tqdm(dataloaders['train'], position=0, leave=True):
        
        count += 1
        loss, accuracy, _ = model_fn(data, model, criterion, device, count)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += accuracy.item()
        num_batches += 1
        
#     scheduler.step()
    print(f"total count: {count}")
    
    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches

    print(f'Epoch {epoch} | Train Loss: {avg_loss:.4f} | Train Accuracy: {avg_accuracy:.4f}')

    
    # Validation Part
    model.eval()
    total_accuracy = 0.0
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for batched_g in dataloaders['valid']:
            loss, accuracy, _ = model_fn(batched_g, model, criterion, device)
            total_accuracy += accuracy.item()
            total_loss += loss.item()
            num_batches += 1

    avg_accuracy = total_accuracy / num_batches
    current_loss = total_loss / num_batches
    print(f'Validation Loss: {avg_loss:.4f} | Validation Accuracy: {current_loss:.4f}')
    
    
    if current_loss < best_val_loss:
        best_val_loss = current_val_loss
        waiting = 0
#         torch.save(model.state_dict(), 'best_model.pth')
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, f"../checkpoint_GAT/best_model_{epoch}.pt")
    
    else:
        waiting += 1
        if waiting >= patience:
            print("Early stopping")
            break

            
    # Save checkpoint
#     if epoch%20 == 0:
#         torch.save({
#                 'epoch': epoch,
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'loss': loss,
#                 }, f"../checkpoint_GAT/checkpoint_{epoch}.pt")
    

# Testing Part
model.eval()
total = 0
correct = 0

with torch.no_grad():
    for data in dataloaders['test']:
        loss, accuracy, predicted = model_fn(data, model, criterion, device)
        labels = data[1].to(device)  # Assuming labels are the second element in the tuple
        
        print(f"labels: {labels}", labels.shape)
        print(f"predicted: {predicted}", predicted.shape)
        
        total += labels.size(0) # label.size(0) is the batch size
        correct += (predicted == labels).sum().item() 
        # (predicted == labels).sum() would return how many of them are equal; 
        # .item() would make the tensor to the regular value
        
    print('Test Accuracy: %d %%' % (100 * correct / total))

100%|██████████| 16250/16250 [07:13<00:00, 37.50it/s]
  0%|          | 0/18 [07:13<?, ?it/s]

total count: 16250
Epoch 0 | Train Loss: 3.9531 | Train Accuracy: 0.0382
Validation Loss: 3.9531 | Validation Accuracy: 3.3935





NameError: name 'current_val_loss' is not defined