# Test of GraphSAGE
- Uses DGL
- Predicts a label for each graph (`graphs`)
- The validation and test data are taken from the training dataset

In [39]:
!pip uninstall dgl

Found existing installation: dgl 1.1.1
Uninstalling dgl-1.1.1:
  Would remove:
    /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl-1.1.1.dist-info/*
    /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [1]:
import dgl
import json
import torch
import torch as th
import dgl.nn as dglnn
# from tqdm import tqdm
from tqdm.notebook import tqdm  # 使用 notebook 版本的 tqdm
import torch.nn as nn
from dgl.nn import GraphConv, GATConv, SAGEConv
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup

- Check the available GPUs and select the one with the most free memory

In [37]:
import subprocess
import torch

def get_free_gpu():
    """Return the index of the GPU that reports the most free memory.

    Runs ``nvidia-smi --query-gpu=memory.free`` and returns the position of the
    largest value in its output (the first one on ties).

    Returns:
        int: Index of the selected GPU, or 0 when ``nvidia-smi`` cannot be run
        or its output cannot be parsed (best-effort fallback).
    """
    command = "nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader"
    try:
        raw_output = subprocess.check_output(command.split())
        # One integer per GPU; blank lines are skipped so the result does not
        # depend on whether the output ends with a newline.
        memory_free_values = [
            int(line) for line in raw_output.decode('ascii').splitlines() if line.strip()
        ]
        return memory_free_values.index(max(memory_free_values))
    except Exception:
        # Deliberately broad: a missing nvidia-smi binary, a failing command or
        # unparsable/empty output all fall back to GPU 0. KeyboardInterrupt and
        # SystemExit are NOT swallowed.
        return 0

# Choose the compute device: fall back to the CPU when CUDA is unavailable,
# otherwise use the GPU index returned by get_free_gpu().
if not torch.cuda.is_available():
    print("there's no available GPU")
    device = torch.device("cpu")
else:
    best_gpu_id = get_free_gpu()
    device = torch.device(f"cuda:{best_gpu_id}")

# To pin a specific GPU manually instead: device = torch.device(f"cuda:{1}")
print(device)


cuda:1


## Fix the seed

In [11]:
import numpy as np
import torch
import random

#fix seed
def same_seeds(seed = 8787):
    """Seed the random number generators and make cuDNN deterministic.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus every CUDA device when CUDA is available), then disables cuDNN
    benchmarking and enables its deterministic mode.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

## Data Loader

In [4]:
class GraphDataset(Dataset):
    """Dataset that turns each raw record of ``data_list`` into a DGL graph and a label.

    Every record is indexed with the keys ``edge_index``, ``num_nodes``,
    ``node_feat``, ``edge_attr`` and ``label``. The graph, its features and
    the label are all moved to ``device`` when an item is fetched.
    """

    def __init__(self, data_list, device):
        self.data_list = data_list
        self.device = device

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Build and return ``(graph, label)`` for the record at position ``idx``."""
        record = self.data_list[idx]
        target_device = self.device

        # edge_index holds the source node ids at [0] and the destination ids at [1].
        src_nodes = th.tensor(record["edge_index"][0])
        dst_nodes = th.tensor(record["edge_index"][1])
        graph = dgl.graph((src_nodes, dst_nodes), num_nodes=record["num_nodes"])
        graph = graph.to(target_device)

        # Attach node and edge features under the 'feat' key.
        graph.ndata['feat'] = th.tensor(record["node_feat"]).to(target_device)
        graph.edata['feat'] = th.tensor(record["edge_attr"]).to(target_device)

        label = th.tensor(record["label"]).to(target_device)
        return graph, label


def collate(samples):
    """Combine a list of ``(graph, label)`` pairs into a single batch.

    Args:
        samples: List of ``(graph, label)`` pairs, as produced by the dataset.

    Returns:
        tuple: The graphs merged with ``dgl.batch`` and a tensor built from the labels.
    """
    graph_seq, label_seq = zip(*samples)
    batched_graph = dgl.batch(list(graph_seq))
    label_tensor = torch.tensor(list(label_seq))
    return batched_graph, label_tensor


In [5]:
# Read every split from its JSON-lines file (one JSON record per line) and
# wrap the parsed records in a GraphDataset bound to the selected device.
datasets = ['train', 'valid', 'test']
dataset_data = {}

for dataset_name in tqdm(datasets):
    file_path = f"../../data_processing/dgl/data/test_graph/repeated_{dataset_name}.jsonl"
    print(file_path)

    with open(file_path) as f:
        # Parse line by line while showing a progress bar for the file.
        data_list = list(map(json.loads, tqdm(f, position=0, leave=True)))

    dataset_data[dataset_name] = GraphDataset(data_list, device)

print("Datasets loaded!")

  0%|          | 0/3 [00:00<?, ?it/s]

../../data_processing/dgl/data/test_graph/repeated_train.jsonl


0it [00:00, ?it/s]

../../data_processing/dgl/data/test_graph/repeated_valid.jsonl


0it [00:00, ?it/s]

../../data_processing/dgl/data/test_graph/repeated_test.jsonl


0it [00:00, ?it/s]

Datasets loaded!


- Choose the batch size

In [6]:
def create_dataloaders(batch_size, shuffle=True):
    """Create one DataLoader for every split stored in ``dataset_data``.

    Args:
        batch_size (int): Number of graphs per batch.
        shuffle (bool): Passed to every DataLoader, including the non-training splits.

    Returns:
        dict: Maps each split name to its DataLoader; batches are built with ``collate``.
    """
    return {
        split_name: DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
        for split_name, split_dataset in dataset_data.items()
    }

dataloaders = create_dataloaders(4)
# Alternative batch size: dataloaders = create_dataloaders(16)

- Write every printed message to a log file as well

In [7]:
import datetime
import os

now = datetime.datetime.now()

# Timestamp taken when this cell runs; it becomes part of the log file name.
formatted_time = now.strftime("%m%d_%H:%M")

log_file_path = f"../log_message/{formatted_time}_GraphSAGE.log"

def add_log_msg(msg, log_file_path=log_file_path):
    """Append a timestamped message to the log file and print the same line.

    Args:
        msg: Message to record; it is formatted with an f-string.
        log_file_path (str): File to append to. Defaults to the path computed
            when this cell was executed.
    """
    # Build the line once so the file and the console show the same timestamp.
    timestamp = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    line = f'{timestamp}# {msg}'

    # Create the log directory on first use instead of failing inside open().
    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    with open(log_file_path, 'a') as f:
        f.write(f'{line}\n')
    print(line)

print(log_file_path)

../log_message/0816_16:24_GraphSAGE.log


### Model

In [21]:
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE network with a mean readout over each graph's nodes.

    Args:
        in_dim (int): Size of the input node features.
        hidden_dim (int): Output size of the first SAGEConv layer.
        out_dim (int): Output size of the second SAGEConv layer, i.e. the size
            of the per-graph output vector.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.layer1 = dglnn.SAGEConv(in_dim, hidden_dim, 'mean')
        self.layer2 = dglnn.SAGEConv(hidden_dim, out_dim, 'mean')  # Output layer

    def forward(self, g, inputs):
        """Return one output vector per graph in ``g``.

        Args:
            g: DGL graph (possibly a batch of graphs) to run the layers on.
            inputs: Node features passed to the first layer.

        Returns:
            The node outputs of the second layer averaged per graph with
            ``dgl.mean_nodes``.
        """
        h = self.layer1(g, inputs)
        h = torch.relu(h)
        h = self.layer2(g, h)  # No non-linearity after the output layer

        # local_scope keeps the temporary 'h' node feature from leaking onto
        # the caller's graph once the readout has been computed.
        with g.local_scope():
            g.ndata['h'] = h
            return dgl.mean_nodes(g, 'h')


- Model Forward  

In [22]:
def model_fn(data, model, criterion, device, count=1, type='train'):
    """Forward one batch through the model and compute loss and accuracy.

    Args:
        data: ``(batched_graph, labels)`` pair as produced by the data loader.
        model: Callable invoked as ``model(graph, node_features)``.
        criterion: Loss function called as ``criterion(logits, labels)``.
        device: Device the graph and the labels are moved to.
        count (int): Batch counter; labels and predictions are logged whenever
            it is a multiple of 5000.
        type (str): ``'train'``, ``'validation'`` or ``'test'``. The last two
            log the labels and predictions of every batch.

    Returns:
        tuple: ``(loss, accuracy, preds)`` where ``preds`` holds the arg-max
        class index of each sample.
    """
    batched_g, labels = data
    batched_g = batched_g.to(device)
    labels = labels.to(device)

    logits = model(batched_g, batched_g.ndata['feat'].float())
    # Only a 3-D output (e.g. multi-head attention) has an extra axis to
    # average away. Averaging dim 1 of a 2-D output would collapse the class
    # axis and break both the loss and the arg-max below.
    if logits.dim() == 3:
        logits = logits.mean(dim=1)

    loss = criterion(logits, labels)

    # Get the class id with the highest score
    preds = logits.argmax(1)

    # Fraction of samples in the batch that were predicted correctly
    accuracy = torch.mean((preds == labels).float())

    if type == 'validation':
        add_log_msg(f"labels of Validation: {labels} {labels.shape}")
        add_log_msg(f"predicted of Validation: {preds} {preds.shape}")

    elif type == 'test':
        add_log_msg(f"labels of Test: {labels} {labels.shape}")
        add_log_msg(f"predicted of Test: {preds} {preds.shape}")

    if count % 5000 == 0:
        add_log_msg(f"labels of {count}: {labels} {labels.shape}")
        add_log_msg(f"predicted of {count}: {preds} {preds.shape}")

    return loss, accuracy, preds

### Training

- Fix the seed and save the `model.state_dict()` that contains the initial weights

In [32]:
# Fix the seeds before constructing the model, then save the untrained
# model's state_dict to disk.
seed = 8787
same_seeds(seed)

model = GraphSAGE(in_dim=50, hidden_dim=16, out_dim=168)
torch.save(model.state_dict(), 'model3_initial(graphsage)/initial_weight.pth')

In [33]:
# Display the `fc_self` weight of the first layer of the freshly built model.
model.layer1.fc_self.weight

Parameter containing:
tensor([[ 0.3966, -0.2622, -0.4172, -0.2120,  0.0392, -0.1415,  0.4194,  0.1208,
         -0.0180, -0.4136, -0.3660,  0.2784,  0.1991, -0.0898,  0.3969, -0.1612,
          0.2863, -0.3886, -0.0421, -0.1185, -0.3744,  0.3962,  0.2371, -0.3675,
         -0.4098,  0.3475,  0.1936,  0.1666, -0.3460,  0.3121, -0.2045, -0.0577,
          0.4003, -0.1058,  0.1000, -0.1319,  0.1262, -0.1793, -0.3533,  0.2065,
         -0.3789,  0.1936,  0.2644, -0.4193,  0.2918,  0.1618,  0.0928, -0.0824,
         -0.1039,  0.0728],
        [ 0.2606, -0.4032,  0.3426,  0.3319, -0.1503, -0.0656, -0.1955, -0.2699,
         -0.1863,  0.4212,  0.1890, -0.3355, -0.1435,  0.0435,  0.2325,  0.1970,
          0.0711,  0.3621,  0.0238,  0.4196,  0.2668,  0.4046, -0.0977, -0.2018,
          0.1634, -0.1156,  0.4119,  0.2383, -0.0069,  0.0162, -0.0569, -0.2738,
          0.2987, -0.0467,  0.3767, -0.4006, -0.1762, -0.3149,  0.3337,  0.2914,
         -0.3179,  0.1646,  0.2352,  0.1848, -0.0304, -0.23

- Check that the model really loads the saved `state_dict`

In [34]:
# Build a new model with the same dimensions, load the saved state_dict into
# it, and display the first layer's `fc_self` weight again.
model = GraphSAGE(in_dim=50, hidden_dim=16, out_dim=168)
model.load_state_dict(torch.load('model3_initial(graphsage)/initial_weight.pth'))
model.layer1.fc_self.weight

Parameter containing:
tensor([[ 0.3966, -0.2622, -0.4172, -0.2120,  0.0392, -0.1415,  0.4194,  0.1208,
         -0.0180, -0.4136, -0.3660,  0.2784,  0.1991, -0.0898,  0.3969, -0.1612,
          0.2863, -0.3886, -0.0421, -0.1185, -0.3744,  0.3962,  0.2371, -0.3675,
         -0.4098,  0.3475,  0.1936,  0.1666, -0.3460,  0.3121, -0.2045, -0.0577,
          0.4003, -0.1058,  0.1000, -0.1319,  0.1262, -0.1793, -0.3533,  0.2065,
         -0.3789,  0.1936,  0.2644, -0.4193,  0.2918,  0.1618,  0.0928, -0.0824,
         -0.1039,  0.0728],
        [ 0.2606, -0.4032,  0.3426,  0.3319, -0.1503, -0.0656, -0.1955, -0.2699,
         -0.1863,  0.4212,  0.1890, -0.3355, -0.1435,  0.0435,  0.2325,  0.1970,
          0.0711,  0.3621,  0.0238,  0.4196,  0.2668,  0.4046, -0.0977, -0.2018,
          0.1634, -0.1156,  0.4119,  0.2383, -0.0069,  0.0162, -0.0569, -0.2738,
          0.2987, -0.0467,  0.3767, -0.4006, -0.1762, -0.3149,  0.3337,  0.2914,
         -0.3179,  0.1646,  0.2352,  0.1848, -0.0304, -0.23

### Test in which the validation and test parts are ``graph``s

- 60 APs in training x 10000 times
- 5 APs in validation x 4 times
- 3 APs in test x 4 times
- Batch size = 4

In [38]:
seed = 8787
same_seeds(seed)

model = GraphSAGE(in_dim=50, hidden_dim=16, out_dim=168)
# in_dim is the dimension of node_feat (50, since the nodes use 50-dim embeddings)
# out_dim is the number of categories -> 168 for our task
model.load_state_dict(torch.load('model3_initial(graphsage)/initial_weight.pth'))

model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

criterion = nn.CrossEntropyLoss()
total_steps = 180  # used below as the number of epochs

# Track the best validation loss for checkpointing and early stopping.
best_val_loss = float('inf')
# NOTE(review): patience (500) is larger than total_steps (180), so early
# stopping can never trigger with these values.
patience = 500  # Number of epochs with no improvement after which training will be stopped.
waiting = 0  # The number of epochs with no improvement so far.


# Training Part
for epoch in tqdm(range(total_steps)):
    # Train
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0

    count = 0

    for data in tqdm(dataloaders['train'], desc="Training", position=0, leave=True):

        count += 1
        loss, accuracy, _ = model_fn(data, model, criterion, device, count, type='train')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += accuracy.item()
        num_batches += 1

#     scheduler.step()
    add_log_msg(f"total count: {count}")

    avg_loss = total_loss / num_batches
    avg_accuracy = total_accuracy / num_batches

    add_log_msg(f'Epoch {epoch} | Train Loss: {avg_loss:.4f} | Train Accuracy: {avg_accuracy:.4f}')


    # Validation Part
    model.eval()
    total_accuracy = 0.0
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data in dataloaders['valid']:
            # Pass the mode as a keyword. The previous `type=='validation'`
            # compared the builtin `type` with a string and handed the
            # resulting False to `count`.
            loss, accuracy, _ = model_fn(data, model, criterion, device, type='validation')
            total_accuracy += accuracy.item()
            total_loss += loss.item()
            num_batches += 1

    avg_accuracy = total_accuracy / num_batches
    current_loss = total_loss / num_batches

    add_log_msg(f'Validation Loss: {current_loss:.4f} | Validation Accuracy: {avg_accuracy:.4f}\n')


    if current_loss < best_val_loss:
        best_val_loss = current_loss
        waiting = 0

#         torch.save(model.state_dict(), 'best_model.pth')
        # Store the epoch-level validation loss (a plain float), i.e. the value
        # the checkpoint was selected on, not the loss of the last batch.
        # NOTE(review): the directory is named "checkpoint_GAT" although this
        # notebook trains GraphSAGE - confirm the intended location.
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': current_loss,
                }, f"../checkpoint_GAT/best_model_{epoch}.pt")

    else:
        waiting += 1
        if waiting >= patience:
            add_log_msg("Early stopping")
            break

            
# Testing Part: evaluate the model currently in memory on the test loader.
model.eval()
total = 0
correct = 0

with torch.no_grad():
    for data in dataloaders['test']:
        # Pass the mode as a keyword. The previous `type=='test'` compared the
        # builtin `type` with a string and handed the resulting False to `count`.
        loss, accuracy, predicted = model_fn(data, model, criterion, device, type='test')
        labels = data[1].to(device)  # labels are the second element of the batch tuple

        add_log_msg(f"labels: {labels} {labels.shape}")
        add_log_msg(f"predicted: {predicted} {predicted.shape}")

        total += labels.size(0)  # labels.size(0) is the batch size
        # (predicted == labels).sum() counts the matches;
        # .item() converts the tensor to a regular Python number
        correct += (predicted == labels).sum().item()

# A single '%' is enough inside an f-string; '%%' was printed literally.
add_log_msg(f'Test Accuracy: {100 * correct / total} %')

  0%|          | 0/180 [00:00<?, ?it/s]

Training:   0%|          | 0/222500 [00:00<?, ?it/s]

DGLError: [16:38:02] /opt/dgl/src/runtime/c_runtime_api.cc:82: Check failed: allow_missing: Device API cuda is not enabled. Please install the cuda version of dgl.
Stack trace:
  [bt] (0) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x75) [0x7fd921d694e5]
  [bt] (1) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::DeviceAPIManager::GetAPI(std::string, bool)+0x1f2) [0x7fd9220e99e2]
  [bt] (2) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::DeviceAPI::Get(DGLContext, bool)+0x1e1) [0x7fd9220e3f91]
  [bt] (3) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator<long> >, DGLDataType, DGLContext)+0x13b) [0x7fd922106ebb]
  [bt] (4) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DGLContext const&) const+0xc3) [0x7fd922141213]
  [bt] (5) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0x3ef) [0x7fd92224eb8f]
  [bt] (6) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0xf6) [0x7fd922152676]
  [bt] (7) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(+0x52dfa6) [0x7fd922161fa6]
  [bt] (8) /workdir/home/euni/anaconda3/lib/python3.9/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7fd9220e8fa8]

