In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sys
sys.path.append('..')
from Dataset_Loader_Node_Classification import Dataset_Loader
from gcn.models import GCN
from gcn.utils import accuracy

In [20]:
# In a new cell

# --- 1. Load Data ---
# Configure a Dataset_Loader for the Pubmed citation network and pull the graph
# tensors plus the train/val/test index sets out of what it returns.
import os

# Single source of truth for the dataset directory; it is assigned to both
# path attributes on the loader below.
pubmed_dir = '../stage_5_data/pubmed/'

data_loader = Dataset_Loader(dName='pubmed', dDescription='Pubmed citation network')
# NOTE(review): dataset_name presumably selects the Pubmed-specific
# train/val/test split inside Dataset_Loader -- confirm against the loader.
data_loader.dataset_name = 'pubmed'
data_loader.dataset_source_folder_path = pubmed_dir
data_loader.data_path = pubmed_dir

# Fail fast with a readable message instead of an error from inside load().
path_is_present = os.path.exists(data_loader.data_path)
if not path_is_present:
    raise FileNotFoundError(f"Data path does not exist: {data_loader.data_path}")

loaded_data = data_loader.load()

# load() returns a dict with a 'graph' entry and a 'train_test_val' entry.
graph_data = loaded_data['graph']
train_test_val_indices = loaded_data['train_test_val']

# Adjacency matrix, node features and node labels.
adj = graph_data['utility']['A']
features, labels = graph_data['X'], graph_data['y']

# Node index sets for the three splits.
idx_train = train_test_val_indices['idx_train']
idx_val = train_test_val_indices['idx_val']
idx_test = train_test_val_indices['idx_test']

# Quick sanity summary of what was loaded.
print("Data Loaded:")
print(f"  Features shape: {features.shape}")
print(f"  Adjacency matrix shape: {adj.shape}")
print(f"  Labels shape: {labels.shape}")
print(f"  Number of training samples: {len(idx_train)}")
print(f"  Number of validation samples: {len(idx_val)}")
print(f"  Number of testing samples: {len(idx_test)}")

Loading pubmed dataset...
Data Loaded:
  Features shape: torch.Size([19717, 500])
  Adjacency matrix shape: torch.Size([19717, 19717])
  Labels shape: torch.Size([19717])
  Number of training samples: 60
  Number of validation samples: 300
  Number of testing samples: 1000


In [21]:
# In a new cell

# --- 2. Model, Optimizer, and Hyperparameters ---
# Hyperparameters (similar to _Pubmed/train.py)
seed = 42            # fixed RNG seed so a fresh "Restart & Run All" reproduces the run
n_epochs = 200
lr = 0.01
weight_decay = 5e-4
hidden_units = 16
dropout_rate = 0.5
cuda_available = torch.cuda.is_available()

# Seed PyTorch BEFORE the model is built: parameter initialisation and any
# dropout applied during training draw from this generator, so without a seed
# the reported accuracies change from run to run.
torch.manual_seed(seed)

# Model initialization
# Input width is the number of feature columns; the number of classes is
# derived from the largest label value (labels are treated as 0-based ids).
n_features = features.shape[1]
n_classes = labels.max().item() + 1

model = GCN(nfeat=n_features,
            nhid=hidden_units,
            nclass=n_classes,
            dropout=dropout_rate)

# Adam with L2 regularisation supplied through weight_decay.
optimizer = optim.Adam(model.parameters(),
                       lr=lr, weight_decay=weight_decay)

# Move the model and every tensor it is indexed with onto the GPU when one is
# available, so they all live on the same device.
if cuda_available:
    model.cuda()
    features = features.cuda()
    adj = adj.cuda()
    labels = labels.cuda()
    idx_train = idx_train.cuda()
    idx_val = idx_val.cuda()
    idx_test = idx_test.cuda()

print("Model Initialized:")
print(model)

Model Initialized:
GCN(
  (gc1): GraphConvolution (500 -> 16)
  (gc2): GraphConvolution (16 -> 3)
)


In [22]:
# In a new cell

# --- 3. Training Loop ---
import time

def train_epoch(epoch):
    """Run one full-graph training step, then report validation metrics.

    Uses the notebook-level ``model``, ``optimizer``, ``features``, ``adj``,
    ``labels``, ``idx_train`` and ``idx_val``.

    Args:
        epoch: Zero-based epoch index; it is printed one-based.

    Returns:
        float: Validation loss measured after this epoch's parameter update.
    """
    t = time.time()

    # --- optimisation step on the training nodes ---
    model.train()
    optimizer.zero_grad()

    # The GCN model expects features and adjacency matrix; the forward pass
    # covers every node and the loss is restricted to the training indices.
    output = model(features, adj)

    # NOTE(review): nll_loss assumes the model emits log-probabilities -- confirm in gcn.models.
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])  # Using accuracy from utils

    loss_train.backward()
    optimizer.step()

    # --- validation with the freshly updated weights ---
    # eval() switches the module to inference mode; no_grad() additionally
    # stops autograd from recording this forward pass, which is never
    # back-propagated and would otherwise build a graph over the whole
    # dataset every epoch.
    model.eval()
    with torch.no_grad():
        output_val = model(features, adj)
        loss_val = F.nll_loss(output_val[idx_val], labels[idx_val])
        acc_val = accuracy(output_val[idx_val], labels[idx_val])

    print(f'Epoch: {epoch+1:04d}',
          f'loss_train: {loss_train.item():.4f}',
          f'acc_train: {acc_train.item():.4f}',
          f'loss_val: {loss_val.item():.4f}',
          f'acc_val: {acc_val.item():.4f}',
          f'time: {time.time() - t:.4f}s')
    return loss_val.item()

# Drive the full schedule: one train_epoch call per epoch, timed end to end.
print("Starting Training...")
t_total = time.time()

for epoch in range(n_epochs):
    train_epoch(epoch)

print("Optimization Finished!")
elapsed_total = time.time() - t_total
print(f"Total time elapsed: {elapsed_total:.4f}s")

Starting Training...
Epoch: 0001 loss_train: 1.1107 acc_train: 0.3500 loss_val: 1.1145 acc_val: 0.3633 time: 0.0372s
Epoch: 0002 loss_train: 1.0944 acc_train: 0.4167 loss_val: 1.1083 acc_val: 0.3367 time: 0.0226s
Epoch: 0003 loss_train: 1.0774 acc_train: 0.5000 loss_val: 1.1015 acc_val: 0.3400 time: 0.0198s
Epoch: 0004 loss_train: 1.0749 acc_train: 0.4500 loss_val: 1.0945 acc_val: 0.3467 time: 0.0204s
Epoch: 0005 loss_train: 1.0634 acc_train: 0.4833 loss_val: 1.0861 acc_val: 0.3733 time: 0.0203s
Epoch: 0006 loss_train: 1.0477 acc_train: 0.5167 loss_val: 1.0759 acc_val: 0.4300 time: 0.0199s
Epoch: 0007 loss_train: 1.0301 acc_train: 0.5667 loss_val: 1.0637 acc_val: 0.4867 time: 0.0203s
Epoch: 0008 loss_train: 1.0252 acc_train: 0.5667 loss_val: 1.0507 acc_val: 0.5333 time: 0.0198s
Epoch: 0009 loss_train: 0.9982 acc_train: 0.6000 loss_val: 1.0372 acc_val: 0.5533 time: 0.0227s
Epoch: 0010 loss_train: 0.9923 acc_train: 0.6333 loss_val: 1.0238 acc_val: 0.5600 time: 0.0218s
Epoch: 0011 loss_tr

In [23]:
# filepath: _Pubmed/main.ipynb
# In a new cell

# --- 4. Testing ---
def test_model():
    """Evaluate the trained model on the test split and print loss/accuracy.

    Uses the notebook-level ``model``, ``features``, ``adj``, ``labels`` and
    ``idx_test``. Prints the results and returns ``None``.
    """
    model.eval()
    # Pure inference: disable autograd so no computation graph is recorded
    # for a forward pass that is never back-propagated.
    with torch.no_grad():
        output = model(features, adj)
        loss_test = F.nll_loss(output[idx_test], labels[idx_test])
        acc_test = accuracy(output[idx_test], labels[idx_test])
    print("\nTest set results:",
          f"loss= {loss_test.item():.4f}",
          f"accuracy= {acc_test.item():.4f}")

# Final evaluation on the test indices (idx_test), run once after training.
print("\nEvaluating on Test Set...")
test_model()


Evaluating on Test Set...

Test set results: loss= 0.5498 accuracy= 0.7780
