### **Tokenizer and Encoder-only model**

In [11]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import copy
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.optim as optim
from Encoder import *

import sys
sys.path.append("../Processing/")  # folder containing your file
from utils import *  # or import specific functions

In [5]:
cora = pd.read_csv("../Hyperedges/Cora_dataset.csv")

# I need to parse all the embeddings first
for col in ['BoW', 'Abstract_W2V', 'bert_embedding']:
    cora[col] = cora[col].apply(parse_embedding_flexible)

In [12]:
cora = pd.read_csv("./Cora_dataset_new_clsBERT.csv")
for col in ['bert_cls_embedding2']:
    cora[col] = cora[col].apply(parse_embedding_flexible)

In [14]:
class CoraDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings.astype(np.float32)
        self.labels = labels.astype(np.int64)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx]), torch.tensor(self.labels[idx])
    

X = np.stack(cora['bert_cls_embedding2'].values)  
y = np.array(cora['topic2'].factorize()[0])  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = CoraDataset(X_train, y_train)
test_dataset  = CoraDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=16)

### **Build the model**

In [15]:
emb, label = train_dataset[0]
print(emb.shape, label)

torch.Size([768]) tensor(3)


In [16]:
model = TransformerEncoder(
    input_dim=emb.shape[0],      
    d_model=256,
    num_heads=8,
    num_layers=8,
    d_ff=2048,
    output_dim=7        
)

batch1 = next(iter(train_loader))
X, Y = batch1
print(f"First batch: {X.shape}, {Y.shape}")

logits = model(X) 
print(f"Model output shape: {logits.shape}") # batch size, num_classes

First batch: torch.Size([16, 768]), torch.Size([16])
Model output shape: torch.Size([16, 7])


In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

patience = 5
best_val_acc = 0.0
patience_counter = 0
best_model_state = None

num_epochs = 10

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_preds = []
    train_targets = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch
        y_batch = y_batch

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds = outputs.argmax(dim=1)
        train_preds.extend(preds.cpu().numpy())
        train_targets.extend(y_batch.cpu().numpy())

    train_acc = accuracy_score(train_targets, train_preds)
    avg_train_loss = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch
            y_batch = y_batch

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

            preds = outputs.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())

    val_acc = accuracy_score(val_targets, val_preds)
    avg_val_loss = val_loss / len(test_loader)

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f} | ")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model_state = copy.deepcopy(model.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        break

print(f"\nBest validation accuracy: {best_val_acc:.4f}")


Epoch 1/10 | Train Loss: 2.0292 | Train Acc: 0.2397 | Val Loss: 1.8316 | Val Acc: 0.3038 | 
Epoch 2/10 | Train Loss: 1.8879 | Train Acc: 0.2788 | Val Loss: 1.8319 | Val Acc: 0.3038 | 
Epoch 3/10 | Train Loss: 1.8949 | Train Acc: 0.2703 | Val Loss: 1.8322 | Val Acc: 0.3038 | 
Epoch 4/10 | Train Loss: 1.8823 | Train Acc: 0.2788 | Val Loss: 1.8521 | Val Acc: 0.3038 | 
Epoch 5/10 | Train Loss: 1.8800 | Train Acc: 0.2856 | Val Loss: 1.8552 | Val Acc: 0.3038 | 
Epoch 6/10 | Train Loss: 1.8732 | Train Acc: 0.2851 | Val Loss: 1.8387 | Val Acc: 0.3038 | 

Best validation accuracy: 0.3038
