In [2]:
!pip install transformers datasets spacy torchdiffeq
!python -m spacy download en_core_web_sm

import torch
import torch.nn as nn
import numpy as np
import requests
import spacy
from transformers import RobertaTokenizer, RobertaModel
from datasets import load_dataset
from torchdiffeq import odeint
from tqdm import tqdm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
class Config:
    roberta_model = "roberta-base"
    max_length = 128
    batch_size = 8
    lstm_hidden = 256
    diffusion_steps = 25
    num_choices = 5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()


In [4]:
nlp = spacy.load("en_core_web_sm")

def get_conceptnet_edges_api(concept):
    # Query ConceptNet for related concepts (edges) for a given concept
    url = f"http://api.conceptnet.io/c/en/{concept}"
    try:
        obj = requests.get(url, timeout=3).json()
        edges = []
        for edge in obj.get('edges', []):
            if '/en/' in edge['end']['@id']:
                edges.append(edge['end']['label'])
        return list(set(edges))
    except Exception:
        return []


In [5]:
tokenizer = RobertaTokenizer.from_pretrained(config.roberta_model)

def augment_question_with_knowledge(question, question_concept):
    # Use only question_concept for knowledge lookup, not the answer
    q_concept = question_concept.replace(" ", "_").lower()
    q_knowledge = get_conceptnet_edges_api(q_concept)
    q_knowledge_text = " ".join(list(set(q_knowledge)))
    # Augment the question with ConceptNet knowledge
    aug_question = question + " " + q_knowledge_text if q_knowledge_text else question
    return aug_question

def preprocess_example(example):
    input_ids = []
    attention_mask = []
    aug_question = augment_question_with_knowledge(example["question"], example["question_concept"])
    for choice_text in example["choices"]["text"]:
        encoded = tokenizer(
            aug_question, choice_text,
            padding='max_length',
            max_length=config.max_length,
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'][0])
        attention_mask.append(encoded['attention_mask'][0])
    label = example["choices"]["label"].index(example["answerKey"]) if "answerKey" in example else -1
    return {
        "input_ids": torch.stack(input_ids),       # shape: (5, max_length)
        "attention_mask": torch.stack(attention_mask),
        "label": label
    }

def preprocess_dataset(dataset):
    processed = []
    for ex in tqdm(dataset, desc="Preprocessing"):
        processed.append(preprocess_example(ex))
    return processed
'''
# Load and preprocess datasets
raw_train = load_dataset("tau/commonsense_qa", split="train[:-1000]")
raw_valid = load_dataset("tau/commonsense_qa", split="train[-1000:]")
raw_test = load_dataset("tau/commonsense_qa", split="validation")
'''
# Load the full train split
full_train = load_dataset("tau/commonsense_qa", split="train")

# Create splits according to your specifications
raw_train = full_train.select(range(1000))        # First 1000 samples
raw_valid = full_train.select(range(1000, 1300))  # Next 300 samples (index 1000-1299)
raw_test = full_train.select(range(1300, 1400))   # Next 100 samples (index 1300-1399)

train_data = preprocess_dataset(raw_train)
valid_data = preprocess_dataset(raw_valid)
test_data = preprocess_dataset(raw_test)


Preprocessing: 100%|██████████| 1000/1000 [17:29<00:00,  1.05s/it]
Preprocessing: 100%|██████████| 300/300 [05:13<00:00,  1.05s/it]
Preprocessing: 100%|██████████| 100/100 [01:41<00:00,  1.01s/it]


In [6]:
from torch.utils.data import Dataset, DataLoader

class CSQADataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            "input_ids": item["input_ids"],
            "attention_mask": item["attention_mask"],
            "label": item["label"]
        }

train_loader = DataLoader(CSQADataset(train_data), batch_size=config.batch_size, shuffle=True)
valid_loader = DataLoader(CSQADataset(valid_data), batch_size=config.batch_size)
test_loader = DataLoader(CSQADataset(test_data), batch_size=config.batch_size)


In [7]:
class ODEDiffusion(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_size, hidden_size*2),
            nn.Tanh(),
            nn.Linear(hidden_size*2, hidden_size)
        )
        self.time_steps = torch.linspace(0, 1, config.diffusion_steps)
    
    def odefunc(self, t, y):
        return self.net(y)
    
    def forward(self, x):
        batch, seq, hidden = x.size()
        # Use reshape instead of view for non-contiguous tensors
        x_reshaped = x.reshape(-1, hidden)
        refined = odeint(self.odefunc, x_reshaped, self.time_steps.to(x.device), method='dopri5')[-1]
        return refined.reshape(batch, seq, hidden)


class HybridModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(config.roberta_model)
        self.bilstm = nn.LSTM(
            input_size=768,
            hidden_size=config.lstm_hidden // 2,
            bidirectional=True,
            batch_first=True
        )
        self.diffusion = ODEDiffusion(config.lstm_hidden)
        self.classifier = nn.Sequential(
            nn.Linear(config.lstm_hidden, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )
    def forward(self, input_ids, attention_mask):
        batch, num_choices, seq = input_ids.size()
        input_ids = input_ids.view(-1, seq)
        attention_mask = attention_mask.view(-1, seq)
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state  # (batch*num_choices, seq, 768)
        lstm_out, _ = self.bilstm(x)
        diffused = self.diffusion(lstm_out)
        pooled = diffused.mean(dim=1)  # (batch*num_choices, hidden)
        logits = self.classifier(pooled)  # (batch*num_choices, 1)
        logits = logits.view(batch, num_choices)
        return logits


In [8]:
def accuracy_fn(logits, labels):
    preds = logits.argmax(dim=1)
    return (preds == labels).float().mean().item()

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, total_acc = 0, 0
    for batch in loader:
        input_ids = batch["input_ids"].to(config.device)
        attention_mask = batch["attention_mask"].to(config.device)
        labels = batch["label"].to(config.device)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        acc = accuracy_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)

def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(config.device)
            attention_mask = batch["attention_mask"].to(config.device)
            labels = batch["label"].to(config.device)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            acc = accuracy_fn(logits, labels)
            total_loss += loss.item()
            total_acc += acc
    return total_loss / len(loader), total_acc / len(loader)


In [10]:
test_loss, test_acc = eval_epoch(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")


Test Loss: 1.6064 | Test Acc: 0.1827


In [11]:
epochs = 15  # Maximum number of epochs
patience = 5  # Number of epochs to wait for improvement in validation loss
best_val_loss = float('inf')
epochs_no_improve = 0

for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc = eval_epoch(model, valid_loader, criterion)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs due to no improvement in validation loss for {patience} consecutive epochs.")
            break

Epoch 1/15 | Train Loss: 1.6013 | Train Acc: 0.2340 | Val Loss: 1.6088 | Val Acc: 0.2829
Epoch 2/15 | Train Loss: 1.5855 | Train Acc: 0.2890 | Val Loss: 1.5852 | Val Acc: 0.2829
Epoch 3/15 | Train Loss: 1.4701 | Train Acc: 0.3660 | Val Loss: 1.5368 | Val Acc: 0.2895
Epoch 4/15 | Train Loss: 1.2257 | Train Acc: 0.5180 | Val Loss: 1.5865 | Val Acc: 0.3750
Epoch 5/15 | Train Loss: 0.8716 | Train Acc: 0.6940 | Val Loss: 1.7394 | Val Acc: 0.3717
Epoch 6/15 | Train Loss: 0.5686 | Train Acc: 0.8050 | Val Loss: 2.0080 | Val Acc: 0.3289
Epoch 7/15 | Train Loss: 0.3409 | Train Acc: 0.8850 | Val Loss: 2.4970 | Val Acc: 0.3355
Epoch 8/15 | Train Loss: 0.2373 | Train Acc: 0.9230 | Val Loss: 3.5361 | Val Acc: 0.3289
Early stopping triggered after 8 epochs due to no improvement in validation loss for 5 consecutive epochs.
