# Imports

In [1]:
from nltk.corpus import stopwords
from datasets import load_dataset
import re
import concurrent.futures
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import spacy
from transformers import Trainer, TrainingArguments
from transformers import AlbertForSequenceClassification, DistilBertForSequenceClassification, BertForSequenceClassification
from transformers import DistilBertModel, AdamW
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Dataloaders

In [2]:
train_DB_inputs = torch.load('intermediates/DB_inputs.pt')
dev_DB_inputs = torch.load('intermediates/DB_dev_inputs.pt')

train_labels = torch.load('intermediates/labels.pt')
dev_labels = torch.load('intermediates/labels_dev.pt')

In [3]:
class CustomDataset(Dataset):
    def __init__(self, dB_inputs, labels):
        """
        Initialize the dataset.
        :param dB_inputs: Dictionary containing the tokenized inputs from the tokenizer (e.g., input_ids, attention_mask).
        :param labels: List or tensor containing the corresponding labels for each example.
        """
        self.dB_inputs = dB_inputs  # The tokenized inputs (input_ids, attention_mask, etc.)
        self.labels = labels  # The target labels (e.g., 0 or 1 for binary classification)

    def __len__(self):
        # Return the total number of samples
        return len(self.dB_inputs['input_ids'])

    def __getitem__(self, idx):
        # Return the sample corresponding to index `idx`
        input_ids = self.dB_inputs['input_ids'][idx]
        attention_mask = self.dB_inputs['attention_mask'][idx]
        label = self.labels[idx]
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label
        }


In [4]:
train_custom_dataset = CustomDataset(dB_inputs=train_DB_inputs, labels=train_labels)
train_custom_dataset = CustomDataset(dB_inputs=dev_DB_inputs, labels=dev_labels)

train_dataloader = DataLoader(train_custom_dataset, batch_size=64, shuffle=True)
dev_dataloader = DataLoader(train_custom_dataset, batch_size=64)

# Loading Pretrained and Finetuning model

In [5]:
distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [6]:
train_dataset = pd.read_csv('cleaned_dataset.csv')
dev_dataset = pd.read_csv('cleaned_dev_dataset.csv')

In [8]:
class CNNLSTMClassifier(nn.Module):
    def __init__(self, distilbert, cnn_out_channels=64, lstm_hidden_dim=64, num_classes=2):
        super(CNNLSTMClassifier, self).__init__()
        self.distilbert = distilbert
        self.cnn = nn.Conv1d(in_channels=768, out_channels=cnn_out_channels, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(cnn_out_channels, lstm_hidden_dim, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_dim, num_classes)
        
    def forward(self, input_ids, attention_mask):
        with torch.set_grad_enabled(self.distilbert.training):
            distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = distilbert_output.last_hidden_state.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        
        cnn_out = self.cnn(embeddings)
        
        lstm_out, _ = self.lstm(cnn_out.permute(0, 2, 1))  # (batch, seq_len, lstm_hidden_dim)
        
        logits = self.fc(lstm_out[:, -1, :])  # Use last hidden state for classification
        return logits

In [9]:
model = CNNLSTMClassifier(distilbert)

In [10]:
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5) #learing rate used by baseline from COLING 2025

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
model.to(device)

def train_model(model, dataloader, criterion, optimizer, num_epochs=1):
    for epoch in range(num_epochs):
        total_loss = 0
        for batches in tqdm(dataloader):
            input_ids = batches['input_ids'].to(device)
            attention_mask = batches['attention_mask'].to(device)
            labels = batches['labels'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')

    return model



Using device: cuda


In [11]:
def evaluate_model(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Validation Accuracy: {accuracy:.4f}')

In [None]:
DB_model = train_model(model, train_dataloader, criterion, optimizer)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  0%|          | 13/4090 [03:54<22:14:51, 19.64s/it]

In [None]:
torch.save(DB_model, 'intermediates/DB_model.pt')

In [None]:
evaluate_model(model, dev_dataloader)