In [16]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import Trainer, TrainingArguments

# ----------------------------------------------------------------------------
# Global Variables and Flags
# ----------------------------------------------------------------------------
TESTING_FLAG = True  # If True, print debug info
DOWNLOAD_FLAG = True # If True, handle NLTK data

# Every input/output file lives under DATA_DIR. Paths are assembled with
# os.path.join so they resolve on any OS instead of relying on the Windows-only
# "\\" separator (on Windows the resulting strings are unchanged).
DATA_DIR = "data"
NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
DEV_PATH   = os.path.join(DATA_DIR, "dev.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

BEST_MODEL_PATH = os.path.join(DATA_DIR, "taskB", "ED_B_Model.pt")
OUTPUT_PATH = os.path.join(DATA_DIR, "predictions.csv")

# Probability that a training row gets one extra synonym-augmented copy.
AUGMENTED_COPY_CHANCE = 0.15

# Candidate values for the hp.choice dimensions. hyperopt reports the *index*
# of the chosen option, so these lists are also needed to decode its result.
EPOCH_OPTIONS = [2, 3, 4]
BATCH_OPTIONS  = [4, 8, 16]
USE_FOCAL_OPTIONS = [False, True]

# Note: the model uses "gamma" only when use_focal_loss is True and
# "label_smoothing" only when it is False, yet both are always sampled.
SEARCH_SPACE = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", EPOCH_OPTIONS),
    "batch_size":      hp.choice("batch_size", BATCH_OPTIONS),
    "use_focal_loss":  hp.choice("use_focal_loss", USE_FOCAL_OPTIONS),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

MAX_EVALS = 10            # number of hyperopt trials
EVAL_BATCH_SIZE = 8
BEST_MODEL_METRIC = "f1"  # key returned by compute_metrics used to pick the best checkpoint

# BiLSTM architecture (fixed, not part of the search space)
GLOVE_PATH = os.path.join(DATA_DIR, "glove.6B.300d.txt")
EMBED_DIM = 300
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
USE_ATTENTION = True

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if TESTING_FLAG:
    print("Using device:", device)

Using device: cuda


In [16]:
# ----------------------------------------------------------------------------
# 1. Fetch NLTK data
# ----------------------------------------------------------------------------
if DOWNLOAD_FLAG:
    # nltk.download() signals failure through its return value rather than by
    # raising, so collect the packages that failed instead of always claiming
    # success. A failure is only reported (not raised) because the corpora may
    # already be available locally.
    failed_packages = [pkg for pkg in ("wordnet", "omw-1.4") if not nltk.download(pkg)]

    # Alternative for environments that need a custom data directory (e.g. Kaggle):
    #nltk.data.path.append(NLTK_DATA_DIR)
    #nltk.download("wordnet", download_dir=NLTK_DATA_DIR)
    #nltk.download("omw-1.4", download_dir=NLTK_DATA_DIR)
    #!unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
    #!unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/
    if failed_packages:
        print("WARNING: could not download NLTK packages:", failed_packages)
    else:
        print("Downloaded NLTK data")

Downloaded NLTK data


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# ----------------------------------------------------------------------------
# 2. Load Data
# ----------------------------------------------------------------------------
# Read each split and lower-case the two text column headers in one chained
# step, so no frame is mutated in place after it has been created.
train_df = pd.read_csv(TRAIN_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})
dev_df = pd.read_csv(DEV_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})
test_df = pd.read_csv(TEST_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})

if TESTING_FLAG:
    print("Train samples:", len(train_df))
    print("Dev samples:", len(dev_df))
    print("Test samples:", len(test_df))

# Only train and dev have their "label" column cast; test is left as read.
train_df = train_df.astype({"label": int})
dev_df = dev_df.astype({"label": int})


Train samples: 21508
Dev samples: 5926
Test samples: 4688


In [3]:
# ----------------------------------------------------------------------------
# 3. Data Augmentation (Synonym Replacement)
# ----------------------------------------------------------------------------
# Seed every RNG the notebook draws from, not just Python's `random`:
# `random` drives the augmentation below, NumPy drives the random fallback
# rows of the embedding matrix, and torch drives weight initialisation.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

def synonym_replacement(sentence, n=1):
    """
    Return `sentence` with up to `n` randomly chosen words replaced by a synonym.

    sentence: whitespace-separated text; sentences with fewer than two words
              are returned unchanged.
    n: maximum number of word positions to try; values <= 0 replace nothing.
    Returns: the sentence re-joined with single spaces. A chosen word is left
             as is when WordNet has no synset for it or when its first synset
             offers no lemma other than the word itself.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence
    # Clamp to [0, len(words)]: random.sample rejects a negative sample size.
    sample_size = max(0, min(n, len(words)))
    indices_to_replace = random.sample(range(len(words)), k=sample_size)
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        # WordNet joins multi-word lemmas with "_" (e.g. "ice_cream"); turn
        # them back into normal text so no artificial tokens enter the data.
        lemmas = [name.replace("_", " ") for name in syns[0].lemma_names()]
        lemmas = [l for l in lemmas if l.lower() != word.lower()]
        if not lemmas:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)

def augment_dataframe(df):
    """
    Return a new DataFrame holding every row of `df`, each optionally followed
    by one augmented copy.

    A copy is added with probability AUGMENTED_COPY_CHANCE; a coin flip then
    decides whether its "claim" or its "evidence" text goes through
    synonym_replacement (one word at most). All other columns are copied as is.
    """
    records = []
    for _, original in df.iterrows():
        records.append(original.to_dict())
        if random.random() >= AUGMENTED_COPY_CHANCE:
            continue
        variant = original.copy()
        target_column = "claim" if random.random() < 0.5 else "evidence"
        variant[target_column] = synonym_replacement(original[target_column], n=1)
        records.append(variant.to_dict())
    return pd.DataFrame(records)

# Augment the training split only; dev and test are not passed through
# augment_dataframe here.
augmented_train_df = augment_dataframe(train_df)
if TESTING_FLAG:
    print("Original train size:", len(train_df),
          "=> After augmentation:", len(augmented_train_df))

# NOTE: this rebinds train_df to the augmented data, so running this cell a
# second time would augment the already augmented frame. Re-run the data
# loading cell first if this cell has to be executed again.
train_df = augmented_train_df.reset_index(drop=True)

Original train size: 21508 => After augmentation: 24819


In [4]:
# ----------------------------------------------------------------------------
# 4. Create Hugging Face Datasets
# ----------------------------------------------------------------------------
# Convert the three pandas frames (in train/dev/test order) to Hugging Face
# Datasets, then group them under the split names used by the later cells.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "dev":   dev_dataset,
    "test":  test_dataset
})

if TESTING_FLAG:
    print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence'],
        num_rows: 4688
    })
})


In [5]:
# ----------------------------------------------------------------------------
# 5. Tokenization
# ----------------------------------------------------------------------------

# Only the tokenizer of this checkpoint is loaded here; its vocabulary is
# reused below to index the embedding matrix of the custom BiLSTM model.
TOKENIZER_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

def tokenize_function(examples):
    """
    Tokenize claim/evidence pairs with the module-level `tokenizer`.

    examples: mapping with "claim" and "evidence" entries, passed to the
              tokenizer as first and second text input respectively.
    Returns: whatever the tokenizer returns for these inputs; every pair is
             truncated and padded to a fixed length of 128.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    claims, evidences = examples["claim"], examples["evidence"]
    return tokenizer(claims, evidences, **encoding_options)

# Apply tokenize_function to all three splits in batched mode; the result
# keeps the original columns and adds the tokenizer outputs.
encoded_dataset = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

In [6]:
# ----------------------------------------------------------------------------
# 5.1. Format for PyTorch
# ----------------------------------------------------------------------------
# Only the labelled splits carry a "label" column; it is renamed to "labels"
# because that is the keyword the model's forward() accepts.
for split_name in ("train", "dev"):
    encoded_dataset[split_name] = encoded_dataset[split_name].rename_column("label", "labels")

# Drop the raw text columns everywhere and make every split return torch tensors.
for split_name in ("train", "dev", "test"):
    encoded_dataset[split_name] = encoded_dataset[split_name].remove_columns(["claim", "evidence"])
    encoded_dataset[split_name].set_format("torch")

In [6]:
# ----------------------------------------------------------------------------
# 6. Load GloVe embeddings & Build an Embedding Matrix
# ----------------------------------------------------------------------------
def load_glove_embeddings(glove_file, vocab, embedding_dim=300):
    """
    Build an embedding matrix for `vocab`, filled with GloVe vectors where available.

    Every row starts as N(0, 0.1) noise. A row is overwritten when its token,
    lower-cased and with the "##" marker removed, appears in the GloVe file.
    The file is streamed and only vectors for vocab words are parsed, so the
    full GloVe table is never held in memory.

    glove_file: path to a text file with one "<word> <v1> ... <vD>" entry per line
    vocab: a dict {token_string: token_index}
    embedding_dim: expected vector length D; lines of any other length are skipped
    Returns: (embedding_matrix, found) where embedding_matrix is a float32 numpy
             array [vocab_size, embedding_dim] and found is the number of vocab
             tokens that received a GloVe vector (0 if the file is missing).
    """
    embedding_matrix = np.random.normal(
        scale=0.1,
        size=(len(vocab), embedding_dim)
    ).astype(np.float32)
    found = 0

    if not os.path.isfile(glove_file):
        print(f"GloVe file not found at {glove_file}, using random init.")
        return embedding_matrix, found

    print(f"Loading GloVe from {glove_file}...")

    # Several vocab tokens can normalise to the same word (e.g. "play" and
    # "##play"), so map each normalised word to all rows it should fill.
    rows_by_word = {}
    for token, idx in vocab.items():
        normalized = token.replace("##", "").lower()
        rows_by_word.setdefault(normalized, []).append(idx)

    matched_words = set()
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            rows = rows_by_word.get(values[0])
            if rows is None:
                continue
            # If a word occurs more than once in the file, the last entry wins.
            embedding_matrix[rows] = np.asarray(values[1:], dtype="float32")
            matched_words.add(values[0])

    found = sum(len(rows_by_word[word]) for word in matched_words)

    print(f"Initialized embedding_matrix with {found} GloVe tokens matched out of {len(vocab)}")
    return embedding_matrix, found

# Align GloVe with the tokenizer vocabulary so that row i of the matrix is the
# vector for token id i, then wrap it as a tensor for CustomBiLSTMModel.
vocab_dict = tokenizer.get_vocab()  # {token_str: token_id}
embedding_matrix_np, glove_found = load_glove_embeddings(GLOVE_PATH, vocab_dict, EMBED_DIM)
embedding_matrix_tensor = torch.tensor(embedding_matrix_np)

Loading GloVe from data\glove.6B.300d.txt...
Initialized embedding_matrix with 26695 GloVe tokens matched out of 30522


In [7]:
# ----------------------------------------------------------------------------
# 7. Custom BiLSTM with optional attention
# ----------------------------------------------------------------------------
class SimpleAttention(nn.Module):
    """
    Additive self-attention pooling over BiLSTM outputs:
    score_t = v^T tanh(W h_t),
    then softmax over time steps,
    output = sum of weighted hidden states
    """
    def __init__(self, hidden_dim):
        """hidden_dim: per-direction LSTM size; inputs are expected to be 2*hidden_dim wide."""
        super().__init__()
        self.hidden_dim = hidden_dim

        self.W = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
        self.v = nn.Linear(2 * hidden_dim, 1, bias=False)

    def forward(self, lstm_outputs, mask=None):
        """
        lstm_outputs: (B, L, 2*hidden_dim)
        mask: (B, L) if needed (1 for real tokens, 0 for pad)
        Returns: (B, 2*hidden_dim) - the weighted sum
        """
        # Score calculation
        score = torch.tanh(self.W(lstm_outputs))  # (B, L, 2H)
        score = self.v(score).squeeze(-1)         # (B, L)

        if mask is not None:
            # Fill with the most negative value of the score dtype: a literal
            # -1e9 is not representable in float16 and fails under half precision.
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        attn_weights = F.softmax(score, dim=-1)   # (B, L)

        # Weighted sum: (B, 1, L) x (B, L, 2H) -> (B, 1, 2H) -> (B, 2H)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_outputs).squeeze(1)

        return context


class CustomBiLSTMModel(nn.Module):
    """
    Embedding -> multi-layer BiLSTM -> (attention pooling | final hidden states)
    -> linear classifier.

    forward() returns {"loss", "logits"}. The loss is focal loss when
    use_focal_loss is True, otherwise (label-smoothed) cross-entropy; gamma is
    only used by the former and label_smoothing only by the latter.
    """
    def __init__(self,
                 vocab_size,
                 embed_dim=300,
                 hidden_dim=256,
                 num_labels=2,
                 num_layers=2,
                 dropout=0.3,
                 use_attention=True,
                 use_focal_loss=False,
                 gamma=2.0,
                 label_smoothing=0.0,
                 embedding_matrix=None):
        """
        vocab_size / embed_dim: shape of the embedding table (token id 0 is padding)
        hidden_dim: per-direction LSTM size
        num_layers, dropout: LSTM depth and inter-layer dropout
        use_attention: pool with SimpleAttention instead of the final hidden states
        embedding_matrix: optional (vocab_size, embed_dim) tensor of initial weights
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_labels = num_labels

        self.use_attention = use_attention
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing

        # Embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            with torch.no_grad():
                self.embedding.weight.copy_(embedding_matrix)
                # copy_ overwrites the all-zero row nn.Embedding set up for
                # padding_idx; restore it so padding embeds to zeros even
                # when forward() is called without an attention_mask.
                self.embedding.weight[self.embedding.padding_idx].zero_()

        # BiLSTM with multiple layers & dropout.
        # Inter-layer dropout is meaningless for a single layer and makes
        # PyTorch emit a warning, so it is only enabled for num_layers > 1.
        self.lstm = nn.LSTM(
            input_size=self.embed_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True
        )

        # Optional attention
        if self.use_attention:
            self.attn = SimpleAttention(self.hidden_dim)

        # Classification head
        self.classifier = nn.Linear(2 * self.hidden_dim, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        input_ids: (B, L) token ids
        attention_mask: optional (B, L), 1 for real tokens and 0 for padding
        labels: optional (B,) class indices; when given, a loss is computed
        **kwargs: ignored extra inputs (e.g. token_type_ids from the tokenizer)
        Returns: {"loss": scalar tensor or None, "logits": (B, num_labels)}
        """
        # Embeddings
        embeds = self.embedding(input_ids)
        # zero out padding
        if attention_mask is not None:
            expand_mask = attention_mask.unsqueeze(-1).float()
            embeds = embeds * expand_mask

        # LSTM
        lstm_outputs, (h, c) = self.lstm(embeds)
        # shape of lstm_outputs: (B, L, 2H)

        if self.use_attention:
            # Weighted sum of outputs
            context = self.attn(lstm_outputs, mask=attention_mask)
        else:
            # h shape: (num_layers*2, B, H)
            h_forward = h[-2]  # last layer's forward state
            h_backward = h[-1] # last layer's backward state
            context = torch.cat((h_forward, h_backward), dim=-1)  # (B, 2H)

        logits = self.classifier(context)

        # Loss
        loss = None
        if labels is not None:
            if self.use_focal_loss:
                loss = self.focal_loss(logits, labels, self.gamma)
            else:
                loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)

        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        """Mean focal loss: cross-entropy scaled per sample by (1 - p_true)**gamma."""
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        """
        Cross-entropy against a smoothed target distribution: the true class
        gets 1 - smoothing and the rest is split evenly over the other classes.
        With smoothing == 0.0 this is plain cross-entropy.
        """
        if smoothing == 0.0:
            return F.cross_entropy(logits, targets)
        log_probs = F.log_softmax(logits, dim=-1)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            # max(..., 1) avoids a division by zero for a single-class head.
            true_dist.fill_(smoothing / max(n_class - 1, 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [11]:
# ----------------------------------------------------------------------------
# 8. Build Trainer & Hyperopt
# ----------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """
    Metric callback for the Trainer.

    eval_pred: a (logits, labels) pair; the predicted class is the argmax of
               the logits along axis 1.
    Returns: {"f1": weighted-average F1, "accuracy": accuracy}
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    weighted_f1 = f1_score(labels, predicted, average="weighted")
    return {"f1": weighted_f1, "accuracy": accuracy_score(labels, predicted)}

def make_trainer(model, train_ds, dev_ds, space):
    """
    Build a Trainer for one hyperparameter configuration.

    model: the model to train
    train_ds / dev_ds: training and evaluation datasets
    space: dict with "learning_rate", "epochs" and "batch_size"; the latter two
           are cast to int because sampled values may arrive as floats
    Returns: a Trainer that evaluates and checkpoints once per epoch and
             reloads the checkpoint with the best BEST_MODEL_METRIC at the end.
    """
    learning_rate = space["learning_rate"]
    epochs        = int(space["epochs"])
    batch_size    = int(space["batch_size"])

    training_args = TrainingArguments(
        output_dir="./enhanced-bilstm-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        load_best_model_at_end=True,
        metric_for_best_model=BEST_MODEL_METRIC,
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        logging_steps=1
    )

    # `processing_class` replaces the deprecated `tokenizer` keyword; the final
    # training cell of this notebook already uses it.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )
    return trainer

def objective(space):
    """
    Hyperopt objective: train a fresh CustomBiLSTMModel with the sampled
    hyperparameters and score it on the dev split.

    space: sampled configuration (see SEARCH_SPACE)
    Returns: {"loss": -F1, "status": STATUS_OK}; F1 is negated because
             hyperopt minimises the loss.
    """
    # Architecture comes from the module-level constants; only the loss
    # settings are taken from the sampled configuration.
    model = CustomBiLSTMModel(
        vocab_size=len(tokenizer.get_vocab()),
        embed_dim=EMBED_DIM,
        hidden_dim=HIDDEN_DIM,
        num_labels=2,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        use_attention=USE_ATTENTION,
        use_focal_loss=space["use_focal_loss"],
        gamma=space["gamma"],
        label_smoothing=space["label_smoothing"],
        embedding_matrix=embedding_matrix_tensor,
    )
    model.to(device)

    train_split = encoded_dataset["train"]
    dev_split = encoded_dataset["dev"]
    trainer = make_trainer(model, train_split, dev_split, space)
    trainer.train()
    f1 = trainer.evaluate(dev_split)["eval_f1"]

    if TESTING_FLAG:
        print(f"[Hyperopt] params={space} => F1={f1:.4f}")
    return {"loss": -f1, "status": STATUS_OK}

# Run the TPE search. `rstate` pins hyperopt's sampling so the same sequence
# of configurations is proposed on every run; without it each run explores
# different trials. (hyperopt >= 0.2.7 expects a numpy Generator here.)
trials = Trials()
best = fmin(
    fn=objective,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# For hp.choice dimensions `best` holds the index of the chosen option, not
# the option itself; the next cell decodes it.
if TESTING_FLAG:
    print("\nHyperopt best param indices:", best)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.042,0.038686,0.803801,0.805434
2,0.0339,0.037117,0.814423,0.814546
3,0.0284,0.037885,0.817896,0.821127
4,0.0245,0.041007,0.820118,0.823152


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.08212135447736467, 'learning_rate': 0.000128486801463992, 'use_focal_loss': True} => F1=0.8201
 10%|█         | 1/10 [10:14<1:32:08, 614.23s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0896,0.084731,0.755569,0.77219
2,0.08,0.082674,0.777375,0.784171
3,0.0769,0.085564,0.780069,0.788053


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.03241339306919242, 'learning_rate': 3.203722949774699e-05, 'use_focal_loss': True} => F1=0.7801
 20%|██        | 2/10 [17:17<1:06:56, 502.03s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0657,0.062078,0.739292,0.76409
2,0.0587,0.059637,0.761492,0.772865
3,0.0562,0.05928,0.763091,0.775059


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.13536990039247074, 'learning_rate': 2.9094017853862346e-05, 'use_focal_loss': True} => F1=0.7631
 30%|███       | 3/10 [21:32<45:22, 388.92s/trial, best loss: -0.8201183725656067]  

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4871,0.482049,0.797483,0.80189
2,0.3905,0.470738,0.813139,0.818934


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.029043148896219263, 'learning_rate': 0.00029653493053302253, 'use_focal_loss': False} => F1=0.8131
 40%|████      | 4/10 [31:51<47:59, 479.88s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2245,0.209646,0.80546,0.803915
2,0.1564,0.20079,0.814747,0.818259
3,0.109,0.254356,0.811642,0.813871


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.07636777850181752, 'learning_rate': 0.00039332294219049336, 'use_focal_loss': True} => F1=0.8147
 50%|█████     | 5/10 [39:07<38:39, 463.96s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5477,0.52289,0.789401,0.794128
2,0.4976,0.506956,0.80467,0.812352
3,0.4714,0.505083,0.80964,0.816909
4,0.4546,0.505331,0.814034,0.818765


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 2.5, 'label_smoothing': 0.08027873709560314, 'learning_rate': 0.00010711311564179699, 'use_focal_loss': False} => F1=0.8140
 60%|██████    | 6/10 [44:55<28:19, 424.76s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5801,0.565388,0.758739,0.775397
2,0.5465,0.558494,0.772179,0.785521


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.11296662853916198, 'learning_rate': 6.314723415788638e-05, 'use_focal_loss': False} => F1=0.7722
 70%|███████   | 7/10 [49:06<18:23, 367.88s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1697,0.15618,0.789546,0.792103
2,0.142,0.148525,0.802482,0.806446
3,0.1283,0.156273,0.806461,0.812184


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 1.5, 'label_smoothing': 0.15982432891442486, 'learning_rate': 8.199005032718306e-05, 'use_focal_loss': True} => F1=0.8065
 80%|████████  | 8/10 [59:39<15:04, 452.40s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0414,0.038841,0.804051,0.803577
2,0.0323,0.03809,0.807629,0.81404


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.021942683480999727, 'learning_rate': 0.00021051981518692482, 'use_focal_loss': True} => F1=0.8076
 90%|█████████ | 9/10 [1:04:38<06:44, 404.36s/trial, best loss: -0.8201183725656067]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5658,0.558422,0.774955,0.789234
2,0.5356,0.553152,0.777958,0.793621


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 1.0, 'label_smoothing': 0.09319025126344603, 'learning_rate': 4.2072403216879116e-05, 'use_focal_loss': False} => F1=0.7780
100%|██████████| 10/10 [1:14:39<00:00, 447.94s/trial, best loss: -0.8201183725656067]

Hyperopt best param indices: {'batch_size': np.int64(1), 'epochs': np.int64(2), 'gamma': np.float64(3.5), 'label_smoothing': np.float64(0.08212135447736467), 'learning_rate': np.float64(0.000128486801463992), 'use_focal_loss': np.int64(1)}


In [12]:
# ----------------------------------------------------------------------------
# 8.1 Interpret Best Hyperparams
# ----------------------------------------------------------------------------
# Decode hyperopt's result. hp.choice entries are indices into the option
# lists from the configuration cell; those lists are reused here rather than
# re-declared, so the decoding cannot drift from SEARCH_SPACE.
# Values are converted to built-in Python types because `best` holds NumPy
# scalars, which would otherwise be pickled into the saved checkpoint and
# make it harder to load safely.
final_params = {
    "learning_rate":    float(best["learning_rate"]),
    "epochs":           EPOCH_OPTIONS[int(best["epochs"])],
    "batch_size":       BATCH_OPTIONS[int(best["batch_size"])],
    "use_focal_loss":   USE_FOCAL_OPTIONS[int(best["use_focal_loss"])],
    "gamma":            float(best["gamma"]),
    "label_smoothing":  float(best["label_smoothing"])
}

if TESTING_FLAG:
    print("Interpreted best hyperparams:\n", final_params)

Interpreted best hyperparams:
 {'learning_rate': np.float64(0.000128486801463992), 'epochs': 4, 'batch_size': 8, 'use_focal_loss': True, 'gamma': np.float64(3.5), 'label_smoothing': np.float64(0.08212135447736467)}


In [13]:
# ----------------------------------------------------------------------------
# 9. Train Final Model
# ----------------------------------------------------------------------------
# Rebuild the model with the decoded best hyperparameters. The architecture
# constants are the same ones used during the search.
best_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_labels=2,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"],
    embedding_matrix=embedding_matrix_tensor,
)
best_model.to(device)

# Same schedule as the search trials (evaluate/save/log every epoch, keep the
# best checkpoint by BEST_MODEL_METRIC), written to a separate output directory.
training_args = TrainingArguments(
    output_dir="./final-enhanced-bilstm-model",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model=BEST_MODEL_METRIC,
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Aggregate dev metrics first, then per-class figures from the raw predictions.
results_dev = trainer.evaluate(encoded_dataset["dev"])
if TESTING_FLAG:
    print("Final Dev Results:", results_dev)

preds_output = trainer.predict(encoded_dataset["dev"])
dev_labels = preds_output.label_ids
dev_preds = np.argmax(preds_output.predictions, axis=1)
if TESTING_FLAG:
    print("\nDetailed Classification Report (Dev):")
    print(classification_report(dev_labels, dev_preds, digits=4))
    

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0422,0.039102,0.800073,0.79919
2,0.0339,0.037912,0.806167,0.805434
3,0.0287,0.039154,0.81051,0.814377
4,0.0246,0.044327,0.814528,0.817415


Final Dev Results: {'eval_loss': 0.04432706534862518, 'eval_f1': 0.8145284908612143, 'eval_accuracy': 0.817414782315221, 'eval_runtime': 13.3147, 'eval_samples_per_second': 445.073, 'eval_steps_per_second': 55.653, 'epoch': 4.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.8608    0.8917    0.8760      4286
           1     0.6878    0.6232    0.6539      1640

    accuracy                         0.8174      5926
   macro avg     0.7743    0.7575    0.7649      5926
weighted avg     0.8129    0.8174    0.8145      5926



In [None]:
#Save the best model

#Prepare the dictionary to save
# torch.save does not create missing parent folders.
os.makedirs(os.path.dirname(BEST_MODEL_PATH) or ".", exist_ok=True)

# Move the weights to CPU (as is already done for the embedding matrix) so the
# checkpoint can be loaded on a machine without a GPU. Entries are replaced in
# the state_dict returned by the model, which keeps its type and metadata.
cpu_state_dict = trainer.model.state_dict()
for param_name, param_tensor in list(cpu_state_dict.items()):
    cpu_state_dict[param_name] = param_tensor.detach().cpu()

#Prepare the dictionary to save
save_dict = {
    "model_state_dict": cpu_state_dict,               # (A) model weights
    "hyperparams": {                                  # (B) essential model config
        "vocab_size": len(tokenizer.get_vocab()),
        "embed_dim": EMBED_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_labels": 2,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "use_attention": USE_ATTENTION,
        # Cast to built-in types: the tuned values may be NumPy scalars, which
        # torch.load rejects in its restricted (weights_only) mode.
        "use_focal_loss": bool(final_params["use_focal_loss"]),
        "gamma": float(final_params["gamma"]),
        "label_smoothing": float(final_params["label_smoothing"])
    },
    "embedding_matrix": embedding_matrix_tensor.cpu()  # (C) embedding weights
}

#Save the entire dictionary to BEST_MODEL_PATH
torch.save(save_dict, BEST_MODEL_PATH)
print(f"Best model + hyperparams + embedding matrix saved to: {BEST_MODEL_PATH}")

Best model + hyperparams + embedding matrix saved to: data\taskB\ED_B_Model.pt


In [23]:
from torch.utils.data import DataLoader

# 1) Drop the raw text columns only if they are still present. Section 5.1
#    already removes them, and remove_columns raises for a missing column, so
#    an unconditional call breaks a top-to-bottom run of the notebook.
test_text_columns = [
    column for column in ("claim", "evidence")
    if column in encoded_dataset["test"].column_names
]
if test_text_columns:
    encoded_dataset["test"] = encoded_dataset["test"].remove_columns(test_text_columns)

# 2) Return torch tensors for the two inputs the model consumes
#    (the test split has no labels).
encoded_dataset["test"].set_format(
    type="torch",
    columns=["input_ids", "attention_mask"]
)

# 3) Build the DataLoader (no shuffling, so predictions stay in row order)
test_loader = DataLoader(encoded_dataset["test"], batch_size=EVAL_BATCH_SIZE)

# 4) Predict
trainer.model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = trainer.model(**batch)
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())

# 5) Save predictions to the configured OUTPUT_PATH instead of a second
#    hard-coded copy of the same path.
test_pred_df = pd.DataFrame({"prediction": all_preds})
test_pred_df.to_csv(OUTPUT_PATH, index=False)
print("Saved predictions")


Saved predictions


In [26]:
# ----------------------------------------------------------------------------
# 10.x. Inference on Dev Set (Using Freshly Trained Model)
# ----------------------------------------------------------------------------

import pandas as pd
import torch
from torch.utils.data import DataLoader

# 1) Report the dev columns as they are on entry
print("Dev dataset columns before any changes:", encoded_dataset["dev"].column_names)

# 2) The model's forward() takes `labels`, so rename `label` unless an
#    earlier cell has already done it.
dev_columns = encoded_dataset["dev"].column_names
if "label" in dev_columns and "labels" not in dev_columns:
    encoded_dataset["dev"] = encoded_dataset["dev"].rename_column("label", "labels")

# 3) Expose, as torch tensors, whichever of these columns exist.
#    token_type_ids is passed along too; the model ignores unknown inputs.
desired_columns = [
    name
    for name in ("input_ids", "attention_mask", "labels", "token_type_ids")
    if name in encoded_dataset["dev"].column_names
]
encoded_dataset["dev"].set_format(type="torch", columns=desired_columns)

print("Dev dataset columns after set_format:", encoded_dataset["dev"].column_names)

# 4) Create the DataLoader
dev_loader = DataLoader(encoded_dataset["dev"], batch_size=8)

# 5) Perform inference with the freshly trained model
trainer.model.eval()
all_dev_preds = []

with torch.no_grad():
    for batch in dev_loader:
        # Move tensors to the model's device; leave anything else untouched
        batch = {
            name: (item.to(device) if isinstance(item, torch.Tensor) else item)
            for name, item in batch.items()
        }
        logits = trainer.model(**batch)["logits"]   # shape: (B, num_labels)
        all_dev_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())

# 6) Save predictions to a CSV
dev_pred_df = pd.DataFrame({"prediction": all_dev_preds})
dev_pred_df.to_csv("dev_predictions.csv", index=False, header=True)
print("Saved dev predictions to dev_predictions.csv")


Dev dataset columns before any changes: ['label', 'input_ids', 'token_type_ids', 'attention_mask']
Dev dataset columns after set_format: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Saved dev predictions to dev_predictions.csv
