In [None]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import Trainer, TrainingArguments

# ----------------------------------------------------------------------------
# Global Variables and Flags
# ----------------------------------------------------------------------------
TESTING_FLAG = True  # If True, print debug info
DOWNLOAD_FLAG = True # If True, handle NLTK data

NLTK_DATA_DIR = "data\\nltk_data"
TRAIN_PATH = "data\\train.csv"
DEV_PATH   = "data\\dev.csv"
TEST_PATH  = "data\\test.csv" 

BEST_MODEL_PATH = "data\\ED_B_Model.pt"
OUTPUT_PATH = "data\\predictions.csv"

AUGMENTED_COPY_CHANCE = 0.15
EPOCH_OPTIONS = [2, 3, 4]
BATCH_OPTIONS  = [4, 8, 16]
USE_FOCAL_OPTIONS = [False, True]

SEARCH_SPACE = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", EPOCH_OPTIONS),
    "batch_size":      hp.choice("batch_size", BATCH_OPTIONS),
    "use_focal_loss":  hp.choice("use_focal_loss", USE_FOCAL_OPTIONS),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

MAX_EVALS = 10  
EVAL_BATCH_SIZE = 8
BEST_MODEL_METRIC = "f1"

GLOVE_PATH = "data\\glove.6B.300d.txt"  
EMBED_DIM = 300         
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3           
USE_ATTENTION = True  

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if TESTING_FLAG:
    print("Using device:", device)


Using device: cuda


In [16]:
# ----------------------------------------------------------------------------
# 1. Fetch NLTK data
# ----------------------------------------------------------------------------
if DOWNLOAD_FLAG:
    nltk.download("wordnet")
    nltk.download("omw-1.4")

    #nltk.data.path.append(NLTK_DATA_DIR)
    #nltk.download("wordnet", download_dir=NLTK_DATA_DIR)
    #nltk.download("omw-1.4", download_dir=NLTK_DATA_DIR)
    #!unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
    #!unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/
    print("Downloaded NLTK data")

Downloaded NLTK data


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# ----------------------------------------------------------------------------
# 2. Load Data
# ----------------------------------------------------------------------------
train_df = pd.read_csv(TRAIN_PATH)
dev_df   = pd.read_csv(DEV_PATH)
test_df  = pd.read_csv(TEST_PATH)

train_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
dev_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
test_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)

if TESTING_FLAG:
    print("Train samples:", len(train_df))
    print("Dev samples:", len(dev_df))
    print("Test samples:", len(test_df))

train_df["label"] = train_df["label"].astype(int)
dev_df["label"]   = dev_df["label"].astype(int)


Train samples: 21508
Dev samples: 5926
Test samples: 4688


In [3]:
# ----------------------------------------------------------------------------
# 3. Data Augmentation (Synonym Replacement)
# ----------------------------------------------------------------------------
random.seed(42)

def synonym_replacement(sentence, n=1):
    words = sentence.split()
    if len(words) < 2:
        return sentence
    indices_to_replace = random.sample(range(len(words)), k=min(n, len(words)))
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        lemmas = syns[0].lemma_names()
        lemmas = [l for l in lemmas if l.lower() != word.lower()]
        if len(lemmas) == 0:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)

def augment_dataframe(df):
    augmented_rows = []
    for _, row in df.iterrows():
        augmented_rows.append(row.to_dict())
        if random.random() < AUGMENTED_COPY_CHANCE:
            new_row = row.copy()
            if random.random() < 0.5:
                new_row["claim"] = synonym_replacement(row["claim"], n=1)
            else:
                new_row["evidence"] = synonym_replacement(row["evidence"], n=1)
            augmented_rows.append(new_row.to_dict())
    return pd.DataFrame(augmented_rows)

augmented_train_df = augment_dataframe(train_df)
if TESTING_FLAG:
    print("Original train size:", len(train_df),
          "=> After augmentation:", len(augmented_train_df))

train_df = augmented_train_df.reset_index(drop=True)

Original train size: 21508 => After augmentation: 24819


In [4]:
# ----------------------------------------------------------------------------
# 4. Create Hugging Face Datasets
# ----------------------------------------------------------------------------
train_dataset = Dataset.from_pandas(train_df)
dev_dataset   = Dataset.from_pandas(dev_df)
test_dataset  = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "dev":   dev_dataset,
    "test":  test_dataset
})

if TESTING_FLAG:
    print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence'],
        num_rows: 4688
    })
})


In [5]:
# ----------------------------------------------------------------------------
# 5. Tokenization
# ----------------------------------------------------------------------------

TOKENIZER_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

def tokenize_function(examples):
    return tokenizer(
        examples["claim"],
        examples["evidence"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

encoded_dataset = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

In [6]:
# ----------------------------------------------------------------------------
# 5.1. Format for PyTorch
# ----------------------------------------------------------------------------
encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
encoded_dataset["dev"]   = encoded_dataset["dev"].rename_column("label", "labels")

encoded_dataset["train"] = encoded_dataset["train"].remove_columns(["claim", "evidence"])
encoded_dataset["dev"]   = encoded_dataset["dev"].remove_columns(["claim", "evidence"])
encoded_dataset["test"]  = encoded_dataset["test"].remove_columns(["claim", "evidence"])

encoded_dataset["train"].set_format("torch")
encoded_dataset["dev"].set_format("torch")
encoded_dataset["test"].set_format("torch")

In [6]:
# ----------------------------------------------------------------------------
# 6. Load GloVe embeddings & Build an Embedding Matrix
# ----------------------------------------------------------------------------
def load_glove_embeddings(glove_file, vocab, embedding_dim=300):
    """
    Load GloVe 300d vectors and align them with the given vocab.
    vocab: a dict {token_string: token_index}
    Returns: a numpy array [vocab_size, embedding_dim]
    """
    embedding_matrix = np.random.normal(
        scale=0.1, 
        size=(len(vocab), embedding_dim)
    ).astype(np.float32)
    found = 0

    if not os.path.isfile(glove_file):
        print(f"GloVe file not found at {glove_file}, using random init.")
        return embedding_matrix, found

    print(f"Loading GloVe from {glove_file}...")
    glove_dict = {}
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            glove_dict[word] = coefs

    # For each token in the BERT-based vocab, see if it matches a GloVe word
    for token, idx in vocab.items():
        normalized = token.replace("##", "").lower()
        if normalized in glove_dict:
            embedding_matrix[idx] = glove_dict[normalized]
            found += 1

    print(f"Initialized embedding_matrix with {found} GloVe tokens matched out of {len(vocab)}")
    return embedding_matrix, found

vocab_dict = tokenizer.get_vocab()  # {token_str: token_id}
embedding_matrix_np, glove_found = load_glove_embeddings(GLOVE_PATH, vocab_dict, EMBED_DIM)
embedding_matrix_tensor = torch.tensor(embedding_matrix_np)

Loading GloVe from data\glove.6B.300d.txt...
Initialized embedding_matrix with 26695 GloVe tokens matched out of 30522


In [7]:
# ----------------------------------------------------------------------------
# 7. Custom BiLSTM with optional attention
# ----------------------------------------------------------------------------
class SimpleAttention(nn.Module):
    """
    A simple additive attention: 
    score = tanh(W1*H + W2*h_context), 
    then softmax over time steps, 
    output = sum of weighted hidden states
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.W = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
        self.v = nn.Linear(2 * hidden_dim, 1, bias=False)

    def forward(self, lstm_outputs, mask=None):
        """
        lstm_outputs: (B, L, 2*hidden_dim)
        mask: (B, L) if needed (1 for real tokens, 0 for pad)
        Returns: (B, 2*hidden_dim) - the weighted sum
        """
        # Score calculation
        score = torch.tanh(self.W(lstm_outputs))  # (B, L, 2H)
        score = self.v(score).squeeze(-1)         # (B, L)
        
        if mask is not None:
            score = score.masked_fill(mask == 0, -1e9)

        attn_weights = F.softmax(score, dim=-1)   # (B, L)

        # Weighted sum
        attn_weights = attn_weights.unsqueeze(1)  # (B, 1, L)
        context = torch.bmm(attn_weights, lstm_outputs)  # (B, 1, 2H)
        context = context.squeeze(1)              # (B, 2H)

        return context


class CustomBiLSTMModel(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embed_dim=300, 
                 hidden_dim=256, 
                 num_labels=2, 
                 num_layers=2,
                 dropout=0.3,
                 use_attention=True,
                 use_focal_loss=False, 
                 gamma=2.0, 
                 label_smoothing=0.0,
                 embedding_matrix=None):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_labels = num_labels
        
        self.use_attention = use_attention
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing

        # Embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            with torch.no_grad():
                self.embedding.weight.copy_(embedding_matrix)

        # BiLSTM with multiple layers & dropout
        self.lstm = nn.LSTM(
            input_size=self.embed_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=self.dropout,
            batch_first=True,
            bidirectional=True
        )

        # Optional attention
        if self.use_attention:
            self.attn = SimpleAttention(self.hidden_dim)

        # Classification head
        self.classifier = nn.Linear(2 * self.hidden_dim, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # Embeddings
        embeds = self.embedding(input_ids)
        # zero out padding
        if attention_mask is not None:
            expand_mask = attention_mask.unsqueeze(-1).float()
            embeds = embeds * expand_mask
        
        # LSTM
        lstm_outputs, (h, c) = self.lstm(embeds)
        # shape of lstm_outputs: (B, L, 2H)

        if self.use_attention:
            # Weighted sum of outputs
            context = self.attn(lstm_outputs, mask=attention_mask)
        else:
            # h shape: (num_layers*2, B, H)
            h_forward = h[-2]  # last layer's forward state
            h_backward = h[-1] # last layer's backward state
            context = torch.cat((h_forward, h_backward), dim=-1)  # (B, 2H)

        logits = self.classifier(context)

        # Loss
        loss = None
        if labels is not None:
            if self.use_focal_loss:
                loss = self.focal_loss(logits, labels, self.gamma)
            else:
                loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)

        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        ce = nn.CrossEntropyLoss(reduction='none')(logits, targets)
        pt = torch.exp(-ce)
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        if smoothing == 0.0:
            return nn.CrossEntropyLoss()(logits, targets)
        log_probs = F.log_softmax(logits, dim=-1)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(smoothing / (n_class - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [8]:
# ----------------------------------------------------------------------------
# 8. Build Trainer & Hyperopt
# ----------------------------------------------------------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}

def make_trainer(model, train_ds, dev_ds, space):
    learning_rate = space["learning_rate"]
    epochs        = int(space["epochs"])
    batch_size    = int(space["batch_size"])

    training_args = TrainingArguments(
        output_dir="./enhanced-bilstm-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        load_best_model_at_end=True,
        metric_for_best_model=BEST_MODEL_METRIC,
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        logging_steps=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    return trainer

def objective(space):
    use_focal_loss = space["use_focal_loss"]
    gamma = space["gamma"]
    label_smoothing = space["label_smoothing"]

    # Build model
    model = CustomBiLSTMModel(
        vocab_size=len(tokenizer.get_vocab()),
        embed_dim=EMBED_DIM,
        hidden_dim=HIDDEN_DIM,
        num_labels=2,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        use_attention=USE_ATTENTION,
        use_focal_loss=use_focal_loss,
        gamma=gamma,
        label_smoothing=label_smoothing,
        embedding_matrix=embedding_matrix_tensor
    )
    model.to(device)

    trainer = make_trainer(model, encoded_dataset["train"], encoded_dataset["dev"], space)
    trainer.train()
    metrics = trainer.evaluate(encoded_dataset["dev"])
    f1 = metrics["eval_f1"]

    if TESTING_FLAG:
        print(f"[Hyperopt] params={space} => F1={f1:.4f}")
    return {"loss": -f1, "status": STATUS_OK}

trials = Trials()
best = fmin(
    fn=objective,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials
)

if TESTING_FLAG:
    print("\nHyperopt best param indices:", best)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5723,0.548748,0.725859,0.756666
2,0.529,0.533937,0.756262,0.770503
3,0.5154,0.531805,0.759093,0.774553
4,0.5086,0.527692,0.766745,0.776578


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 1.0, 'label_smoothing': 0.06681121703375936, 'learning_rate': 2.26853860592903e-05, 'use_focal_loss': False} => F1=0.7667
  3%|▎         | 1/30 [06:47<3:16:49, 407.23s/trial, best loss: -0.7667449412571927]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5884,0.574721,0.807303,0.80594
2,0.5444,0.563702,0.813054,0.82079


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.15302443824756262, 'learning_rate': 0.0003175164324673178, 'use_focal_loss': False} => F1=0.8131
  7%|▋         | 2/30 [10:45<2:23:34, 307.68s/trial, best loss: -0.813054053785342] 

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0329,0.031084,0.744076,0.765947
2,0.0294,0.029849,0.768259,0.777759
3,0.028,0.029595,0.766371,0.781809
4,0.0272,0.029438,0.776099,0.784003


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 4.0, 'label_smoothing': 0.08629548205814437, 'learning_rate': 2.7229621451696198e-05, 'use_focal_loss': True} => F1=0.7761
 10%|█         | 3/30 [17:03<2:32:55, 339.85s/trial, best loss: -0.813054053785342]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5431,0.534424,0.766883,0.781303
2,0.5086,0.518958,0.783978,0.794803
3,0.4935,0.530159,0.785153,0.796153


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.05624491659909645, 'learning_rate': 3.692355129532768e-05, 'use_focal_loss': False} => F1=0.7852
 13%|█▎        | 4/30 [37:01<4:54:08, 678.79s/trial, best loss: -0.813054053785342]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5752,0.563561,0.742869,0.763415
2,0.5474,0.553198,0.758994,0.770165
3,0.5378,0.559933,0.761026,0.776747


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.08316079296536183, 'learning_rate': 1.617464213164228e-05, 'use_focal_loss': False} => F1=0.7610
 17%|█▋        | 5/30 [1:07:56<7:39:31, 1102.87s/trial, best loss: -0.813054053785342]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6142,0.603588,0.802042,0.808134
2,0.5856,0.595846,0.812853,0.820621
3,0.569,0.598522,0.816363,0.821465


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.19409398476588627, 'learning_rate': 0.0001286741755187964, 'use_focal_loss': False} => F1=0.8164
 20%|██        | 6/30 [1:44:29<9:49:26, 1473.61s/trial, best loss: -0.8163627008008161]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1296,0.123541,0.762632,0.774047
2,0.1168,0.115615,0.789968,0.797671
3,0.1109,0.121139,0.790363,0.799528


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.12234191285484404, 'learning_rate': 2.8460418852571993e-05, 'use_focal_loss': True} => F1=0.7904
 23%|██▎       | 7/30 [2:01:57<8:31:28, 1334.29s/trial, best loss: -0.8163627008008161]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5532,0.537159,0.753606,0.76949
2,0.5198,0.52822,0.773611,0.780796
3,0.5085,0.542757,0.765226,0.779109


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.5, 'label_smoothing': 0.05864078807441606, 'learning_rate': 2.4503233661545793e-05, 'use_focal_loss': False} => F1=0.7736
 27%|██▋       | 8/30 [2:18:29<7:29:20, 1225.46s/trial, best loss: -0.8163627008008161]

  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0304,0.027922,0.795607,0.796153
2,0.0246,0.027044,0.807738,0.806615
3,0.0212,0.027703,0.811973,0.816065


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.07819207728669558, 'learning_rate': 0.000163256156163078, 'use_focal_loss': True} => F1=0.8120
 30%|███       | 9/30 [2:22:31<5:21:19, 918.08s/trial, best loss: -0.8163627008008161] 

  trainer = Trainer(



Epoch,Training Loss,Validation Loss


 30%|███       | 9/30 [2:23:27<5:34:44, 956.40s/trial, best loss: -0.8163627008008161]


KeyboardInterrupt: 

In [9]:
# ----------------------------------------------------------------------------
# 8.1 Interpret Best Hyperparams
# ----------------------------------------------------------------------------
EPOCH_OPTIONS = [2, 3, 4]
BATCH_OPTIONS = [4, 8, 16]
USE_FOCAL_OPTIONS = [False, True]

final_params = {
    "learning_rate":    best["learning_rate"],
    "epochs":           EPOCH_OPTIONS[ best["epochs"] ],
    "batch_size":       BATCH_OPTIONS[ best["batch_size"] ],
    "use_focal_loss":   USE_FOCAL_OPTIONS[ best["use_focal_loss"] ],
    "gamma":            best["gamma"],
    "label_smoothing":  best["label_smoothing"]
}

if TESTING_FLAG:
    print("Interpreted best hyperparams:\n", final_params)

NameError: name 'best' is not defined

In [None]:
# ----------------------------------------------------------------------------
# 9. Train Final Model
# ----------------------------------------------------------------------------
best_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"],
    embedding_matrix=embedding_matrix_tensor
)
best_model.to(device)

training_args = TrainingArguments(
    output_dir="./final-enhanced-bilstm-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model=BEST_MODEL_METRIC,
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",
    logging_steps=1
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results_dev = trainer.evaluate(encoded_dataset["dev"])
if TESTING_FLAG:
    print("Final Dev Results:", results_dev)

preds_output = trainer.predict(encoded_dataset["dev"])
dev_preds = np.argmax(preds_output.predictions, axis=1)
dev_labels = preds_output.label_ids
if TESTING_FLAG:
    print("\nDetailed Classification Report (Dev):")
    print(classification_report(dev_labels, dev_preds, digits=4))
    

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2284,0.212856,0.801472,0.798515
2,0.1739,0.196101,0.819288,0.819271
3,0.139,0.208608,0.823521,0.827202


Final Dev Results: {'eval_loss': 0.20860803127288818, 'eval_f1': 0.8235211803073061, 'eval_accuracy': 0.8272021599730003, 'eval_runtime': 13.1419, 'eval_samples_per_second': 450.926, 'eval_steps_per_second': 56.385, 'epoch': 3.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.8634    0.9041    0.8833      4286
           1     0.7142    0.6262    0.6673      1640

    accuracy                         0.8272      5926
   macro avg     0.7888    0.7652    0.7753      5926
weighted avg     0.8221    0.8272    0.8235      5926



In [None]:
#Save the best model

# 3. Prepare the dictionary to save
save_dict = {
    "model_state_dict": trainer.model.state_dict(),   # (A) model weights
    "hyperparams": {                                  # (B) essential model config
        "vocab_size": len(tokenizer.get_vocab()),
        "embed_dim": EMBED_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_labels": 2,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "use_attention": USE_ATTENTION,
        "use_focal_loss": final_params["use_focal_loss"],
        "gamma": final_params["gamma"],
        "label_smoothing": final_params["label_smoothing"]
    },
    "embedding_matrix": embedding_matrix_tensor.cpu()  # (C) embedding weights
}

# 4. Save the entire dictionary to BEST_MODEL_PATH
torch.save(save_dict, BEST_MODEL_PATH)
print(f"Best model + hyperparams + embedding matrix saved to: {BEST_MODEL_PATH}")

Best model saved to data\ED_B_Model.pt


In [12]:
#Loading the best model

loaded_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    num_labels=2,
    use_focal_loss=True, # final_params["use_focal_loss"],
    gamma=np.float64(1.0), # final_params["gamma"],
    label_smoothing= np.float64(0.03442125718345023), #final_params["label_smoothing"] ,
    embedding_matrix=embedding_matrix_tensor
)

loaded_model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
loaded_model.to(device)
loaded_model.eval()
print("Successfully loaded state_dict into loaded_model.")

trainer.model = loaded_model


Successfully loaded state_dict into loaded_model.


NameError: name 'trainer' is not defined

In [None]:
# ----------------------------------------------------------------------------
# 10. Inference on Test Set
# ----------------------------------------------------------------------------
from torch.utils.data import DataLoader

#You need to build the encoded dataset again, if you want to use the same tokeniser.
test_loader = DataLoader(encoded_dataset["test"], batch_size=8)

all_preds = []
loaded_model.eval()

for batch in test_loader:
    for key in batch:
        batch[key] = batch[key].to(device)

    with torch.no_grad():
        outputs = loaded_model(**batch)       
    logits = outputs["logits"]               
    preds = torch.argmax(logits, dim=1)
    all_preds.extend(preds.cpu().tolist())

# Convert to a DataFrame and save
test_pred_df = pd.DataFrame({"prediction": all_preds})
test_pred_df.to_csv(OUTPUT_PATH, index=False, header=True)
print("Saved predictions")


Saved predictions


In [None]:
# ----------------------------------------------------------------------------
# Inference on Dev Set (codebench debugging)
# ----------------------------------------------------------------------------
from torch.utils.data import DataLoader

dev_loader = DataLoader(encoded_dataset["dev"], batch_size=8)

all_dev_preds = []
loaded_model.eval()

for batch in dev_loader:
    for key in batch:
        batch[key] = batch[key].to(device)

    with torch.no_grad():
        outputs = loaded_model(**batch) 
    logits = outputs["logits"]          
    preds = torch.argmax(logits, dim=1) 
    all_dev_preds.extend(preds.cpu().tolist())


dev_pred_df = pd.DataFrame({"prediction": all_dev_preds})
dev_pred_df.to_csv("dev_predictions.csv", index=False, header=True)

print("Saved dev predictions to dev_predictions.csv")


Saved dev predictions to dev_predictions.csv
