In [23]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import Trainer, TrainingArguments

# ----------------------------------------------------------------------------
# Global Variables and Flags
# ----------------------------------------------------------------------------
TESTING_FLAG = True  # If True, print debug info
DOWNLOAD_FLAG = True # If True, handle NLTK data

# Every data artefact lives under DATA_DIR. Paths are assembled with
# os.path.join so the notebook also runs on POSIX hosts: a hard-coded
# "data\\train.csv" is one (non-existent) file name on Linux/macOS.
DATA_DIR = "data"
NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
DEV_PATH   = os.path.join(DATA_DIR, "dev.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

BEST_MODEL_PATH = os.path.join(DATA_DIR, "ED_B_Model.pt")
OUTPUT_PATH = os.path.join(DATA_DIR, "predictions.csv")

# Probability that a training row receives one synonym-perturbed copy.
AUGMENTED_COPY_CHANCE = 0.15

# Discrete options for the hp.choice dimensions below. fmin reports the
# *index* of the selected option, so these lists are also needed to decode
# the best trial afterwards.
EPOCH_OPTIONS = [2, 3, 4]
BATCH_OPTIONS  = [4, 8, 16]
USE_FOCAL_OPTIONS = [False, True]

# Hyperopt search space. The model uses "gamma" only when use_focal_loss is
# True and "label_smoothing" only when it is False; both are sampled for
# every trial regardless.
SEARCH_SPACE = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", EPOCH_OPTIONS),
    "batch_size":      hp.choice("batch_size", BATCH_OPTIONS),
    "use_focal_loss":  hp.choice("use_focal_loss", USE_FOCAL_OPTIONS),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

MAX_EVALS = 30             # number of hyperopt trials
EVAL_BATCH_SIZE = 8        # per-device batch size used for evaluation
BEST_MODEL_METRIC = "f1"   # key of compute_metrics used to pick the best checkpoint

# BiLSTM architecture settings
GLOVE_PATH = os.path.join(DATA_DIR, "glove.6B.300d.txt")
EMBED_DIM = 300
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
USE_ATTENTION = True

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if TESTING_FLAG:
    print("Using device:", device)

Using device: cuda


In [16]:
# ----------------------------------------------------------------------------
# 1. Fetch NLTK data
# ----------------------------------------------------------------------------
if DOWNLOAD_FLAG:
    # Corpora needed by the synonym-replacement augmentation.
    # They are fetched into NLTK's default data directory; an earlier variant
    # of this cell downloaded into NLTK_DATA_DIR and unzipped the archives
    # manually under /kaggle/working.
    for corpus_name in ("wordnet", "omw-1.4"):
        nltk.download(corpus_name)
    print("Downloaded NLTK data")

Downloaded NLTK data


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
# ----------------------------------------------------------------------------
# 2. Load Data
# ----------------------------------------------------------------------------
# Read the three splits and normalise the text column names to lower case.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path).rename(columns={"Claim": "claim", "Evidence": "evidence"})
    for csv_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)

if TESTING_FLAG:
    print("Train samples:", len(train_df))
    print("Dev samples:", len(dev_df))
    print("Test samples:", len(test_df))

# Only the train and dev frames are cast here; the recorded dataset summary
# lists no label column for the test split.
train_df["label"] = train_df["label"].astype(int)
dev_df["label"]   = dev_df["label"].astype(int)


Train samples: 21508
Dev samples: 5926
Test samples: 4688


In [4]:
# ----------------------------------------------------------------------------
# 3. Data Augmentation (Synonym Replacement)
# ----------------------------------------------------------------------------
random.seed(42)

def synonym_replacement(sentence, n=1):
    """
    Replace up to `n` randomly chosen words of `sentence` with a WordNet synonym.

    sentence: whitespace-separated text
    n: maximum number of word positions to try; values <= 0 replace nothing
    Returns: the perturbed sentence re-joined with single spaces. Sentences
             with fewer than two words are returned unchanged.

    A chosen position is left as it is when WordNet has no synset for the
    word or when the first synset offers no lemma other than the word itself.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence
    # Clamp to [0, len(words)]: random.sample rejects a negative sample size.
    k = max(0, min(n, len(words)))
    indices_to_replace = random.sample(range(len(words)), k=k)
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        # Only the first synset is consulted. WordNet writes multi-word lemmas
        # with underscores ("domestic_dog"); turn them back into spaces so the
        # augmented text looks like ordinary text to the tokenizer.
        lemmas = [
            lemma.replace("_", " ")
            for lemma in syns[0].lemma_names()
            if lemma.lower() != word.lower()
        ]
        if not lemmas:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)

def augment_dataframe(df):
    """
    Return a new DataFrame holding every row of `df` and, for a random subset,
    one extra copy in which either the claim or the evidence had one word
    passed through synonym_replacement.

    Each row is copied with probability AUGMENTED_COPY_CHANCE; the perturbed
    column is then picked by a fair coin flip. The copy is inserted directly
    after its source row.
    """
    records = []
    for _, sample in df.iterrows():
        records.append(sample.to_dict())

        # Most rows are kept as they are.
        if random.random() >= AUGMENTED_COPY_CHANCE:
            continue

        target_column = "claim" if random.random() < 0.5 else "evidence"
        variant = sample.copy()
        variant[target_column] = synonym_replacement(sample[target_column], n=1)
        records.append(variant.to_dict())
    return pd.DataFrame(records)

# Build the augmented training frame: original rows plus the perturbed copies.
augmented_train_df = augment_dataframe(train_df)
if TESTING_FLAG:
    print("Original train size:", len(train_df),
          "=> After augmentation:", len(augmented_train_df))

# train_df is rebound to the augmented frame, so running this cell twice
# without re-running the data-loading cell augments already-augmented rows.
# NOTE(review): in the recorded run this cell has execution count 4 while the
# loading cell has 17, and the dataset summary printed later reports 21508
# training rows, so the saved results look like they were trained on
# un-augmented data. Confirm by running the notebook top to bottom.
train_df = augmented_train_df.reset_index(drop=True)

Original train size: 21508 => After augmentation: 24819


In [18]:
# ----------------------------------------------------------------------------
# 4. Create Hugging Face Datasets
# ----------------------------------------------------------------------------
# Wrap each pandas frame in a Hugging Face Dataset ...
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# ... and group the splits so they can be tokenized with a single map() call.
dataset_dict = DatasetDict(
    {"train": train_dataset, "dev": dev_dataset, "test": test_dataset}
)

if TESTING_FLAG:
    print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 21508
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence'],
        num_rows: 4688
    })
})


In [19]:
# ----------------------------------------------------------------------------
# 5. Tokenization
# ----------------------------------------------------------------------------

# The BERT tokenizer supplies the word-piece vocabulary and token ids;
# the classifier itself is the custom BiLSTM defined further down.
TOKENIZER_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

def tokenize_function(examples):
    """
    Encode a batch of (claim, evidence) pairs as one sequence pair each.

    examples: batch mapping with "claim" and "evidence" entries
    Returns: the tokenizer output, truncated/padded to exactly 128 tokens
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["claim"], examples["evidence"], **encoding_options)

encoded_dataset = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/21508 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

In [20]:
# ----------------------------------------------------------------------------
# 5.1. Format for PyTorch
# ----------------------------------------------------------------------------
# Prepare every split for PyTorch. Each step is guarded by a column check so
# the cell can be re-run safely: a second unguarded rename/remove would raise
# because the columns are already gone.
for split_name in ("train", "dev", "test"):
    split = encoded_dataset[split_name]

    # The labelled splits expose their target as "labels", the keyword that
    # the model's forward() accepts.
    if split_name != "test" and "label" in split.column_names:
        split = split.rename_column("label", "labels")

    # The raw text is no longer needed once it has been tokenized.
    leftover_text_columns = [
        column for column in ("claim", "evidence") if column in split.column_names
    ]
    if leftover_text_columns:
        split = split.remove_columns(leftover_text_columns)

    split.set_format("torch")
    encoded_dataset[split_name] = split

In [21]:
# ----------------------------------------------------------------------------
# 6. Load GloVe embeddings & Build an Embedding Matrix
# ----------------------------------------------------------------------------
def load_glove_embeddings(glove_file, vocab, embedding_dim=300):
    """
    Build an embedding matrix for `vocab`, filled with GloVe vectors where possible.

    glove_file: path to a GloVe text file (one "word v1 ... vN" entry per line)
    vocab: a dict {token_string: token_index}
    embedding_dim: expected vector size; lines with a different number of
                   fields are skipped
    Returns: (embedding_matrix, found)
        embedding_matrix: float32 numpy array [len(vocab), embedding_dim];
                          rows without a GloVe match keep their random
                          normal(0, 0.1) initialisation
        found: number of vocab tokens that received a GloVe vector

    Tokens are matched after removing "##" and lower-casing, so several vocab
    entries can share one GloVe word. The file is streamed and only vectors of
    words present in the vocab are parsed, so the full GloVe table is never
    held in memory.
    """
    embedding_matrix = np.random.normal(
        scale=0.1,
        size=(len(vocab), embedding_dim)
    ).astype(np.float32)
    found = 0

    if not os.path.isfile(glove_file):
        print(f"GloVe file not found at {glove_file}, using random init.")
        return embedding_matrix, found

    # normalised token text -> every vocab row that should receive its vector
    rows_by_word = {}
    for token, idx in vocab.items():
        normalized = token.replace("##", "").lower()
        rows_by_word.setdefault(normalized, []).append(idx)

    print(f"Loading GloVe from {glove_file}...")
    matched_words = set()
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            rows = rows_by_word.get(values[0])
            if rows is None:
                # Word is not in the vocab: skip it without parsing the floats.
                continue
            # A word listed twice in the file ends up with its last vector.
            embedding_matrix[rows] = np.asarray(values[1:], dtype="float32")
            matched_words.add(values[0])

    # Count tokens (not GloVe words): each matched word may fill several rows.
    found = sum(len(rows_by_word[word]) for word in matched_words)

    print(f"Initialized embedding_matrix with {found} GloVe tokens matched out of {len(vocab)}")
    return embedding_matrix, found

# {token_str: token_id} mapping of the tokenizer; its ids index the matrix rows.
vocab_dict = tokenizer.get_vocab()
embedding_matrix_np, glove_found = load_glove_embeddings(
    glove_file=GLOVE_PATH,
    vocab=vocab_dict,
    embedding_dim=EMBED_DIM,
)
# Tensor copy that is later handed to the model constructor.
embedding_matrix_tensor = torch.tensor(embedding_matrix_np)

Loading GloVe from data\glove.6B.300d.txt...
Initialized embedding_matrix with 26695 GloVe tokens matched out of 30522


In [9]:
# ----------------------------------------------------------------------------
# 7. Custom BiLSTM with optional attention
# ----------------------------------------------------------------------------
class SimpleAttention(nn.Module):
    """
    Attention pooling over the time axis of BiLSTM outputs:
    score_t = v^T tanh(W h_t + b),
    weights = softmax of the scores over time steps,
    output  = sum_t weights_t * h_t
    There is no separate query/context vector; every score depends only on
    the hidden state of its own time step.
    """
    def __init__(self, hidden_dim):
        """
        hidden_dim: hidden size of ONE LSTM direction; inputs to forward()
                    must therefore have 2 * hidden_dim features
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        self.W = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
        self.v = nn.Linear(2 * hidden_dim, 1, bias=False)

    def forward(self, lstm_outputs, mask=None):
        """
        lstm_outputs: (B, L, 2*hidden_dim)
        mask: (B, L) if needed (1 for real tokens, 0 for pad)
        Returns: (B, 2*hidden_dim) - the weighted sum
        """
        # Score calculation
        score = torch.tanh(self.W(lstm_outputs))  # (B, L, 2H)
        score = self.v(score).squeeze(-1)         # (B, L)

        if mask is not None:
            # Use the most negative value of the score dtype instead of a
            # fixed -1e9, which cannot be represented in float16. Masked
            # positions still get zero weight after the softmax.
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        attn_weights = F.softmax(score, dim=-1)   # (B, L)

        # Weighted sum: (B, 1, L) x (B, L, 2H) -> (B, 1, 2H) -> (B, 2H)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_outputs).squeeze(1)

        return context


class CustomBiLSTMModel(nn.Module):
    """
    Sequence classifier: embedding -> multi-layer BiLSTM -> (attention pooling
    or final hidden states) -> linear classification head.

    forward() returns a dict {"loss": ..., "logits": ...}. The loss is the
    focal loss when `use_focal_loss` is True, otherwise cross-entropy with
    optional label smoothing, and is None when no labels are passed.
    """
    def __init__(self, 
                 vocab_size, 
                 embed_dim=300, 
                 hidden_dim=256, 
                 num_labels=2, 
                 num_layers=2,
                 dropout=0.3,
                 use_attention=True,
                 use_focal_loss=False, 
                 gamma=2.0, 
                 label_smoothing=0.0,
                 embedding_matrix=None):
        """
        vocab_size: number of rows of the embedding table
        embed_dim: embedding size
        hidden_dim: hidden size of one LSTM direction
        num_labels: number of output classes
        num_layers: number of stacked BiLSTM layers
        dropout: dropout between LSTM layers (has no effect with one layer)
        use_attention: pool LSTM outputs with SimpleAttention instead of
                       using the final hidden states
        use_focal_loss: use focal loss (with `gamma`) instead of label-smoothed
                        cross-entropy (with `label_smoothing`)
        embedding_matrix: optional [vocab_size, embed_dim] tensor/array of
                          pretrained vectors copied into the embedding layer
        Raises: ValueError if embedding_matrix does not have that shape
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_labels = num_labels
        
        self.use_attention = use_attention
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing

        # Embedding layer (index 0 is treated as padding)
        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            pretrained = torch.as_tensor(embedding_matrix)
            expected_shape = (self.vocab_size, self.embed_dim)
            # copy_ broadcasts, so a wrongly shaped matrix could otherwise be
            # accepted silently.
            if tuple(pretrained.shape) != expected_shape:
                raise ValueError(
                    f"embedding_matrix has shape {tuple(pretrained.shape)}, "
                    f"expected {expected_shape}"
                )
            with torch.no_grad():
                self.embedding.weight.copy_(pretrained)
                # The copy overwrites the zero vector nn.Embedding set up for
                # the padding index; restore it so padding contributes nothing
                # even when forward() is called without an attention mask.
                self.embedding.weight[self.embedding.padding_idx].zero_()

        # BiLSTM with multiple layers & dropout.
        # nn.LSTM applies dropout only between layers and warns when it is
        # non-zero for a single layer, so it is disabled in that case.
        self.lstm = nn.LSTM(
            input_size=self.embed_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True
        )

        # Optional attention
        if self.use_attention:
            self.attn = SimpleAttention(self.hidden_dim)

        # Classification head
        self.classifier = nn.Linear(2 * self.hidden_dim, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        input_ids: (B, L) token ids
        attention_mask: optional (B, L), 1 for real tokens and 0 for padding
        labels: optional (B,) class indices; when given, a loss is computed
        **kwargs: any further batch entries are accepted and ignored
        Returns: {"loss": scalar tensor or None, "logits": (B, num_labels)}
        """
        # Embeddings
        embeds = self.embedding(input_ids)
        # zero out padding
        if attention_mask is not None:
            expand_mask = attention_mask.unsqueeze(-1).float()
            embeds = embeds * expand_mask
        
        # LSTM
        # NOTE(review): sequences are not packed, so the LSTM also steps over
        # the (zeroed) padding positions; the final hidden states used in the
        # no-attention branch include those steps.
        lstm_outputs, (h, c) = self.lstm(embeds)
        # shape of lstm_outputs: (B, L, 2H)

        if self.use_attention:
            # Weighted sum of outputs
            context = self.attn(lstm_outputs, mask=attention_mask)
        else:
            # h shape: (num_layers*2, B, H)
            h_forward = h[-2]  # last layer's forward state
            h_backward = h[-1] # last layer's backward state
            context = torch.cat((h_forward, h_backward), dim=-1)  # (B, 2H)

        logits = self.classifier(context)

        # Loss
        loss = None
        if labels is not None:
            if self.use_focal_loss:
                loss = self.focal_loss(logits, labels, self.gamma)
            else:
                loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)

        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        """
        Mean focal loss: (1 - p_t)**gamma * CE, where p_t is the predicted
        probability of the true class. gamma=0 gives plain cross-entropy.
        """
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # CE = -log(p_t)  =>  p_t = exp(-CE)
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        """
        Mean cross-entropy against a smoothed target distribution: the true
        class gets 1 - smoothing and the remaining mass is split evenly over
        the other classes. smoothing=0.0 gives plain cross-entropy.
        """
        if smoothing == 0.0:
            return F.cross_entropy(logits, targets)
        log_probs = F.log_softmax(logits, dim=-1)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(smoothing / (n_class - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [None]:
# ----------------------------------------------------------------------------
# 8. Build Trainer & Hyperopt
# ----------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """
    Metric callback for the Trainer.

    eval_pred: pair (logits, labels)
    Returns: {"f1": weighted F1, "accuracy": accuracy} of the arg-max predictions
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    weighted_f1 = f1_score(labels, predicted_classes, average="weighted")
    accuracy = accuracy_score(labels, predicted_classes)
    return {"f1": weighted_f1, "accuracy": accuracy}

def make_trainer(model, train_ds, dev_ds, space):
    """
    Build a Trainer for one hyperopt trial.

    model: the network to train
    train_ds / dev_ds: training and evaluation datasets
    space: sampled hyper-parameters; reads "learning_rate", "epochs" and
           "batch_size"
    Returns: a Trainer that evaluates and checkpoints once per epoch and
             reloads the checkpoint with the best BEST_MODEL_METRIC when
             training ends
    """
    learning_rate = space["learning_rate"]
    epochs        = int(space["epochs"])
    batch_size    = int(space["batch_size"])

    training_args = TrainingArguments(
        output_dir="./enhanced-bilstm-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        load_best_model_at_end=True,
        metric_for_best_model=BEST_MODEL_METRIC,
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        logging_steps=1
    )

    # processing_class replaces the deprecated `tokenizer` keyword and matches
    # the Trainer built for the final model later in this notebook.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )
    return trainer

def objective(space):
    """
    Hyperopt objective: train a fresh model with the sampled hyper-parameters
    and score it on the dev split.

    space: dict sampled from SEARCH_SPACE
    Returns: {"loss": -F1, "status": STATUS_OK}; the F1 is negated because
             fmin minimises the loss
    """
    # A new model per trial, always starting from the same embedding matrix.
    model = CustomBiLSTMModel(
        vocab_size=len(tokenizer.get_vocab()),
        embed_dim=EMBED_DIM,
        hidden_dim=HIDDEN_DIM,
        num_labels=2,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        use_attention=USE_ATTENTION,
        use_focal_loss=space["use_focal_loss"],
        gamma=space["gamma"],
        label_smoothing=space["label_smoothing"],
        embedding_matrix=embedding_matrix_tensor,
    )
    model.to(device)

    trainer = make_trainer(
        model,
        encoded_dataset["train"],
        encoded_dataset["dev"],
        space,
    )
    trainer.train()

    # make_trainer enables load_best_model_at_end, so this scores the
    # best checkpoint of the trial.
    f1 = trainer.evaluate(encoded_dataset["dev"])["eval_f1"]

    if TESTING_FLAG:
        print(f"[Hyperopt] params={space} => F1={f1:.4f}")
    return {"loss": -f1, "status": STATUS_OK}

trials = Trials()
# Seed the TPE sampler so the sequence of sampled hyper-parameters is the same
# on every run.
# NOTE(review): assumes a hyperopt release whose `rstate` expects a
# numpy.random.Generator (0.2.7+); older releases take
# np.random.RandomState(42) instead. Confirm the installed version.
best = fmin(
    fn=objective,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# For hp.choice dimensions `best` holds the index of the chosen option,
# not the option itself.
if TESTING_FLAG:
    print("\nHyperopt best param indices:", best)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5211,0.497177,0.80642,0.807796
2,0.4501,0.484111,0.810339,0.81809


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.06392098697401234, 'learning_rate': 0.00020857096123170536, 'use_focal_loss': False} => F1=0.8103
  3%|▎         | 1/30 [05:35<2:41:55, 335.00s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6266,0.618419,0.751597,0.772865
2,0.6086,0.614701,0.759518,0.778772


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.1954384983310935, 'learning_rate': 5.449434198791054e-05, 'use_focal_loss': False} => F1=0.7595
  7%|▋         | 2/30 [08:25<1:51:17, 238.49s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.572,0.555875,0.739126,0.764597
2,0.5372,0.548552,0.756408,0.773372


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.0, 'label_smoothing': 0.08665120467202699, 'learning_rate': 3.816967738971289e-05, 'use_focal_loss': False} => F1=0.7564
 10%|█         | 3/30 [11:21<1:34:21, 209.67s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0418,0.038687,0.804683,0.808977
2,0.0336,0.039111,0.805381,0.81269


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.14445499145021085, 'learning_rate': 0.00011189466523847213, 'use_focal_loss': True} => F1=0.8054
 13%|█▎        | 4/30 [22:25<2:48:36, 389.11s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5773,0.561319,0.74088,0.762403
2,0.5439,0.550131,0.763829,0.775059
3,0.5332,0.550317,0.762571,0.776578


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.08750774079475261, 'learning_rate': 1.9945242167268704e-05, 'use_focal_loss': False} => F1=0.7638
 17%|█▋        | 5/30 [30:23<2:55:26, 421.07s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5584,0.542411,0.792209,0.80054
2,0.5154,0.540737,0.801167,0.809652


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 2.0, 'label_smoothing': 0.10200875271699716, 'learning_rate': 0.00010992192923440694, 'use_focal_loss': False} => F1=0.8012
 20%|██        | 6/30 [41:24<3:21:04, 502.68s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5721,0.55551,0.748867,0.765609
2,0.538,0.545099,0.766996,0.776578
3,0.5263,0.545449,0.764789,0.778265


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.08542251144659163, 'learning_rate': 2.389183109952775e-05, 'use_focal_loss': False} => F1=0.7670
 23%|██▎       | 7/30 [49:46<3:12:35, 502.41s/trial, best loss: -0.8103394600055884]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5917,0.579953,0.806681,0.806446
2,0.5424,0.580635,0.806213,0.810496
3,0.509,0.589938,0.812103,0.814209
4,0.4866,0.603767,0.810095,0.812184


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.0, 'label_smoothing': 0.16257473326682426, 'learning_rate': 0.0004932093839685862, 'use_focal_loss': False} => F1=0.8121
 27%|██▋       | 8/30 [1:00:28<3:20:31, 546.90s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0907,0.086409,0.742837,0.765103
2,0.0823,0.086207,0.757095,0.77354


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.04753131952028109, 'learning_rate': 2.5585753375851585e-05, 'use_focal_loss': True} => F1=0.7571
 30%|███       | 9/30 [1:06:06<2:48:31, 481.50s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4767,0.448695,0.802451,0.802902
2,0.3805,0.4455,0.804147,0.816234
3,0.321,0.475751,0.810611,0.814884


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 3.5, 'label_smoothing': 0.01954907009651652, 'learning_rate': 0.0002614238971279913, 'use_focal_loss': False} => F1=0.8106
 33%|███▎      | 10/30 [1:14:19<2:41:40, 485.04s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0453,0.042784,0.756166,0.77489
2,0.04,0.041055,0.778161,0.784509
3,0.0375,0.040834,0.783436,0.791596


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 3.5, 'label_smoothing': 0.1831003132753092, 'learning_rate': 4.7278271760243226e-05, 'use_focal_loss': True} => F1=0.7834
 37%|███▋      | 11/30 [1:19:02<2:14:02, 423.31s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5383,0.526108,0.78022,0.796321
2,0.4912,0.514374,0.792592,0.803409


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 5.0, 'label_smoothing': 0.05446724252893081, 'learning_rate': 6.446673760898945e-05, 'use_focal_loss': False} => F1=0.7926
 40%|████      | 12/30 [1:30:37<2:31:47, 505.97s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0419,0.03867,0.804083,0.808302
2,0.0337,0.039162,0.804778,0.812184


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.17985286002590628, 'learning_rate': 0.00010851171308179183, 'use_focal_loss': True} => F1=0.8048
 43%|████▎     | 13/30 [1:42:19<2:40:12, 565.47s/trial, best loss: -0.8121026714955556]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5484,0.541276,0.798044,0.801384
2,0.4702,0.530262,0.804229,0.817921
3,0.4208,0.546795,0.812456,0.815052


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.10021363369176772, 'learning_rate': 0.0004959541400784636, 'use_focal_loss': False} => F1=0.8125
 47%|████▋     | 14/30 [1:50:36<2:25:17, 544.85s/trial, best loss: -0.8124559041448337]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1158,0.111396,0.795381,0.790584
2,0.0868,0.099607,0.817505,0.818259
3,0.0673,0.11056,0.819872,0.822308


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.001596788141987604, 'learning_rate': 0.00030544744169664826, 'use_focal_loss': True} => F1=0.8199
 50%|█████     | 15/30 [1:55:16<1:56:13, 464.93s/trial, best loss: -0.8198722747998499]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5613,0.528412,0.690397,0.741141
2,0.5053,0.514586,0.720229,0.747216


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.01675165929367779, 'learning_rate': 1.4595040124610039e-05, 'use_focal_loss': False} => F1=0.7202
 53%|█████▎    | 16/30 [1:58:23<1:28:56, 381.15s/trial, best loss: -0.8198722747998499]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0896,0.084154,0.768845,0.780459
2,0.0791,0.084837,0.782557,0.793453


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.015217215887997538, 'learning_rate': 4.243887089374806e-05, 'use_focal_loss': True} => F1=0.7826
 57%|█████▋    | 17/30 [2:10:08<1:43:40, 478.48s/trial, best loss: -0.8198722747998499]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0946,0.088765,0.723921,0.753291
2,0.0842,0.085047,0.756561,0.769659
3,0.0809,0.085096,0.758464,0.775903
4,0.0792,0.083974,0.766491,0.776578


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 2.5, 'label_smoothing': 0.1936092761633049, 'learning_rate': 1.900517860553418e-05, 'use_focal_loss': True} => F1=0.7665
 60%|██████    | 18/30 [2:16:20<1:29:18, 446.57s/trial, best loss: -0.8198722747998499]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5316,0.510663,0.800432,0.802565
2,0.4512,0.492532,0.81557,0.823321
3,0.3965,0.51543,0.811854,0.814884


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.0757258409426292, 'learning_rate': 0.00044881056090763187, 'use_focal_loss': False} => F1=0.8156
 63%|██████▎   | 19/30 [2:21:02<1:12:48, 397.11s/trial, best loss: -0.8198722747998499]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0492,0.046873,0.69252,0.739116
2,0.0448,0.045643,0.720338,0.749072


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.14418503150233394, 'learning_rate': 1.3587175310198305e-05, 'use_focal_loss': True} => F1=0.7203
 67%|██████▋   | 20/30 [2:24:14<55:56, 335.68s/trial, best loss: -0.8198722747998499]  

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2274,0.210252,0.805722,0.803409
2,0.1693,0.195307,0.818473,0.821633
3,0.1298,0.214985,0.820939,0.824333


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.0010807772937372617, 'learning_rate': 0.00031300276112234337, 'use_focal_loss': True} => F1=0.8209
 70%|███████   | 21/30 [2:28:51<47:41, 317.94s/trial, best loss: -0.8209388765486151]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2284,0.212856,0.801472,0.798515
2,0.1739,0.196101,0.819288,0.819271
3,0.139,0.208608,0.823521,0.827202


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.03442125718345023, 'learning_rate': 0.000256275263307022, 'use_focal_loss': True} => F1=0.8235
 73%|███████▎  | 22/30 [2:33:29<40:48, 306.02s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2325,0.212016,0.797649,0.79784
2,0.1843,0.200059,0.815368,0.817246
3,0.1573,0.209136,0.8128,0.81809


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.03714865230379665, 'learning_rate': 0.00017381971319044515, 'use_focal_loss': True} => F1=0.8154
 77%|███████▋  | 23/30 [2:38:08<34:44, 297.78s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.227,0.216723,0.799392,0.794971
2,0.1666,0.19171,0.818977,0.821971
3,0.124,0.22016,0.817311,0.820115
4,0.0932,0.254726,0.813421,0.815727


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 1.0, 'label_smoothing': 0.0037154612589670073, 'learning_rate': 0.00032702531761134607, 'use_focal_loss': True} => F1=0.8190
 80%|████████  | 24/30 [2:43:57<31:18, 313.16s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1646,0.151602,0.800622,0.799021
2,0.1302,0.142686,0.813619,0.815727
3,0.1107,0.150342,0.812062,0.816909


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.5, 'label_smoothing': 0.031687588572890585, 'learning_rate': 0.00017889463142200593, 'use_focal_loss': True} => F1=0.8136
 83%|████████▎ | 25/30 [2:48:13<24:40, 296.17s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1613,0.155788,0.792492,0.786871
2,0.1165,0.139403,0.818208,0.820283
3,0.0866,0.162582,0.817898,0.820283


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.5, 'label_smoothing': 0.11209584525567445, 'learning_rate': 0.0003836857154022111, 'use_focal_loss': True} => F1=0.8182
 87%|████████▋ | 26/30 [2:52:50<19:20, 290.24s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2351,0.213845,0.792797,0.797165
2,0.1903,0.203959,0.807382,0.811509
3,0.1668,0.21045,0.807646,0.813196


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.0027084523936727637, 'learning_rate': 0.00014295026104359354, 'use_focal_loss': True} => F1=0.8076
 90%|█████████ | 27/30 [2:57:28<14:20, 286.76s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1722,0.15994,0.777187,0.781809
2,0.1452,0.150347,0.800262,0.804927
3,0.1306,0.151757,0.799997,0.807965
4,0.1221,0.153301,0.807053,0.81134


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 1.5, 'label_smoothing': 0.03214572393619095, 'learning_rate': 8.082004713571994e-05, 'use_focal_loss': True} => F1=0.8071
 93%|█████████▎| 28/30 [3:03:32<10:19, 309.94s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2287,0.208726,0.806468,0.805771
2,0.1758,0.196628,0.818082,0.819102
3,0.1418,0.208524,0.8178,0.82214


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.06209749291621103, 'learning_rate': 0.00023564616210829996, 'use_focal_loss': True} => F1=0.8181
 97%|█████████▋| 29/30 [3:08:10<05:00, 300.19s/trial, best loss: -0.8235211803073061]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0151,0.014087,0.798415,0.796659
2,0.012,0.013809,0.805704,0.802734
3,0.0099,0.014533,0.814158,0.817415


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 5.0, 'label_smoothing': 0.12100011963995672, 'learning_rate': 0.00023319039336802745, 'use_focal_loss': True} => F1=0.8142
100%|██████████| 30/30 [3:12:47<00:00, 385.58s/trial, best loss: -0.8235211803073061]

Hyperopt best param indices: {'batch_size': np.int64(2), 'epochs': np.int64(1), 'gamma': np.float64(1.0), 'label_smoothing': np.float64(0.03442125718345023), 'learning_rate': np.float64(0.000256275263307022), 'use_focal_loss': np.int64(1)}


In [11]:
# ----------------------------------------------------------------------------
# 8.1 Interpret Best Hyperparams
# ----------------------------------------------------------------------------
# fmin returns option *indices* for the hp.choice dimensions. They are decoded
# with the same option lists SEARCH_SPACE was built from (defined in the
# configuration cell); re-declaring the lists here could silently drift from
# the search space. Values are converted to plain Python numbers so they print
# and serialise cleanly.
final_params = {
    "learning_rate":    float(best["learning_rate"]),
    "epochs":           EPOCH_OPTIONS[int(best["epochs"])],
    "batch_size":       BATCH_OPTIONS[int(best["batch_size"])],
    "use_focal_loss":   USE_FOCAL_OPTIONS[int(best["use_focal_loss"])],
    "gamma":            float(best["gamma"]),
    "label_smoothing":  float(best["label_smoothing"])
}

if TESTING_FLAG:
    print("Interpreted best hyperparams:\n", final_params)

Interpreted best hyperparams:
 {'learning_rate': np.float64(0.000256275263307022), 'epochs': 3, 'batch_size': 16, 'use_focal_loss': True, 'gamma': np.float64(1.0), 'label_smoothing': np.float64(0.03442125718345023)}


In [None]:
# ----------------------------------------------------------------------------
# 9. Train Final Model
# ----------------------------------------------------------------------------
# Rebuild the network with the decoded best hyper-parameters and move it to
# the selected device (Module.to returns the module itself).
best_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_labels=2,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"],
    embedding_matrix=embedding_matrix_tensor,
).to(device)

# Same schedule as the search trials, with a separate checkpoint directory.
training_args = TrainingArguments(
    output_dir="./final-enhanced-bilstm-model",
    # optimisation
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # evaluate / checkpoint / log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=1,
    # keep and reload the checkpoint with the best dev metric
    load_best_model_at_end=True,
    metric_for_best_model=BEST_MODEL_METRIC,
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Aggregate dev metrics of the reloaded best checkpoint.
results_dev = trainer.evaluate(encoded_dataset["dev"])
if TESTING_FLAG:
    print("Final Dev Results:", results_dev)

# Per-class breakdown from the raw dev predictions.
preds_output = trainer.predict(encoded_dataset["dev"])
dev_labels = preds_output.label_ids
dev_preds = preds_output.predictions.argmax(axis=1)
if TESTING_FLAG:
    print("\nDetailed Classification Report (Dev):")
    print(classification_report(dev_labels, dev_preds, digits=4))
    

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2284,0.212856,0.801472,0.798515
2,0.1739,0.196101,0.819288,0.819271
3,0.139,0.208608,0.823521,0.827202


Final Dev Results: {'eval_loss': 0.20860803127288818, 'eval_f1': 0.8235211803073061, 'eval_accuracy': 0.8272021599730003, 'eval_runtime': 13.1419, 'eval_samples_per_second': 450.926, 'eval_steps_per_second': 56.385, 'epoch': 3.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.8634    0.9041    0.8833      4286
           1     0.7142    0.6262    0.6673      1640

    accuracy                         0.8272      5926
   macro avg     0.7888    0.7652    0.7753      5926
weighted avg     0.8221    0.8272    0.8235      5926



In [33]:
# Save the best model: only the weights (state_dict) of the model currently
# held by the trainer are written, not the full module object. The trainer was
# configured with load_best_model_at_end=True.
torch.save(trainer.model.state_dict(), BEST_MODEL_PATH)
print(f"Best model saved to {BEST_MODEL_PATH}")

Best model saved to data\ED_B_Model.pt


In [36]:
# Recreate the architecture, then overwrite all weights from the saved
# checkpoint. The constructor arguments must match the ones used in training
# so that the parameter shapes line up.
loaded_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"],
    embedding_matrix=embedding_matrix_tensor
)

# The file holds a plain state_dict, so restrict unpickling to tensors and
# primitive containers instead of allowing arbitrary pickled objects.
state_dict = torch.load(BEST_MODEL_PATH, map_location=device, weights_only=True)
loaded_model.load_state_dict(state_dict)
loaded_model.to(device)
loaded_model.eval()
print("Successfully loaded state_dict into loaded_model.")

# Let the existing trainer use the reloaded weights as well.
trainer.model = loaded_model


Successfully loaded state_dict into loaded_model.


In [None]:
# ----------------------------------------------------------------------------
# 10. Inference on Test Set
# ----------------------------------------------------------------------------
from torch.utils.data import DataLoader

test_loader = DataLoader(encoded_dataset["test"], batch_size=8)

all_preds = []
loaded_model.eval()

# No gradients are needed for prediction.
with torch.no_grad():
    for batch in test_loader:
        # Move every tensor of the batch to the model's device.
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = loaded_model(**batch)
        logits = outputs["logits"]
        preds = logits.argmax(dim=1)
        all_preds.extend(preds.cpu().tolist())

# Convert to a DataFrame and save (single "prediction" column with header)
test_pred_df = pd.DataFrame({"prediction": all_preds})
test_pred_df.to_csv(OUTPUT_PATH, index=False, header=True)
print("Saved predictions")


Saved predictions


In [None]:
# ----------------------------------------------------------------------------
# Inference on Dev Set (codebench debugging)
# ----------------------------------------------------------------------------
from torch.utils.data import DataLoader

dev_loader = DataLoader(encoded_dataset["dev"], batch_size=8)

all_dev_preds = []
loaded_model.eval()

for batch in dev_loader:
    # The dev batches also carry "labels"; the model accepts them and
    # additionally computes a loss, which is not used here.
    for key, value in batch.items():
        batch[key] = value.to(device)

    with torch.no_grad():
        outputs = loaded_model(**batch)
    logits = outputs["logits"]
    preds = logits.argmax(dim=1)
    all_dev_preds.extend(preds.cpu().tolist())

# One "prediction" column, written to the current working directory.
dev_pred_df = pd.DataFrame({"prediction": all_dev_preds})
dev_pred_df.to_csv("dev_predictions.csv", index=False, header=True)

print("Saved dev predictions to dev_predictions.csv")


Saved dev predictions to dev_predictions.csv
