In [1]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import Trainer, TrainingArguments

# ----------------------------------------------------------------------------
# Global Variables and Flags
# ----------------------------------------------------------------------------
TESTING_FLAG = True  # If True, print debug info
DOWNLOAD_FLAG = True # If True, handle NLTK data

# One seed for every RNG the notebook draws from: Python's `random`
# (augmentation sampling), NumPy (random embedding initialisation when GloVe
# is unavailable) and PyTorch (model weight initialisation).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

NLTK_DATA_DIR = "/kaggle/working/nltk_data"
TRAIN_PATH = "/kaggle/input/nlu-ed-task/train.csv"
DEV_PATH   = "/kaggle/input/nlu-ed-task/dev.csv"
# NOTE(review): TEST_PATH currently points at the training CSV, so the "test"
# predictions written at the end of the notebook are predictions on training
# rows. Point it at the real test file before using the output.
TEST_PATH  = "/kaggle/input/nlu-ed-task/train.csv"  # Temporary for demonstration
OUTPUT_PATH = "test_predictions.csv"

# Probability that a training row also gets a synonym-replaced copy.
AUGMENTED_COPY_CHANCE = 0.15
# Candidate values for the hp.choice dimensions below. hyperopt's fmin reports
# the *index* of the chosen option, so these lists are also needed to decode
# the best result.
EPOCH_OPTIONS = [2, 3, 4]
BATCH_OPTIONS  = [4, 8, 16]
USE_FOCAL_OPTIONS = [False, True]

# Hyperparameter search space; the keys are the names read by the training
# helpers (learning_rate, epochs, batch_size, use_focal_loss, gamma,
# label_smoothing).
SEARCH_SPACE = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", EPOCH_OPTIONS),
    "batch_size":      hp.choice("batch_size", BATCH_OPTIONS),
    "use_focal_loss":  hp.choice("use_focal_loss", USE_FOCAL_OPTIONS),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

MAX_EVALS = 30           # number of hyperopt trials
EVAL_BATCH_SIZE = 8      # per-device batch size used for evaluation
BEST_MODEL_METRIC = "f1" # metric used to pick the best checkpoint

GLOVE_PATH = "/kaggle/input/glove6b300d/glove.6B.300d.txt"  # Adjust path if needed
EMBED_DIM = 300
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
USE_ATTENTION = True    # If True, apply a simple attention layer

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if TESTING_FLAG:
    print("Using device:", device)

Using device: cuda


In [2]:
# ----------------------------------------------------------------------------
# 1. Fetch NLTK data
# ----------------------------------------------------------------------------
if DOWNLOAD_FLAG:
    nltk.data.path.append(NLTK_DATA_DIR)
    nltk.download("wordnet", download_dir=NLTK_DATA_DIR)
    nltk.download("omw-1.4", download_dir=NLTK_DATA_DIR)
    !unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
    !unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/
    print("Downloaded NLTK data")

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Archive:  /kaggle/working/nltk_data/corpora/omw-1.4.zip
replace /kaggle/working/nltk_data/corpora/omw-1.4/fin/LICENSE? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
Archive:  /kaggle/working/nltk_data/corpora/wordnet.zip
replace /kaggle/working/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
Downloaded NLTK data


In [3]:
# ----------------------------------------------------------------------------
# 2. Load Data
# ----------------------------------------------------------------------------
# Read each split and normalise the capitalised text headers ("Claim",
# "Evidence") to the lowercase names used by the rest of the notebook.
# rename() ignores keys that are absent, so files that already use lowercase
# headers pass through unchanged.
train_df = pd.read_csv(TRAIN_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})
dev_df = pd.read_csv(DEV_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})
test_df = pd.read_csv(TEST_PATH).rename(columns={"Claim": "claim", "Evidence": "evidence"})

if TESTING_FLAG:
    print("Train samples:", len(train_df))
    print("Dev samples:", len(dev_df))
    print("Test samples:", len(test_df))

# Only the labelled splits get their label column cast to integers.
train_df = train_df.astype({"label": int})
dev_df = dev_df.astype({"label": int})


Train samples: 21508
Dev samples: 5926
Test samples: 21508


In [4]:
# ----------------------------------------------------------------------------
# 3. Data Augmentation (Synonym Replacement)
# ----------------------------------------------------------------------------
# Seed Python's `random` module so the random.sample / random.choice /
# random.random calls made by the augmentation helpers in this cell are
# repeatable when the cell is run from this point.
random.seed(42)

def synonym_replacement(sentence, n=1):
    """
    Replace up to `n` randomly chosen words of `sentence` with a WordNet synonym.

    The sentence is split on whitespace and min(n, number of words) positions
    are sampled without replacement. For each sampled word, the lemma names of
    its first WordNet synset are considered, excluding the word itself
    (case-insensitive). Positions without a usable lemma are left unchanged.

    sentence: input text
    n: maximum number of words to replace
    Returns: the words joined with single spaces, or `sentence` unchanged when
             it contains fewer than two words.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence
    indices_to_replace = random.sample(range(len(words)), k=min(n, len(words)))
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        lemmas = [l for l in syns[0].lemma_names() if l.lower() != word.lower()]
        if not lemmas:
            continue
        # WordNet writes multi-word lemmas with underscores ("motor_vehicle").
        # Turn them back into spaces so the replacement reads as normal text
        # instead of injecting an artificial underscore token.
        new_words[i] = random.choice(lemmas).replace("_", " ")
    return " ".join(new_words)

def augment_dataframe(df):
    """
    Return a new DataFrame containing every row of `df`, each optionally
    followed by one synonym-augmented copy.

    A copy is added with probability AUGMENTED_COPY_CHANCE. A fair coin then
    decides whether the copy's "claim" or its "evidence" text goes through
    synonym_replacement (one word); all other fields are carried over as-is.
    """
    records = []
    for _, source_row in df.iterrows():
        records.append(source_row.to_dict())
        if not (random.random() < AUGMENTED_COPY_CHANCE):
            continue
        variant = source_row.copy()
        target_column = "claim" if random.random() < 0.5 else "evidence"
        variant[target_column] = synonym_replacement(source_row[target_column], n=1)
        records.append(variant.to_dict())
    return pd.DataFrame(records)

# Build the augmented training frame: augment_dataframe keeps every original
# row and adds a synonym-replaced copy after a random subset of them.
augmented_train_df = augment_dataframe(train_df)
if TESTING_FLAG:
    print("Original train size:", len(train_df),
          "=> After augmentation:", len(augmented_train_df))

# Rebind train_df to the augmented frame (index discarded and renumbered).
# NOTE(review): because train_df is overwritten here, re-running this cell
# without re-running the load cell augments already-augmented data.
train_df = augmented_train_df.reset_index(drop=True)

Original train size: 21508 => After augmentation: 24819


In [5]:
# ----------------------------------------------------------------------------
# 4. Create Hugging Face Datasets
# ----------------------------------------------------------------------------
# Wrap each pandas frame in a Hugging Face Dataset, in train/dev/test order.
train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Bundle the three splits so they can be tokenized with a single map() call.
dataset_dict = DatasetDict({
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset,
})

if TESTING_FLAG:
    print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 21508
    })
})


In [6]:
# ----------------------------------------------------------------------------
# 5. Tokenization
# ----------------------------------------------------------------------------
# Only the tokenizer of this checkpoint is loaded: it supplies consistent
# token IDs and attention masks, and its vocabulary size later defines the
# embedding table of the custom BiLSTM model.
TOKENIZER_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

def tokenize_function(examples):
    """
    Encode claim/evidence pairs with the module-level `tokenizer`.

    examples: mapping with "claim" and "evidence" entries (a batch when used
              with map(batched=True))
    Returns: whatever the tokenizer returns for the pair, truncated and padded
             to a fixed length of 128 tokens.
    """
    claims, evidences = examples["claim"], examples["evidence"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(claims, evidences, **encoding_options)

# Tokenize all three splits; batched=True hands tokenize_function a batch of
# examples per call instead of one example at a time.
encoded_dataset = dataset_dict.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/21508 [00:00<?, ? examples/s]

In [7]:
# ----------------------------------------------------------------------------
# 5.1. Format for PyTorch
# ----------------------------------------------------------------------------
# The labelled splits expose their target under "labels", the keyword the
# model's forward() accepts.
for split_name in ("train", "dev"):
    encoded_dataset[split_name] = encoded_dataset[split_name].rename_column("label", "labels")

# Every split drops the raw text columns (only the tokenizer outputs are
# needed from here on) and is switched to return torch tensors.
for split_name in ("train", "dev", "test"):
    encoded_dataset[split_name] = encoded_dataset[split_name].remove_columns(["claim", "evidence"])
    encoded_dataset[split_name].set_format("torch")

In [8]:
# ----------------------------------------------------------------------------
# 6. Load GloVe embeddings & Build an Embedding Matrix
# ----------------------------------------------------------------------------
def load_glove_embeddings(glove_file, vocab, embedding_dim=300):
    """
    Build an embedding matrix for `vocab`, filling rows from a GloVe text file
    where a matching word exists.

    Every row starts as N(0, 0.1) noise. A vocabulary token matches a GloVe
    word after lower-casing and removing a leading "##" word-piece prefix.
    Lines whose field count is not `embedding_dim + 1` are skipped; if a word
    occurs more than once in the file, the last occurrence wins.

    glove_file: path to a whitespace-separated GloVe text file
    vocab: a dict {token_string: token_index}
    embedding_dim: expected vector size
    Returns: (embedding_matrix, found) where embedding_matrix is a float32
             numpy array [vocab_size, embedding_dim] and found is the number
             of vocabulary tokens initialised from GloVe (0 when the file is
             missing).
    """
    embedding_matrix = np.random.normal(
        scale=0.1,
        size=(len(vocab), embedding_dim)
    ).astype(np.float32)
    found = 0

    if not os.path.isfile(glove_file):
        print(f"GloVe file not found at {glove_file}, using random init.")
        return embedding_matrix, found

    print(f"Loading GloVe from {glove_file}...")

    # Index the vocabulary by its normalised form first, so the (large) GloVe
    # file can be streamed and only vectors that are actually needed are
    # parsed and kept. Several tokens (e.g. "ing" and "##ing") may share one
    # normalised form, hence the list of row indices.
    rows_by_word = {}
    for token, idx in vocab.items():
        # Strip only a leading "##" (the word-piece continuation marker);
        # a "##" elsewhere in the token is part of the token itself.
        stem = token[2:] if token.startswith("##") else token
        rows_by_word.setdefault(stem.lower(), []).append(idx)

    matched_rows = set()
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) != embedding_dim + 1:
                continue
            rows = rows_by_word.get(values[0])
            if rows is None:
                continue
            embedding_matrix[rows] = np.asarray(values[1:], dtype="float32")
            matched_rows.update(rows)

    # Count distinct vocabulary rows, so a word repeated in the file is not
    # counted twice.
    found = len(matched_rows)

    print(f"Initialized embedding_matrix with {found} GloVe tokens matched out of {len(vocab)}")
    return embedding_matrix, found

# Align the embedding rows with the tokenizer's own ids, so input_ids produced
# by the tokenizer index directly into the matrix.
vocab_dict = tokenizer.get_vocab()  # {token_str: token_id}
# glove_found is the number of vocabulary tokens initialised from GloVe
# (0 when the GloVe file is missing and every row is random).
embedding_matrix_np, glove_found = load_glove_embeddings(GLOVE_PATH, vocab_dict, EMBED_DIM)
embedding_matrix_tensor = torch.tensor(embedding_matrix_np)

GloVe file not found at /kaggle/input/glove6b300d/glove.6B.300d.txt, using random init.


In [9]:
# ----------------------------------------------------------------------------
# 7. Custom BiLSTM with optional attention
# ----------------------------------------------------------------------------
class SimpleAttention(nn.Module):
    """
    Attention pooling over the time steps of a BiLSTM output:
    score_t = v . tanh(W h_t + b),
    weights = softmax of the scores over time steps,
    output  = sum_t weights_t * h_t
    """
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Because LSTM is bidirectional, total hidden is 2 * hidden_dim
        self.W = nn.Linear(2 * hidden_dim, 2 * hidden_dim)
        self.v = nn.Linear(2 * hidden_dim, 1, bias=False)

    def forward(self, lstm_outputs, mask=None):
        """
        lstm_outputs: (B, L, 2*hidden_dim)
        mask: (B, L), optional (non-zero for real tokens, 0 for pad)
        Returns: (B, 2*hidden_dim) - the attention-weighted sum over L
        """
        # One scalar score per time step
        score = self.v(torch.tanh(self.W(lstm_outputs))).squeeze(-1)  # (B, L)

        if mask is not None:
            # Padded steps get the most negative value representable in the
            # score's own dtype, so they receive zero weight after softmax.
            # A fixed -1e9 cannot be represented in float16 and fails under
            # mixed precision.
            score = score.masked_fill(mask == 0, torch.finfo(score.dtype).min)

        attn_weights = F.softmax(score, dim=-1)   # (B, L)
        # (B, 1, L) @ (B, L, 2H) -> (B, 1, 2H) -> (B, 2H)
        context = torch.bmm(attn_weights.unsqueeze(1), lstm_outputs).squeeze(1)

        return context


class CustomBiLSTMModel(nn.Module):
    """
    Embedding -> multi-layer BiLSTM -> pooled sequence vector -> linear
    classifier, with the training loss computed inside forward().

    The sequence vector is either the SimpleAttention-weighted sum of the LSTM
    outputs (use_attention=True) or the concatenated final hidden states of
    the last layer's two directions. The loss is focal loss when
    use_focal_loss is set, otherwise (optionally label-smoothed) cross-entropy.
    """
    def __init__(self,
                 vocab_size,
                 embed_dim=300,
                 hidden_dim=256,
                 num_labels=2,
                 num_layers=2,
                 dropout=0.3,
                 use_attention=True,
                 use_focal_loss=False,
                 gamma=2.0,
                 label_smoothing=0.0,
                 embedding_matrix=None):
        """
        vocab_size: number of rows in the embedding table
        embed_dim: embedding size (LSTM input size)
        hidden_dim: LSTM hidden size per direction
        num_labels: number of output classes
        num_layers: number of stacked LSTM layers
        dropout: dropout between stacked LSTM layers
        use_attention: pool with SimpleAttention instead of final hidden states
        use_focal_loss: use focal loss (with `gamma`) instead of cross-entropy
        gamma: focal-loss focusing exponent
        label_smoothing: smoothing mass used by the cross-entropy branch
        embedding_matrix: optional tensor [vocab_size, embed_dim] copied into
                          the embedding weights
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.num_labels = num_labels

        self.use_attention = use_attention
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing

        # Embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            with torch.no_grad():
                self.embedding.weight.copy_(embedding_matrix)

        # BiLSTM with multiple layers & dropout.
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0 in that case.
        self.lstm = nn.LSTM(
            input_size=self.embed_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=self.dropout if self.num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True
        )

        # Optional attention
        if self.use_attention:
            self.attn = SimpleAttention(self.hidden_dim)

        # Classification head
        # If we have 2 directions => 2H
        self.classifier = nn.Linear(2 * self.hidden_dim, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        input_ids: (B, L) token ids
        attention_mask: (B, L), optional; 0 marks padding
        labels: (B,), optional class indices
        **kwargs: extra batch fields (ignored)
        Returns: {"loss": ..., "logits": (B, num_labels)} when labels are
                 given, otherwise {"logits": ...} only.
        """
        # Embeddings
        embeds = self.embedding(input_ids)
        # zero out padding
        if attention_mask is not None:
            expand_mask = attention_mask.unsqueeze(-1).float()
            embeds = embeds * expand_mask

        # LSTM
        lstm_outputs, (h, c) = self.lstm(embeds)
        # shape of lstm_outputs: (B, L, 2H)

        if self.use_attention:
            # Weighted sum of outputs
            context = self.attn(lstm_outputs, mask=attention_mask)
        else:
            # We'll just take final hidden states from both directions
            # h shape: (num_layers*2, B, H)
            h_forward = h[-2]  # last layer's forward state
            h_backward = h[-1] # last layer's backward state
            context = torch.cat((h_forward, h_backward), dim=-1)  # (B, 2H)

        logits = self.classifier(context)

        # Without labels there is no loss. Leave the key out entirely rather
        # than returning None: consumers that collect every non-loss entry of
        # the output dict (as Trainer does when predicting on unlabeled data)
        # would otherwise pick up the None next to the logits.
        if labels is None:
            return {"logits": logits}

        if self.use_focal_loss:
            loss = self.focal_loss(logits, labels, self.gamma)
        else:
            loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)

        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        """
        Mean focal loss: (1 - p_t)^gamma * CE, where p_t is the predicted
        probability of the true class.
        """
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # CE = -log(p_t)  =>  p_t = exp(-CE)
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        """
        Cross-entropy against a smoothed target distribution: the true class
        gets 1 - smoothing and the remaining mass is split evenly over the
        other classes. With smoothing == 0 this is plain cross-entropy.
        """
        if smoothing == 0.0:
            return F.cross_entropy(logits, targets)
        log_probs = F.log_softmax(logits, dim=-1)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(smoothing / (n_class - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [10]:
# ----------------------------------------------------------------------------
# 8. Build Trainer & Hyperopt
# ----------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """
    Turn an evaluation result into the metrics reported per epoch.

    eval_pred: a (logits, labels) pair
    Returns: {"f1": weighted F1, "accuracy": accuracy} computed on the
             arg-max class of each row of logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {
        "f1": f1_score(gold, predicted, average="weighted"),
        "accuracy": accuracy_score(gold, predicted),
    }

def make_trainer(model, train_ds, dev_ds, space):
    """
    Build a Trainer for `model` from one point of the hyperparameter space.

    model: the model to train
    train_ds / dev_ds: training and evaluation datasets
    space: dict providing "learning_rate", "epochs" and "batch_size"
           (epochs and batch_size are cast to int)
    Returns: a Trainer that evaluates, logs and checkpoints once per epoch and
             reloads the checkpoint with the best BEST_MODEL_METRIC at the end.
    """
    training_config = {
        "output_dir": "./enhanced-bilstm-ed-checkpoints",
        # Per-epoch schedule for evaluation, checkpointing and logging
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "logging_strategy": "epoch",
        # Values taken from the sampled hyperparameters
        "learning_rate": space["learning_rate"],
        "num_train_epochs": int(space["epochs"]),
        "per_device_train_batch_size": int(space["batch_size"]),
        "per_device_eval_batch_size": EVAL_BATCH_SIZE,
        # Best-checkpoint selection
        "load_best_model_at_end": True,
        "metric_for_best_model": BEST_MODEL_METRIC,
        "greater_is_better": True,
        "save_total_limit": 1,
        "report_to": "none",
        "logging_steps": 1,
    }

    return Trainer(
        model=model,
        args=TrainingArguments(**training_config),
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

def objective(space):
    """
    Hyperopt objective: train a fresh model with the sampled hyperparameters
    and score it on the dev split.

    space: dict with the sampled values (loss options are read here; the
           optimisation options are read by make_trainer)
    Returns: {"loss": -F1, "status": STATUS_OK}; F1 is negated because the
             returned loss is what gets minimised.
    """
    # Build model
    model = CustomBiLSTMModel(
        vocab_size=len(tokenizer.get_vocab()),
        embed_dim=EMBED_DIM,
        hidden_dim=HIDDEN_DIM,
        num_labels=2,
        num_layers=NUM_LAYERS,
        dropout=DROPOUT,
        use_attention=USE_ATTENTION,
        use_focal_loss=space["use_focal_loss"],
        gamma=space["gamma"],
        label_smoothing=space["label_smoothing"],
        embedding_matrix=embedding_matrix_tensor,
    )
    model.to(device)

    train_split, dev_split = encoded_dataset["train"], encoded_dataset["dev"]
    trainer = make_trainer(model, train_split, dev_split, space)
    trainer.train()
    dev_f1 = trainer.evaluate(dev_split)["eval_f1"]

    if TESTING_FLAG:
        print(f"[Hyperopt] params={space} => F1={dev_f1:.4f}")
    return {"loss": -dev_f1, "status": STATUS_OK}

# Run the TPE search: `objective` is called MAX_EVALS times with points drawn
# from SEARCH_SPACE, and `trials` keeps the per-trial records.
trials = Trials()
best = fmin(
    fn=objective,
    space=SEARCH_SPACE,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials
)

# For the hp.choice dimensions (epochs, batch_size, use_focal_loss) `best`
# holds the index of the chosen option, not the option value itself; the
# next cell maps those indices back to values.
if TESTING_FLAG:
    print("\nHyperopt best param indices:", best)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0494,0.044498,0.722082,0.763247
2,0.0409,0.042593,0.759018,0.776747


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.15599873886629706, 'learning_rate': 3.170526971699426e-05, 'use_focal_loss': True} => F1=0.7590
  3%|▎         | 1/30 [01:46<51:21, 106.26s/trial, best loss: -0.7590180872514256]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1054,0.104673,0.607102,0.723253
2,0.1012,0.100414,0.607499,0.723422


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.028835345786337997, 'learning_rate': 1.0304624272436615e-05, 'use_focal_loss': True} => F1=0.6075
  7%|▋         | 2/30 [04:22<1:03:22, 135.81s/trial, best loss: -0.7590180872514256]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0325,0.029251,0.782434,0.789234
2,0.0262,0.028527,0.797402,0.802227
3,0.0243,0.03464,0.798839,0.803746
4,0.0236,0.038624,0.800356,0.804927


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 4.0, 'label_smoothing': 0.048510621605982275, 'learning_rate': 2.90498654240857e-05, 'use_focal_loss': True} => F1=0.8004
 10%|█         | 3/30 [13:00<2:19:40, 310.41s/trial, best loss: -0.800355651832787] 

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4886,0.454297,0.800204,0.801384
2,0.3899,0.475443,0.79796,0.804421


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.0, 'label_smoothing': 0.021342332085049834, 'learning_rate': 0.0001245387711327438, 'use_focal_loss': False} => F1=0.8002
 13%|█▎        | 4/30 [15:43<1:49:13, 252.06s/trial, best loss: -0.800355651832787]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5441,0.503527,0.75341,0.774722
2,0.453,0.474724,0.77624,0.78299


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.007807440513335706, 'learning_rate': 4.11167894582814e-05, 'use_focal_loss': False} => F1=0.7762
 17%|█▋        | 5/30 [17:31<1:23:26, 200.25s/trial, best loss: -0.800355651832787]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5617,0.560445,0.793685,0.797165
2,0.4984,0.558794,0.801718,0.803409
3,0.4563,0.570823,0.798446,0.803409
4,0.4253,0.595695,0.797752,0.801721


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 1.5, 'label_smoothing': 0.10998744379026944, 'learning_rate': 0.0002398018217274467, 'use_focal_loss': False} => F1=0.8017
 20%|██        | 6/30 [26:20<2:04:46, 311.93s/trial, best loss: -0.8017180465085869]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0377,0.037586,0.607102,0.723253
2,0.0371,0.037238,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.07094317542816368, 'learning_rate': 1.0230798732370338e-05, 'use_focal_loss': True} => F1=0.6071
 23%|██▎       | 7/30 [28:05<1:33:36, 244.20s/trial, best loss: -0.8017180465085869]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0958,0.085287,0.754181,0.775228
2,0.0794,0.085415,0.772823,0.785015


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.022277476101585748, 'learning_rate': 2.70583025787652e-05, 'use_focal_loss': True} => F1=0.7728
 27%|██▋       | 8/30 [30:40<1:19:10, 215.92s/trial, best loss: -0.8017180465085869]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6186,0.59146,0.77056,0.786196
2,0.5763,0.584999,0.784611,0.795815
3,0.5655,0.586227,0.785324,0.793959


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 4.5, 'label_smoothing': 0.15631430529930673, 'learning_rate': 3.098579348728949e-05, 'use_focal_loss': False} => F1=0.7853
 30%|███       | 9/30 [34:37<1:17:52, 222.49s/trial, best loss: -0.8017180465085869]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6413,0.627767,0.607499,0.723422
2,0.6031,0.599908,0.754691,0.774215
3,0.5864,0.596667,0.766883,0.781303


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.1584789861040354, 'learning_rate': 2.1212588253369e-05, 'use_focal_loss': False} => F1=0.7669
 33%|███▎      | 10/30 [37:12<1:07:11, 201.57s/trial, best loss: -0.8017180465085869]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0296,0.027402,0.803647,0.803409
2,0.0202,0.029042,0.804674,0.810496


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.18891778650060695, 'learning_rate': 0.0002940519922101875, 'use_focal_loss': True} => F1=0.8047
 37%|███▋      | 11/30 [38:58<54:31, 172.21s/trial, best loss: -0.80467388224386]    

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6084,0.581426,0.782261,0.794128
2,0.5643,0.5788,0.787592,0.798178


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.1492101758865726, 'learning_rate': 5.874867261779884e-05, 'use_focal_loss': False} => F1=0.7876
 40%|████      | 12/30 [40:46<45:52, 152.89s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5865,0.547985,0.783525,0.79379
2,0.5252,0.540888,0.79002,0.799865
3,0.5074,0.543335,0.793697,0.800877


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 5.0, 'label_smoothing': 0.1039113173167458, 'learning_rate': 5.136767899570798e-05, 'use_focal_loss': False} => F1=0.7937
 43%|████▎     | 13/30 [43:25<43:50, 154.76s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1996,0.188551,0.776113,0.781471
2,0.1688,0.169152,0.791876,0.795478
3,0.1579,0.185759,0.794793,0.797503
4,0.1528,0.207669,0.796377,0.802059


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 1.5, 'label_smoothing': 0.14049237387885763, 'learning_rate': 2.6546375788382142e-05, 'use_focal_loss': True} => F1=0.7964
 47%|████▋     | 14/30 [52:15<1:11:28, 268.05s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5884,0.575557,0.799904,0.801046
2,0.5377,0.582531,0.797158,0.802059


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.1501581473984195, 'learning_rate': 0.0001694997765515589, 'use_focal_loss': False} => F1=0.7999
 50%|█████     | 15/30 [54:57<59:01, 236.12s/trial, best loss: -0.80467388224386]  

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1643,0.151979,0.800987,0.806446
2,0.1149,0.159792,0.797238,0.804421


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.024914529251201568, 'learning_rate': 0.0004197092558764367, 'use_focal_loss': True} => F1=0.8010
 53%|█████▎    | 16/30 [56:46<46:08, 197.72s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5603,0.545129,0.791216,0.797165
2,0.5076,0.560702,0.794191,0.802734


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.09701321232039915, 'learning_rate': 8.451230032623634e-05, 'use_focal_loss': False} => F1=0.7942
 57%|█████▋    | 17/30 [1:01:13<47:23, 218.70s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1186,0.113702,0.795459,0.793959
2,0.0887,0.111443,0.795968,0.795815
3,0.0732,0.134395,0.797786,0.801552


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.056643147754697504, 'learning_rate': 0.00011831588529408005, 'use_focal_loss': True} => F1=0.7978
 60%|██████    | 18/30 [1:05:12<44:57, 224.77s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0332,0.029981,0.768531,0.78434
2,0.027,0.029444,0.77913,0.790078


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.015568230030244434, 'learning_rate': 5.197559784390158e-05, 'use_focal_loss': True} => F1=0.7791
 63%|██████▎   | 19/30 [1:06:59<34:42, 189.33s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0266,0.026568,0.607102,0.723253
2,0.0261,0.026161,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.13635106686204593, 'learning_rate': 1.1577999041738486e-05, 'use_focal_loss': True} => F1=0.6071
 67%|██████▋   | 20/30 [1:08:45<27:22, 164.21s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6142,0.605654,0.800933,0.808134
2,0.5714,0.609624,0.804129,0.808302
3,0.5414,0.620283,0.797789,0.802396
4,0.5226,0.636327,0.792747,0.796153


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 3.0, 'label_smoothing': 0.19991770119774901, 'learning_rate': 0.00047023484216293963, 'use_focal_loss': False} => F1=0.8041
 70%|███████   | 21/30 [1:17:22<40:31, 270.13s/trial, best loss: -0.80467388224386]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6139,0.602272,0.798172,0.809821
2,0.5726,0.606532,0.806976,0.811002
3,0.5437,0.617318,0.7997,0.805265
4,0.5229,0.633303,0.792945,0.795309


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 3.0, 'label_smoothing': 0.1970405195193367, 'learning_rate': 0.0004955511800197659, 'use_focal_loss': False} => F1=0.8070
 73%|███████▎  | 22/30 [1:26:02<46:02, 345.33s/trial, best loss: -0.8069762909611161]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6152,0.607267,0.794578,0.804927
2,0.5771,0.608177,0.80417,0.807796
3,0.5507,0.617685,0.799634,0.807796
4,0.5328,0.62842,0.796925,0.801046


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.19841756680684028, 'learning_rate': 0.00028938735323599964, 'use_focal_loss': False} => F1=0.8042
 77%|███████▋  | 23/30 [1:34:44<46:28, 398.31s/trial, best loss: -0.8069762909611161]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0409,0.038743,0.803888,0.807627
2,0.0279,0.040352,0.803754,0.807627
3,0.0181,0.058803,0.795166,0.797334
4,0.0099,0.098109,0.790256,0.793115


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.18188580660630393, 'learning_rate': 0.00032756957638555706, 'use_focal_loss': True} => F1=0.8039
 80%|████████  | 24/30 [1:43:16<43:13, 432.22s/trial, best loss: -0.8069762909611161]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0149,0.013771,0.803215,0.807121
2,0.0105,0.013932,0.809048,0.811509
3,0.0077,0.018103,0.80293,0.80459
4,0.0054,0.026158,0.799684,0.801721


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 5.0, 'label_smoothing': 0.17681928136354222, 'learning_rate': 0.00018668565591363687, 'use_focal_loss': True} => F1=0.8090
 83%|████████▎ | 25/30 [1:51:47<38:00, 456.07s/trial, best loss: -0.8090475209245707]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6067,0.599574,0.796695,0.799359
2,0.5666,0.600935,0.796328,0.803746
3,0.5404,0.61015,0.799745,0.803409
4,0.5229,0.621269,0.7987,0.802227


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 5.0, 'label_smoothing': 0.18133233736063667, 'learning_rate': 0.0001880236236852539, 'use_focal_loss': False} => F1=0.7997
 87%|████████▋ | 26/30 [2:00:38<31:53, 478.34s/trial, best loss: -0.8090475209245707]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0147,0.014372,0.802764,0.810496
2,0.01,0.014579,0.801859,0.806784
3,0.006,0.020717,0.794356,0.795984
4,0.0026,0.044044,0.790644,0.792946


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 5.0, 'label_smoothing': 0.12037570496993175, 'learning_rate': 0.0004927922027484601, 'use_focal_loss': True} => F1=0.8028
 90%|█████████ | 27/30 [2:09:20<24:34, 491.58s/trial, best loss: -0.8090475209245707]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6044,0.596161,0.790733,0.795478
2,0.5668,0.596527,0.799646,0.806446
3,0.5469,0.609937,0.790557,0.794296
4,0.5323,0.619751,0.791326,0.796153


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 1.0, 'label_smoothing': 0.17277028367989872, 'learning_rate': 9.945677193172808e-05, 'use_focal_loss': False} => F1=0.7996
 93%|█████████▎| 28/30 [2:18:22<16:53, 506.61s/trial, best loss: -0.8090475209245707]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0209,0.019351,0.804071,0.80729
2,0.0147,0.019644,0.807885,0.810159
3,0.0107,0.025832,0.801661,0.803071
4,0.0073,0.037228,0.799253,0.801046


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.1671975277624146, 'learning_rate': 0.0001932750781421523, 'use_focal_loss': True} => F1=0.8079
 97%|█████████▋| 29/30 [2:26:51<08:27, 507.39s/trial, best loss: -0.8090475209245707]

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0209,0.019298,0.802784,0.807459
2,0.0148,0.019315,0.807871,0.810327
3,0.011,0.024953,0.80323,0.804927
4,0.0079,0.035246,0.802317,0.804084


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.17123828114873232, 'learning_rate': 0.00016460269780220149, 'use_focal_loss': True} => F1=0.8079
100%|██████████| 30/30 [2:35:18<00:00, 310.61s/trial, best loss: -0.8090475209245707]

Hyperopt best param indices: {'batch_size': 0, 'epochs': 2, 'gamma': 5.0, 'label_smoothing': 0.17681928136354222, 'learning_rate': 0.00018668565591363687, 'use_focal_loss': 1}


In [11]:
# ----------------------------------------------------------------------------
# 8.1 Interpret Best Hyperparams
# ----------------------------------------------------------------------------
# fmin returns the *index* of the winning option for every hp.choice
# dimension, so those entries are looked up in the option lists that
# SEARCH_SPACE was built from (EPOCH_OPTIONS, BATCH_OPTIONS,
# USE_FOCAL_OPTIONS in the config cell). The lists are deliberately not
# re-declared here: a second copy could drift from the one the search used
# and silently map an index to the wrong value.
final_params = {
    "learning_rate":    best["learning_rate"],
    "epochs":           EPOCH_OPTIONS[ best["epochs"] ],
    "batch_size":       BATCH_OPTIONS[ best["batch_size"] ],
    "use_focal_loss":   USE_FOCAL_OPTIONS[ best["use_focal_loss"] ],
    "gamma":            best["gamma"],
    "label_smoothing":  best["label_smoothing"]
}

if TESTING_FLAG:
    print("Interpreted best hyperparams:\n", final_params)

Interpreted best hyperparams:
 {'learning_rate': 0.00018668565591363687, 'epochs': 4, 'batch_size': 4, 'use_focal_loss': True, 'gamma': 5.0, 'label_smoothing': 0.17681928136354222}


In [12]:
# ----------------------------------------------------------------------------
# 9. Train Final Model
# ----------------------------------------------------------------------------
# Fresh model configured with the decoded best hyperparameters; architecture
# settings come from the config constants.
best_model = CustomBiLSTMModel(
    vocab_size=len(tokenizer.get_vocab()),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    use_attention=USE_ATTENTION,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"],
    embedding_matrix=embedding_matrix_tensor
)
best_model.to(device)

# Same schedule as the search runs (evaluate / save / log once per epoch,
# reload the checkpoint with the best BEST_MODEL_METRIC at the end), but
# written to a separate output directory.
training_args = TrainingArguments(
    output_dir="./final-enhanced-bilstm-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model=BEST_MODEL_METRIC,
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",
    logging_steps=1
)

# NOTE: this rebinds the global `trainer`; the inference cell below uses it.
trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# Aggregate dev metrics of the model left in the trainer after training.
results_dev = trainer.evaluate(encoded_dataset["dev"])
if TESTING_FLAG:
    print("Final Dev Results:", results_dev)

# Second pass over dev to obtain per-example predictions for the
# per-class report.
preds_output = trainer.predict(encoded_dataset["dev"])
dev_preds = np.argmax(preds_output.predictions, axis=1)
dev_labels = preds_output.label_ids
if TESTING_FLAG:
    print("\nDetailed Classification Report (Dev):")
    print(classification_report(dev_labels, dev_preds, digits=4))

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0149,0.013771,0.803215,0.807121
2,0.0105,0.013932,0.809048,0.811509
3,0.0077,0.018103,0.80293,0.80459
4,0.0054,0.026158,0.799684,0.801721


Final Dev Results: {'eval_loss': 0.013931580819189548, 'eval_f1': 0.8090475209245707, 'eval_accuracy': 0.8115086061424233, 'eval_runtime': 6.5113, 'eval_samples_per_second': 910.116, 'eval_steps_per_second': 113.803, 'epoch': 4.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.8589    0.8847    0.8716      4286
           1     0.6731    0.6201    0.6455      1640

    accuracy                         0.8115      5926
   macro avg     0.7660    0.7524    0.7586      5926
weighted avg     0.8075    0.8115    0.8090      5926



In [13]:
# ----------------------------------------------------------------------------
# 10. Inference on Test Set
# ----------------------------------------------------------------------------
# Predict on the encoded test split and take the arg-max class per row.
test_predictions = trainer.predict(encoded_dataset["test"])
test_preds = np.argmax(test_predictions.predictions, axis=1)
# NOTE: this writes the predictions into test_df["label"] in place; if the
# loaded test file already has a "label" column, it is overwritten.
test_df["label"] = test_preds
test_df.head()

Unnamed: 0,claim,evidence,label
0,We should introduce school vouchers,"Among the many educational reform efforts, suc...",0
1,We should legalize insider trading,The U.S. Securities and Exchange Commission wa...,0
2,We should subsidize investigative journalism,"The film won an Emmy Award (1980), George Polk...",0
3,We should further exploit nuclear power,a 2001 survey by the European Commission found...,1
4,We should ban whaling,The US and several other nations are whaling u...,0


In [14]:
# ----------------------------------------------------------------------------
# 10.1 Save Predictions
# ----------------------------------------------------------------------------
# Write the test frame (including the predicted "label" column) to
# OUTPUT_PATH, without the DataFrame index.
test_df.to_csv(OUTPUT_PATH, index=False)
if TESTING_FLAG:
    print(f"Test predictions saved to: {OUTPUT_PATH}")

Test predictions saved to: test_predictions.csv
