In [1]:
import os
import sys
import json
import torch

sys.path.append("../")
from lib.utils import get_device
from lib.utils.constants import Subtask, Track, PreprocessTextLevel, DatasetType
from lib.data.loading import load_train_dev_test_df
from lib.data.tokenizer import get_tokenizer
from lib.training.optimizer import get_optimizer, get_scheduler
from lib.training.loss import get_loss_fn
from lib.training.metric import get_metric

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
CONFIG_FILE_PATH = os.path.relpath("../config.json")

config = {}
with open(CONFIG_FILE_PATH, "r") as config_file:
    config = json.load(config_file)

DEVICE = get_device()
print(f"Using device: {DEVICE}")

results_dir = os.path.relpath("../runs/SubtaskC")
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

print(f"Will save results to: {results_dir}")

Using device: mps
Will save results to: ../runs/SubtaskC


In [14]:
config

{'task': 'SubtaskC',
 'submission_format': 'csv',
 'model': 'longformer_crf',
 'tokenizer': {'model_name': 'longformer',
  'pretrained_name': 'allenai/longformer-base-4096'},
 'data': {'dataset_type': 'longformer_dataset',
  'dataset_type_settings': {},
  'data_dir': './data/original_data',
  'label_column': 'label',
  'max_len': 1024,
  'batch_size': 16,
  'test_size': 0.2,
  'preprocess_text_level': 0},
 'model_config': {'pretrained_model_name': 'allenai/longformer-base-4096',
  'out_size': 2,
  'dropout_p': 0.2},
 'training': {'num_epochs': 5,
  'num_epochs_before_finetune': 0,
  'optimizer': {'AdamW': {'freeze_lr': 0.001, 'finetune_lr': 2e-05}},
  'scheduler': {'num_warmup_steps': 50},
  'early_stopping': {'patience': 1, 'delta': 0.001},
  'loss': 'cross_entropy',
  'metric': 'mae'}}

In [15]:
task = None
if "task" in config:
    task = Subtask(config["task"])
else:
    raise ValueError("Task not specified in config")

track = None
if "track" in config:
    track = Track(config["track"])
else:
    print(f"Warning: Track not specified in config for subtask: {task}")

dataset_type = DatasetType.TransformerTruncationDataset
if "dataset_type" in config["data"]:
    dataset_type = DatasetType(config["data"]["dataset_type"])

dataset_type_settings = None
if "dataset_type_settings" in config["data"]:
    dataset_type_settings = config["data"]["dataset_type_settings"]

df_train, df_dev, df_test = load_train_dev_test_df(
    task=task,
    track=track,
    data_dir=f"../{config['data']['data_dir']}",
    label_column=config["data"]["label_column"],
    test_size=config["data"]["test_size"],
    preprocess_text_level=PreprocessTextLevel(
        config["data"]["preprocess_text_level"]
    ),
)

print(f"df_train.shape: {df_train.shape}")
print(f"df_dev.shape: {df_dev.shape}")
print(f"df_test.shape: {df_test.shape}")

Loading train data...
Train/dev split... (df_train.shape: (3649, 3))
Loading test data....././data/original_data/SubtaskC/SubtaskC_dev.jsonl
df_train.shape: (2919, 3)
df_dev.shape: (730, 3)
df_test.shape: (505, 3)


# Build the dataset

In [16]:
import numpy as np
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer


class TokenClassificationDataset(Dataset):
    def __init__(
        self,
        ids: np.ndarray,
        texts: np.ndarray,
        targets: np.ndarray | None,
        tokenizer: PreTrainedTokenizer | None,
        max_len: int,
        debug: bool = False,
    ):
        super().__init__()

        if tokenizer is None:
            raise ValueError("Tokenizer cannot be None")

        self.ids = ids
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.debug = debug

    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, index):
        item_id = self.ids[index]
        text = self.texts[index]
        target = -1 if self.targets is None else self.targets[index]
        targets_available = False if target == -1 else True

        text = text.replace("\n", " ")
        text = text.replace("\t", " ")
        text = text.replace("\r", " ")

        words = [w for w in text.split(" ") if w != ""]

        if self.debug:
            print(f"Text: {text}")
            print(f"Words: {words}")
            print(f"Machine text start position: {target}")
            print()

        targets = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        for idx, word in enumerate(words):
            word_encoded = self.tokenizer.tokenize(word)  # No [CLS] or [SEP]
            sub_words = len(word_encoded)

            if targets_available:
                is_machine_text = 1 if idx >= target else 0
                targets.extend([is_machine_text] * sub_words)

            corresponding_word.extend([idx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

            if self.debug:
                print(
                    f"word[{idx}]:\n"
                    f"{'':-<5}> tokens: {word_encoded} (no. of subwords: {sub_words})\n"
                    f"{'':-<5}> corresponding_word: {corresponding_word[-sub_words:]}\n"
                    f"{'':-<5}> input_ids: {input_ids[-sub_words:]}\n"
                    f"{'':-<5}> is_machine_text: {is_machine_text}"
                )

        if self.debug:
            print()

            print(f"corresponding_word: {corresponding_word}")
            print(f"tokens: {tokens}")
            print(f"input_ids: {input_ids}")
            print(f"attention_mask: {attention_mask}")

            print()

            print(f"Machine text start word: {words[corresponding_word[targets.index(1)]]}")
            print(f"True machine text start word: {words[target]}")

            print()

        if len(input_ids) < self.max_len - 2:
            if targets_available:
                targets = (
                    [-100]
                    + targets
                    + [-100] * (self.max_len - len(input_ids) - 1)
                )

            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_len - len(input_ids) - 1)
            )
            tokens = (
                [self.tokenizer.bos_token]
                + tokens
                + [self.tokenizer.eos_token]
                + [self.tokenizer.pad_token] * (self.max_len - len(tokens) - 2)
            )
            input_ids = (
                [self.tokenizer.bos_token_id]
                + input_ids
                + [self.tokenizer.eos_token_id]
                + [self.tokenizer.pad_token_id] * (self.max_len - len(input_ids) - 2)
            )
            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_len - len(attention_mask) - 2)
            )
        else:
            if targets_available:
                targets = [-100] + targets[: self.max_len - 2] + [-100]

            corresponding_word = (
                [-100]
                + corresponding_word[: self.max_len - 2]
                + [-100]
            )
            tokens = (
                [self.tokenizer.bos_token]
                + tokens[: self.max_len - 2]
                + [self.tokenizer.eos_token]
            )
            input_ids = (
                [self.tokenizer.bos_token_id]
                + input_ids[: self.max_len - 2]
                + [self.tokenizer.eos_token_id]
            )
            attention_mask = (
                [1]
                + attention_mask[: self.max_len - 2]
                + [1]
            )

        encoded = {}
        encoded["id"] = item_id
        encoded["text"] = text
        encoded["true_target"] = torch.tensor(target)
        encoded["corresponding_word"] = torch.tensor(corresponding_word)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)
        if targets_available:
            encoded["target"] = torch.tensor(targets)

        if self.debug:
            print(f"Tokenized human position: {targets.index(1)}")
            print(f"Original human position: {target}")
            print(f"Full human text: {text}\n\n")
            print(f"Human truncated text: {[w for w in text.split(' ')[:target] if w != '']}\n\n")

            encoded["partial_human_review"] = " ".join(
                [w for w in text.split(' ')[:target] if w != '']
            )

        return encoded

In [17]:
from torch.utils.data import DataLoader

tokenizer = get_tokenizer(**config["tokenizer"])

train_dataset = TokenClassificationDataset(
    ids=df_train["id"].values,  # [:10],
    texts=df_train["text"].values,  # [:10],
    targets=df_train["label"].values,  # [:10],
    tokenizer=tokenizer,
    max_len=config["data"]["max_len"],
    debug=False,
)
dev_dataset = TokenClassificationDataset(
    ids=df_dev["id"].values,  # [:10],
    texts=df_dev["text"].values,  # [:10],
    targets=df_dev["label"].values,  # [:10],
    tokenizer=tokenizer,
    max_len=config["data"]["max_len"],
    debug=False,
)
test_dataset = TokenClassificationDataset(
    ids=df_test["id"].values,  # [:10],
    texts=df_test["text"].values,  # [:10],
    targets=df_test["label"].values,  # [:10],
    tokenizer=tokenizer,
    max_len=config["data"]["max_len"],
    debug=False,
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=config["data"]["batch_size"],
    shuffle=True,
)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=config["data"]["batch_size"],
    shuffle=False,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=config["data"]["batch_size"],
    shuffle=False,
)

In [166]:
# for i, batch in enumerate(train_dataloader):
#     print(f"Batch=[{i + 1}/{len(train_dataloader)}]")
#     # break

# for i, batch in enumerate(dev_dataloader):
#     print(f"Batch=[{i + 1}/{len(dev_dataloader)}]")
#     # break

# Create Longformer CRF model for token classification

In [18]:
import torch.nn as nn
from torchcrf import CRF
from transformers import LongformerModel


class LongformerCRFForTokenClassification(nn.Module):
    def __init__(self, pretrained_model_name, out_size, device, dropout_p=0.3):
        super().__init__()

        self.out_size = out_size
        self.device = device

        self.longformer = LongformerModel.from_pretrained(
            pretrained_model_name, return_dict=False
        )
        self.dropout = nn.Dropout(p=dropout_p)
        self.classifier = nn.Linear(self.longformer.config.hidden_size, out_size)

        self.crf = CRF(num_tags=out_size, batch_first=True)

        self.freeze_transformer_layer()

    def forward(self, input_ids, attention_mask, labels=None):
        sequence_output, _ = self.longformer(
            input_ids=input_ids, attention_mask=attention_mask
        )

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            log_likelihood = self.crf(logits, labels)
            logits = self.crf.decode(logits)

            loss = 0 - log_likelihood
        else:
            logits = self.crf.decode(logits)
        logits = torch.Tensor(logits).to(self.device)

        return loss, logits

    def freeze_transformer_layer(self):
        for param in self.longformer.parameters():
            param.requires_grad = False

    def unfreeze_transformer_layer(self):
        # Fine-tune only the last 4 layer
        for layer in self.longformer.encoder.layer[-4:]:
            for param in layer.parameters():
                param.requires_grad = True

    def get_predictions_from_logits(self, logits, labels=None, corresponding_word=None):
        # logits: (batch_size, max_seq_len)
        # labels: (batch_size, max_seq_len)
        # corresponding_word: (batch_size, max_seq_len)

        # print(f"logits.shape: {logits.shape}")
        # print(f"logits: {logits}")

        # preds: (batch_size, max_seq_len)
        # preds = torch.argmax(logits, dim=-1)
        preds = logits.clone()

        # print(f"preds.shape: {preds.shape}")
        # print(f"preds: {preds}")

        if labels is not None:
            # print(f"labels.shape: {labels.shape}")
            # print(f"labels: {labels}")

            # Keep only predictions where labels are not -100
            # clean_preds = preds[labels != -100].reshape(batch_size, -1)
            # clean_labels = labels[labels != -100].reshape(batch_size, -1)

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_labels.shape: {clean_labels.shape}")
            # print(f"clean_labels: {clean_labels}")

            # Get the index of the first machine text word
            # predicted_positions = clean_preds.argmax(dim=-1)
            # true_positions = clean_labels.argmax(dim=-1)

            predicted_positions = []
            true_positions = []
            for p, l in zip(preds, labels):
                mask = l != -100

                clean_pred = p[mask]
                clean_label = l[mask]

                # print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_label.shape: {clean_label.shape}")
                # print(f"clean_label: {clean_label}")

                predicted_position = clean_pred.argmax(dim=-1)
                true_position = clean_label.argmax(dim=-1)

                # print(f"predicted_position: {predicted_position}")
                # print(f"true_position: {true_position}")

                predicted_positions.append(predicted_position.item())
                true_positions.append(true_position.item())

            # print(f"predicted_positions.shape: {predicted_positions.shape}")
            # print(f"predicted_positions: {predicted_positions}")

            # print(f"true_positions.shape: {true_positions.shape}")
            # print(f"true_positions: {true_positions}")

            # print(f"predicted_positions type: {type(predicted_positions)}")
            # print(f"true_positions type: {type(true_positions)}")

            return torch.Tensor(predicted_positions), torch.Tensor(true_positions)
        elif corresponding_word is not None:
            # print(f"corresponding_word.shape: {corresponding_word.shape}")
            # print(f"corresponding_word: {corresponding_word}")

            # Keep only predictions where corresponding_word are not -100
            # clean_preds = preds[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()
            # clean_corresponding_word = corresponding_word[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
            # print(f"clean_corresponding_word: {clean_corresponding_word}")

            predicted_positions = []
            for p, w in zip(preds, corresponding_word):
                mask = w != -100

                clean_pred = p[mask]
                clean_corresponding_word = w[mask]

                # print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
                # print(f"clean_corresponding_word: {clean_corresponding_word}")

                # Get the index of the first machine text word
                index = torch.where(clean_pred == 1)[0]
                value = index[0] if index.size else len(clean_pred) - 1
                position = clean_corresponding_word[value]

                # print(f"index: {index}")
                # print(f"value: {value}")
                # print(f"position: {position}")

                predicted_positions.append(position.item())
            #     # pred = pred.detach().cpu().numpy()

            #     index = np.where(pred == 1)[0]
            #     value = index[0] if index.size else len(pred) - 1
            #     position = clean_corresponding_word[idx][value]

            #     predicted_positions.append(position.item())

            print(f"predicted_positions: {predicted_positions}")

            return predicted_positions, None
        else:
            raise ValueError("Either labels or corresponding_word must be provided")

# Train model

In [19]:
import pandas as pd
from tqdm import tqdm
# from time import time
from collections import defaultdict


def train_epoch(
    model,
    dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    metric_fn,
    print_freq=10,
):
    model.train()

    losses = []

    all_predictions = []
    all_true = []
    all_ids = []

    for i, batch in enumerate(dataloader):
        ids = batch["id"]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["target"].to(device)
        corresponding_word = batch["corresponding_word"].to(device)

        loss, logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets,
        )

        predictions, true_predictions = model.get_predictions_from_logits(
            logits=logits,
            labels=targets,
            corresponding_word=corresponding_word
        )

        # print(f"predictions: {predictions}")
        # print(f"true_predictions: {true_predictions}")

        losses.append(loss.item())

        all_predictions.extend(predictions.tolist())
        all_true.extend(true_predictions.tolist())
        all_ids.extend(ids)

        if i % print_freq == 0:
            print(
                f"Batch [{i + 1}/{len(dataloader)}]; "
                f"Loss: {loss.item():.5f}; "
                f"Mean absolute error: {metric_fn(true_predictions, predictions):.5f}"
            )

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses), (all_ids, all_true, all_predictions)


def validation_epoch(
    model,
    dataloader,
    loss_fn,
    device,
    metric_fn,
):
    model.eval()

    losses = []
    all_predictions = []
    all_true = []
    all_ids = []

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            ids = batch["id"]
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["target"].to(device)
            corresponding_word = batch["corresponding_word"].to(device)

            loss, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets,
            )

            predictions, true_predictions = model.get_predictions_from_logits(
                logits=logits,
                labels=targets,
                corresponding_word=corresponding_word
            )

            losses.append(loss.item())

            all_predictions.extend(predictions.tolist())
            all_true.extend(true_predictions.tolist())
            all_ids.extend(ids)

    return np.mean(losses), (all_ids, all_true, all_predictions)


def training_loop(
    model,
    num_epochs,
    train_dataloader,
    dev_dataloader,
    loss_fn,
    optimizer_config,
    scheduler_config,
    device,
    metric_fn,
    is_better_metric_fn,
    num_epochs_before_finetune,
    results_dir,
):
    history = defaultdict(list)
    best_metric = None
    best_model_state = None

    optimizer = get_optimizer(model, optimizer_config, finetune=False)
    scheduler = None

    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}/{num_epochs}")
        if epoch <= num_epochs_before_finetune:
            print("Freeze transformer")
        else:
            print("Finetune transformer")
        print("-" * 10)

        if epoch == num_epochs_before_finetune + 1:
            model.unfreeze_transformer_layer()
            optimizer = get_optimizer(model, optimizer_config, finetune=True)
            scheduler = get_scheduler(
                optimizer,
                num_training_steps=len(train_dataloader) * num_epochs,
                **scheduler_config,
            )

        train_loss, (train_ids, train_true, train_predict) = train_epoch(
            model,
            train_dataloader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            metric_fn,
        )

        train_metric = metric_fn(train_true, train_predict)

        print(f"Train Loss: {train_loss:.5f}; Train Metric: {train_metric:.5f}")

        dev_loss, (dev_ids, dev_true, dev_predict) = validation_epoch(
            model,
            dev_dataloader,
            loss_fn,
            device,
            metric_fn,
        )

        dev_metric = metric_fn(dev_true, dev_predict)

        print(
            f"Validation Loss: {dev_loss:.5f}; "
            f"Validation Metric: {dev_metric:.5f}"
        )

        history["train_metric"].append(train_metric)
        history["train_loss"].append(train_loss)
        history["dev_metric"].append(dev_metric)
        history["dev_loss"].append(dev_loss)

        if best_metric is None or is_better_metric_fn(train_metric, best_metric):
            best_metric = train_metric
            best_model_state = model.state_dict()
            
            if results_dir is not None:
                torch.save(
                    best_model_state,
                    os.path.join(results_dir, "best_model.bin"),
                )

                df_train_predictions = pd.DataFrame(
                    {
                        "id": train_ids,
                        "true": train_true,
                        "predict": train_predict,
                    }
                )
                df_train_predictions.to_csv(
                    os.path.join(results_dir, "best_model_train_predict.csv"),
                    index=False
                )

                df_dev_predictions = pd.DataFrame(
                    {
                        "id": dev_ids,
                        "true": dev_true,
                        "predict": dev_predict,
                    }
                )
                df_dev_predictions.to_csv(
                    os.path.join(results_dir, "best_model_dev_predict.csv"),
                    index=False
                )

    df_history = pd.DataFrame(history)
    if results_dir is not None:
        df_history.to_csv(os.path.join(results_dir, "history.csv"), index=False)

        model.load_state_dict(torch.load(os.path.join(results_dir, "best_model.bin")))
    else:
        model.load_state_dict(best_model_state)

    return model, df_history

In [20]:
num_epochs = config["training"]["num_epochs"]
model = LongformerCRFForTokenClassification(
    device=DEVICE, **config["model_config"]
).to(DEVICE)
loss_fn = get_loss_fn(config["training"]["loss"], DEVICE)
optimizer_config = config["training"]["optimizer"]
scheduler_config = config["training"]["scheduler"]
metric_fn, is_better_metric_fn = get_metric(config["training"]["metric"])
num_epochs_before_finetune = config["training"]["num_epochs_before_finetune"]

best_model, df_history = training_loop(
    model,
    num_epochs,
    train_dataloader,
    dev_dataloader,
    loss_fn,
    optimizer_config,
    scheduler_config,
    DEVICE,
    metric_fn,
    is_better_metric_fn,
    num_epochs_before_finetune,
    results_dir,
)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/5
Finetune transformer
----------
Batch [1/183]; Loss: nan; Mean absolute error: 80.87500
Batch [11/183]; Loss: 9766.71289; Mean absolute error: 143.00000
Batch [21/183]; Loss: nan; Mean absolute error: 107.81250
Batch [31/183]; Loss: nan; Mean absolute error: 129.00000
Batch [41/183]; Loss: 8029.16113; Mean absolute error: 72.50000
Batch [51/183]; Loss: nan; Mean absolute error: 86.31250
Batch [61/183]; Loss: nan; Mean absolute error: 107.31250
Batch [71/183]; Loss: 6612.65039; Mean absolute error: 134.31250
Batch [81/183]; Loss: nan; Mean absolute error: 95.43750
Batch [91/183]; Loss: nan; Mean absolute error: 54.75000
Batch [101/183]; Loss: 5013.09814; Mean absolute error: 34.81250
Batch [111/183]; Loss: nan; Mean absolute error: 32.75000
Batch [121/183]; Loss: nan; Mean absolute error: 54.00000
Batch [131/183]; Loss: 7236.47559; Mean absolute error: 52.18750
Batch [141/183]; Loss: nan; Mean absolute error: 34.93750
Batch [151/183]; Loss: nan; Mean absolute error: 69.18750
B

100%|██████████| 46/46 [04:58<00:00,  6.50s/it]


Validation Loss: nan; Validation Metric: 64.01507
Epoch 2/5
Finetune transformer
----------
Batch [1/183]; Loss: nan; Mean absolute error: 37.50000
Batch [11/183]; Loss: 5534.94580; Mean absolute error: 35.50000
Batch [21/183]; Loss: nan; Mean absolute error: 54.00000
Batch [31/183]; Loss: nan; Mean absolute error: 27.43750
Batch [41/183]; Loss: 4961.21582; Mean absolute error: 26.12500
Batch [51/183]; Loss: nan; Mean absolute error: 14.37500
Batch [61/183]; Loss: nan; Mean absolute error: 66.25000
Batch [71/183]; Loss: 3124.94531; Mean absolute error: 45.00000
Batch [81/183]; Loss: nan; Mean absolute error: 25.25000
Batch [91/183]; Loss: nan; Mean absolute error: 19.43750
Batch [101/183]; Loss: 5074.36523; Mean absolute error: 23.50000
Batch [111/183]; Loss: nan; Mean absolute error: 21.81250
Batch [121/183]; Loss: nan; Mean absolute error: 32.37500
Batch [131/183]; Loss: 3878.21191; Mean absolute error: 32.56250
Batch [141/183]; Loss: nan; Mean absolute error: 7.75000
Batch [151/183]

100%|██████████| 46/46 [04:51<00:00,  6.35s/it]


Validation Loss: nan; Validation Metric: 45.69178
Epoch 3/5
Finetune transformer
----------
Batch [1/183]; Loss: nan; Mean absolute error: 34.06250
Batch [11/183]; Loss: 6530.47705; Mean absolute error: 22.43750
Batch [21/183]; Loss: nan; Mean absolute error: 16.62500
Batch [31/183]; Loss: nan; Mean absolute error: 34.18750
Batch [41/183]; Loss: 5435.17969; Mean absolute error: 27.25000
Batch [51/183]; Loss: nan; Mean absolute error: 25.75000
Batch [61/183]; Loss: nan; Mean absolute error: 26.50000
Batch [71/183]; Loss: 4317.05420; Mean absolute error: 10.50000
Batch [81/183]; Loss: nan; Mean absolute error: 28.68750
Batch [91/183]; Loss: nan; Mean absolute error: 22.12500
Batch [101/183]; Loss: 5348.78711; Mean absolute error: 30.37500
Batch [111/183]; Loss: nan; Mean absolute error: 18.37500
Batch [121/183]; Loss: nan; Mean absolute error: 37.18750
Batch [131/183]; Loss: 5803.58398; Mean absolute error: 13.31250
Batch [141/183]; Loss: nan; Mean absolute error: 20.31250
Batch [151/183

100%|██████████| 46/46 [04:54<00:00,  6.40s/it]


Validation Loss: nan; Validation Metric: 43.26438
Epoch 4/5
Finetune transformer
----------
Batch [1/183]; Loss: nan; Mean absolute error: 23.31250
Batch [11/183]; Loss: 2062.53223; Mean absolute error: 25.18750
Batch [21/183]; Loss: nan; Mean absolute error: 44.43750
Batch [31/183]; Loss: nan; Mean absolute error: 17.75000
Batch [41/183]; Loss: 2100.91797; Mean absolute error: 25.93750
Batch [51/183]; Loss: nan; Mean absolute error: 19.18750
Batch [61/183]; Loss: nan; Mean absolute error: 44.62500
Batch [71/183]; Loss: 4257.60352; Mean absolute error: 27.50000
Batch [81/183]; Loss: nan; Mean absolute error: 29.50000
Batch [91/183]; Loss: nan; Mean absolute error: 43.56250
Batch [101/183]; Loss: 2088.25195; Mean absolute error: 27.56250
Batch [111/183]; Loss: nan; Mean absolute error: 18.56250
Batch [121/183]; Loss: nan; Mean absolute error: 22.00000
Batch [131/183]; Loss: 4261.31006; Mean absolute error: 28.06250
Batch [141/183]; Loss: nan; Mean absolute error: 37.75000
Batch [151/183

100%|██████████| 46/46 [04:59<00:00,  6.50s/it]


Validation Loss: nan; Validation Metric: 38.36575
Epoch 5/5
Finetune transformer
----------
Batch [1/183]; Loss: nan; Mean absolute error: 24.93750
Batch [11/183]; Loss: 2879.02881; Mean absolute error: 34.31250
Batch [21/183]; Loss: nan; Mean absolute error: 18.31250
Batch [31/183]; Loss: nan; Mean absolute error: 30.68750
Batch [41/183]; Loss: 4056.57812; Mean absolute error: 25.56250
Batch [51/183]; Loss: nan; Mean absolute error: 22.81250
Batch [61/183]; Loss: nan; Mean absolute error: 31.68750
Batch [71/183]; Loss: 2102.71484; Mean absolute error: 23.62500
Batch [81/183]; Loss: nan; Mean absolute error: 23.93750
Batch [91/183]; Loss: nan; Mean absolute error: 20.37500
Batch [101/183]; Loss: 2399.46436; Mean absolute error: 28.31250
Batch [111/183]; Loss: nan; Mean absolute error: 70.68750
Batch [121/183]; Loss: nan; Mean absolute error: 18.87500
Batch [131/183]; Loss: 4223.10938; Mean absolute error: 68.12500
Batch [141/183]; Loss: nan; Mean absolute error: 20.18750
Batch [151/183

100%|██████████| 46/46 [04:55<00:00,  6.41s/it]


Validation Loss: nan; Validation Metric: 38.70274


# Make predictions

In [21]:
import pandas as pd


def make_predictions(
    model,
    dataloader,
    device,
    results_dir,
    label_column,
    file_format="csv",
):
    model.eval()

    all_predictions = []
    all_true = []
    all_ids = []

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            ids = batch["id"]
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["target"].to(device)
            corresponding_word = batch["corresponding_word"].to(device)

            _, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets,
            )

            predictions, true_predictions = model.get_predictions_from_logits(
                logits=logits,
                labels=targets,
                corresponding_word=corresponding_word
            )

            all_predictions.extend(predictions.tolist())
            all_true.extend(true_predictions.tolist())
            all_ids.extend(ids)

    df_predictions = pd.DataFrame(
        {
            "id": all_ids,
            "true": all_true,
            label_column: all_predictions,
        }
    )

    if results_dir is not None:
        if file_format == "csv":
            df_predictions.to_csv(
                os.path.join(results_dir, "submission.csv"),
                index=False,
            )
        elif file_format == "jsonl":
            df_predictions.to_json(
                os.path.join(results_dir, "submission.jsonl"),
                orient="records",
                lines=True,
            )
        else:
            raise ValueError(f"Unknown file format: {file_format}")
    else:
        print("Missing results_dir, not saving predictions to file!")

    return df_predictions

In [22]:
predictions = make_predictions(
    best_model,
    test_dataloader,
    DEVICE,
    results_dir,
    config["data"]["label_column"],
    file_format="csv",
)

100%|██████████| 32/32 [03:22<00:00,  6.33s/it]


In [23]:
!python ../scores_and_plots.py --results-dir "../runs/SubtaskC"

Results on validation
MAE: 38.70274
--------------------
Results on test
MAE: 32.22178
--------------------
