In [1]:
import os
import sys
import json
import torch

sys.path.append("../")
from lib.utils import get_device, get_current_date
from lib.utils.constants import Subtask, Track, PreprocessTextLevel, DatasetType
from lib.utils.models import sequential_fully_connected
from lib.data.loading import load_train_dev_test_df, build_data_loader
from lib.data.tokenizer import get_tokenizer
from lib.data.vocabulary import get_vocabulary, WordVocabulary, CharacterVocabulary
from lib.training.optimizer import get_optimizer, get_scheduler
from lib.training.loss import get_loss_fn
from lib.training.metric import get_metric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
CONFIG_FILE_PATH = os.path.relpath("../config.json")

config = {}
with open(CONFIG_FILE_PATH, "r") as config_file:
    config = json.load(config_file)

DEVICE = get_device()
print(f"Using device: {DEVICE}")

Using device: mps


In [None]:
# config

In [3]:
task = None
if "task" in config:
    task = Subtask(config["task"])
else:
    raise ValueError("Task not specified in config")

track = None
if "track" in config:
    track = Track(config["track"])
else:
    print(f"Warning: Track not specified in config for subtask: {task}")

dataset_type = DatasetType.TransformerTruncationDataset
if "dataset_type" in config["data"]:
    dataset_type = DatasetType(config["data"]["dataset_type"])

dataset_type_settings = None
if "dataset_type_settings" in config["data"]:
    dataset_type_settings = config["data"]["dataset_type_settings"]

test_size = (
    None if "test_size" not in config["data"] else config["data"]["test_size"]
)
df_train, df_dev, df_test = load_train_dev_test_df(
    task=task,
    track=track,
    data_dir=f"../{config['data']['data_dir']}",
    label_column=config["data"]["label_column"],
    test_size=test_size,
    preprocess_text_level=PreprocessTextLevel(
        config["data"]["preprocess_text_level"]
    ),
)

print(f"df_train.shape: {df_train.shape}")
print(f"df_dev.shape: {df_dev.shape}")
print(f"df_test.shape: {df_test.shape}")

Loading train data...
.././data/original_data/SubtaskC/SubtaskC_train.jsonl
df_train.shape: (3649, 3)
df_dev.shape: (505, 3)
df_test.shape: (11123, 2)


In [4]:
DEBUG = False
if DEBUG:
    results_dir = os.path.relpath("../runs/SubtaskC/")
else:
    results_dir = os.path.relpath(
        f"../runs/{get_current_date()}-{task.value}-{config['model']}"
    )
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

print(f"Will save results to: {results_dir}")

with open(results_dir + "/config.json", "w") as f:
    json.dump(config, f, indent=4)

Will save results to: ../runs/20-01-2024_09:30:34-SubtaskC-cnn_bilstm_with_crf_for_token_classification


In [5]:
tokenizer = get_tokenizer(**config["tokenizer"])

In [6]:
char_vocabulary, word_vocabulary = None, None
char_max_len, word_max_len = None, config["data"]["max_len"]
if dataset_type == DatasetType.TokenClassificationDataset:
    if dataset_type_settings is not None:
        if "chars" in dataset_type_settings:
            char_vocabulary = get_vocabulary("chars")
            char_vocabulary.build_vocabulary(df_train)

            char_max_len = dataset_type_settings["chars"]["max_len"]

        if "words" in dataset_type_settings:
            word_vocabulary = get_vocabulary("words")
            word_vocabulary.build_vocabulary(df_train)

            word_max_len = dataset_type_settings["words"]["max_len"]
    else:
        word_vocabulary = get_vocabulary("words")
        word_vocabulary.build_vocabulary(df_train)

        word_max_len = config["data"]["max_len"]

if "vocab_size" in config["model_config"]:
    config["model_config"]["vocab_size"] = word_vocabulary.vocab_size()
if "char_vocab_size" in config["model_config"]:
    config["model_config"]["char_vocab_size"] = char_vocabulary.vocab_size()
if "word_vocab_size" in config["model_config"]:
    config["model_config"]["word_vocab_size"] = word_vocabulary.vocab_size()

if "char_max_len" in config["model_config"]:
    config["model_config"]["char_max_len"] = char_max_len

# Save vocab size to config
with open(results_dir + "/config.json", "w") as f:
    json.dump(config, f, indent=4)

Building vocabulary: 100%|██████████| 3649/3649 [00:00<00:00, 41058.63it/s]
Building word vocabulary: 100%|██████████| 3649/3649 [00:00<00:00, 50081.69it/s]


In [None]:
test_size

In [7]:
train_dataloader = build_data_loader(
    df_train,
    tokenizer,
    max_len=word_max_len,
    batch_size=config["data"]["batch_size"],
    label_column=config["data"]["label_column"],
    shuffle=True,
    dataset_type=dataset_type,
    dataset_type_settings=dataset_type_settings,
    char_vocabulary=char_vocabulary,
    char_max_len=char_max_len,
    word_vocabulary=word_vocabulary,
    device=DEVICE,
)
dev_dataloader = build_data_loader(
    df_dev,
    tokenizer,
    max_len=word_max_len,
    batch_size=config["data"]["batch_size"],
    label_column=config["data"]["label_column"],
    dataset_type=dataset_type,
    dataset_type_settings=dataset_type_settings,
    char_vocabulary=char_vocabulary,
    char_max_len=char_max_len,
    word_vocabulary=word_vocabulary,
    device=DEVICE,
)
test_dataloader = build_data_loader(
    df_test,
    tokenizer,
    max_len=word_max_len,
    batch_size=config["data"]["batch_size"],
    label_column=config["data"]["label_column"],
    has_targets=False if test_size is None else True,
    dataset_type=dataset_type,
    dataset_type_settings=dataset_type_settings,
    char_vocabulary=char_vocabulary,
    char_max_len=char_max_len,
    word_vocabulary=word_vocabulary,
    device=DEVICE,
)

In [None]:
char_vocabulary.idx2char[3]

In [None]:
word_vocabulary.idx2word[3]

In [None]:
for i, batch in enumerate(train_dataloader):
    print(f"Batch=[{i + 1}/{len(train_dataloader)}]")
    print(f"batch['input_ids'].shape: {batch['input_ids'].shape}")
    print(f"batch['char_input_ids'].shape: {batch['char_input_ids'].shape}")
    print(f"batch['attention_mask'].shape: {batch['attention_mask'].shape}")
    print(f"batch['char_attention_mask'].shape: {batch['char_attention_mask'].shape}")
    print(f"batch['target'].shape: {batch['target'].shape}")
    print(f"batch['target']: {batch['target']}")
    print(f"batch['corresponding_word']: {batch['corresponding_word']}")
    break

# for i, batch in enumerate(dev_dataloader):
#     print(f"Batch=[{i + 1}/{len(dev_dataloader)}]")
# #     # break

In [None]:
# vocabulary

# Load Pretrained FastText Vectors

In [8]:
import numpy as np
from tqdm import tqdm


def load_pretrained_vectors(word2idx, pretrained_vectors_path, pad_token="<pad>"):
    fin = open(pretrained_vectors_path, "r", encoding="utf-8", newline="\n", errors="ignore")
    n, d = map(int, fin.readline().split())

    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx[pad_token]] = np.zeros((d,))

    count = 0
    for line in tqdm(fin):
        tokens = line.rstrip().split(" ")
        word = tokens[0]
        if word in word2idx:
            count += 1
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    coverage = count / len(word2idx) * 100
    print(f"There are {count}/{len(word2idx)} pretraied vectors found ({coverage:.4f}%)")

    return embeddings

In [9]:
embeddings = load_pretrained_vectors(
    word_vocabulary.word2idx,
    "../data/fasttext/crawl-300d-2M.vec",
    pad_token=word_vocabulary.padding_token,
)
embeddings = torch.Tensor(embeddings)

1999995it [00:17, 114150.22it/s]

There are 12694/29594 pretraied vectors found (42.8938%)





In [None]:
word_vocabulary.padding_token_idx

# Create CNN model for token classification

In [10]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import LongformerModel

try:
    from torchcrf import CRF
except ImportError:
    print(f"Warning: CRF module not found. Install it with pip install torchcrf")

# TODO: Implement this: https://medium.com/illuin/named-entity-recognition-with-bilstm-cnns-632ba83d3d41
# Use the CNN as feature extractor for the character level
# Use FastText or something similar for the word level embeddings
# Combine them with BiLSTM

class CharacterLevelCNNEmbedding(nn.Module):
    def __init__(
        self,
        vocab_size,
        max_len,
        embedding_dim,
        filter_size=3,
        num_filters=30,
        dropout_p=0.5,
    ):
        super(CharacterLevelCNNEmbedding, self).__init__()

        self.vocab_size = vocab_size
        # self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.filter_size = filter_size
        self.num_filters = num_filters

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=1,
        )

        self.conv = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=filter_size,
            padding=(filter_size - 1) // 2,
        )

        self.max_pool_1d = nn.MaxPool1d(kernel_size=max_len)
        self.dropout = nn.Dropout(dropout_p)

        self._init_embedding_weights()

    def forward(self, input_ids, attention_mask, labels=None):
        # input_ids: (batch_size, max_seq_len, max_char_len)
        # print(f"input_ids.shape: {input_ids.shape}")

        outputs = []
        for i, x in enumerate(input_ids):
            # print(f"Token {i + 1}/{input_ids.shape[0]}")
            # x.shape: (max_seq_len, max_char_len)

            # embeddings.shape: (max_seq_len, max_char_len, embedding_dim)
            embeddings = self.embedding(x)
            # print(f"embeddings.shape: {embeddings.shape}")

            embeddings = self.dropout(embeddings)
            # print(f"embeddings.shape: {embeddings.shape}")

            # embeddings.shape: (max_seq_len, embedding_dim, max_char_len)
            embeddings = embeddings.permute(0, 2, 1)
            # print(f"embeddings.shape: {embeddings.shape}")

            # conv_out.shape: (max_seq_len, num_filters, max_char_len)
            conv_out = F.tanh(self.conv(embeddings))
            # print(f"conv_out.shape: {conv_out.shape}")

            # max_pool_out.shape: (max_seq_len, num_filters, 1)
            max_pool_out = self.max_pool_1d(conv_out)
            # print(f"max_pool_out.shape: {max_pool_out.shape}")

            # max_pool_out.shape: (max_seq_len, num_filters)
            max_pool_out = max_pool_out.view(max_pool_out.size(0), -1)
            # print(f"max_pool_out.shape: {max_pool_out.shape}")

            # output.shape: (max_seq_len, num_filters)
            output = self.dropout(max_pool_out)
            # print(f"output.shape: {output.shape}")

            outputs.append(output)

        # outputs.shape: (batch_size, max_seq_len, num_filters)
        outputs = torch.stack(outputs)
        # print(f"outputs.shape: {outputs.shape}")

        return outputs

    def _init_embedding_weights(self):
        self.embedding.weight.data = self.embedding.weight.data.uniform_(
            -0.5, 0.5
        )


class CNNBiLSTMForTokenClassification(nn.Module):
    def __init__(
        self,
        char_vocab_size,
        char_max_len,
        char_embedding_dim,
        char_filter_size=3,
        char_num_filters=30,
        char_dropout_p=0.5,
        word_pretrained_embedding=None,
        word_freeze_embedding=False,
        word_vocab_size=None,
        word_embedding_dim=300,
        # filter_sizes=[3, 4, 5],
        # num_filters=[100, 100, 100],
        n_layers=1,
        hidden_dim=32,
        dropout_p=0.5,
        fc=[],
        out_size=2,
    ):
        super(CNNBiLSTMForTokenClassification, self).__init__()

        self.out_size = out_size

        self.char_embedding = CharacterLevelCNNEmbedding(
            char_vocab_size,
            char_max_len,
            char_embedding_dim,
            char_filter_size,
            char_num_filters,
            char_dropout_p,
        )

        if word_pretrained_embedding is not None:
            self.word_vocab_size, self.word_embedding_dim = word_pretrained_embedding.shape
            self.word_embedding = nn.Embedding.from_pretrained(
                word_pretrained_embedding,
                freeze=word_freeze_embedding,
            )
        else:
            self.vocab_size = word_vocab_size
            self.word_embedding_dim = word_embedding_dim
            self.word_embedding = nn.Embedding(
                num_embeddings=word_vocab_size,
                embedding_dim=word_embedding_dim,
                padding_idx=1,
            )

            self._init_embedding_weights()

        # self.filter_sizes = filter_sizes
        # self.num_filters = num_filters
        # self.convs = nn.ModuleList(
        #     nn.Conv1d(
        #         in_channels=self.embed_dim,
        #         out_channels=num_filters[i],
        #         kernel_size=filter_sizes[i],
        #         padding=(filter_sizes[i] - 1) // 2,
        #     ) for i in range(len(filter_sizes))
        # )

        # self.fc = sequential_fully_connected(
        #     np.sum(num_filters), out_size, fc, dropout_p,
        # )

        self.lstm = nn.LSTM(
            char_num_filters + self.word_embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=True,
            batch_first=True,
        )

        # self.fc = nn.Linear(np.sum(num_filters), out_size)
        self.classifier = sequential_fully_connected(
            2 * hidden_dim, out_size, fc, dropout_p,
        )
        # self.dropout = nn.Dropout(dropout_p)

        # self._compute_output_dim()

    @property
    def output_dim(self):
        return self._output_dim

    def freeze_transformer_layer(self):
        pass

    def unfreeze_transformer_layer(self):
        pass

    def forward(
        self,
        input_ids,
        attention_mask,
        char_input_ids,
        char_attention_mask,
        device,
        labels=None,
    ):
        # input_ids: (batch_size, max_seq_len, max_char_len)
        # print(f"input_ids.shape: {input_ids.shape}")

        char_embeddings = self.char_embedding(
            char_input_ids,
            char_attention_mask,
        )

        # input_ids.shape: (batch_size, max_seq_len)
        word_embeddings = self.word_embedding(input_ids)
        # embeddings = embeddings.permute(0, 2, 1)
        # print(f"embeddings.shape: {embeddings.shape}")

        embeddings = torch.cat(
            [char_embeddings, word_embeddings],
            dim=-1,
        )
        # print(f"embeddings.shape: {embeddings.shape}")

        lengths = attention_mask.sum(dim=1)
        # print(f"lengths.shape: {lengths.shape}")

        packed_embeddings = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # print(f"packed_embeddings.data.shape: {packed_embeddings.data.shape}")

        packed_output, (_, _) = self.lstm(packed_embeddings)
        # print(f"packed_output.data.shape: {packed_output.data.shape}")

        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=embeddings.shape[1],
        )
        # print(f"output.shape: {output.shape}")

        # output = self.dropout(output)
        logits = self.classifier(output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss().to(device)
            loss = loss_fn(logits.view(-1, self.out_size), labels.view(-1))

        return loss, logits

        # x_conv.shape: (batch_size, num_filters[i], L_out)
        # x_conv = [
        #     F.relu(conv1d(embeddings)) for conv1d in self.convs
        # ]
        # for i in range(len(x_conv)):
        #     print(f"x_conv[{i}].shape: {x_conv[i].shape}")
        # print(f"x_conv.shape: {x_conv[0].shape}")
        # print(f"x_conv: {x_conv[0]}")

        # x_max_pool.shape: (batch_size, num_filters[i], 1)
        # x_max_pool = [
        #     F.max_pool1d(
        #         conv,
        #         kernel_size=self.filter_sizes[i],
        #         stride=1,
        #         padding=self.filter_sizes[i] // 2,
        #     ) for i, conv in enumerate(x_conv)
        # ]
        # for i in range(len(x_max_pool)):
        #     print(f"x_max_pool[{i}].shape: {x_max_pool[i].shape}")
        # print(f"x_max_pool.shape: {x_max_pool[0].shape}")
        # print(f"x_max_pool: {x_max_pool[0]}")

        # x_fc = torch.cat(
        #     x_max_pool,
        #     dim=1,
        # )
        # x_fc = x_fc.permute(0, 2, 1)
        # print(f"x_fc.shape: {x_fc.shape}")

        # logits = self.fc(self.dropout(x_fc))
        # logits = self.fc(x_fc)

        # print(f"logits.shape: {logits.shape}")

        # return logits

    def get_predictions_from_logits(self, logits, labels=None, corresponding_word=None):
        # batch_size = logits.shape[0]

        # logits: (batch_size, max_seq_len, out_size)
        # labels: (batch_size, max_seq_len)
        # corresponding_word: (batch_size, max_seq_len)

        # print(f"logits.shape: {logits.shape}")
        # print(f"logits: {logits}")

        # preds: (batch_size, max_seq_len)
        preds = torch.argmax(logits, dim=-1)

        # print(f"preds.shape: {preds.shape}")
        # print(f"preds: {preds}")

        if labels is not None:
            # print(f"labels.shape: {labels.shape}")
            # print(f"labels: {labels}")

            # Keep only predictions where labels are not -100
            # clean_preds = preds[labels != -100].reshape(batch_size, -1)
            # clean_labels = labels[labels != -100].reshape(batch_size, -1)

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_labels.shape: {clean_labels.shape}")
            # print(f"clean_labels: {clean_labels}")

            # Get the index of the first machine text word
            # predicted_positions = clean_preds.argmax(dim=-1)
            # true_positions = clean_labels.argmax(dim=-1)

            predicted_positions = []
            true_positions = []
            for p, l in zip(preds, labels):
                mask = l != -100

                clean_pred = p[mask]
                clean_label = l[mask]

                print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_label.shape: {clean_label.shape}")
                # print(f"clean_label: {clean_label}")

                predicted_position = clean_pred.argmax(dim=-1)
                true_position = clean_label.argmax(dim=-1)

                # print(f"predicted_position: {predicted_position}")
                # print(f"true_position: {true_position}")

                predicted_positions.append(predicted_position.item())
                true_positions.append(true_position.item())

            # print(f"predicted_positions.shape: {predicted_positions.shape}")
            # print(f"predicted_positions: {predicted_positions}")

            # print(f"true_positions.shape: {true_positions.shape}")
            # print(f"true_positions: {true_positions}")

            # print(f"predicted_positions type: {type(predicted_positions)}")
            # print(f"true_positions type: {type(true_positions)}")

            return torch.Tensor(predicted_positions), torch.Tensor(true_positions)
        elif corresponding_word is not None:
            # print(f"corresponding_word.shape: {corresponding_word.shape}")
            # print(f"corresponding_word: {corresponding_word}")

            # Keep only predictions where corresponding_word are not -100
            # clean_preds = preds[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()
            # clean_corresponding_word = corresponding_word[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
            # print(f"clean_corresponding_word: {clean_corresponding_word}")

            predicted_positions = []
            for p, w in zip(preds, corresponding_word):
                mask = w != -100

                clean_pred = p[mask]
                clean_corresponding_word = w[mask]

                # print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
                # print(f"clean_corresponding_word: {clean_corresponding_word}")

                # Get the index of the first machine text word
                index = torch.where(clean_pred == 1)[0]
                value = index[0] if index.size else len(clean_pred) - 1
                position = clean_corresponding_word[value]

                # print(f"index: {index}")
                # print(f"value: {value}")
                # print(f"position: {position}")

                predicted_positions.append(position.item())
            #     # pred = pred.detach().cpu().numpy()

            #     index = np.where(pred == 1)[0]
            #     value = index[0] if index.size else len(pred) - 1
            #     position = clean_corresponding_word[idx][value]

            #     predicted_positions.append(position.item())

            # print(f"predicted_positions: {predicted_positions}")

            return predicted_positions, None
        else:
            raise ValueError("Either labels or corresponding_word must be provided")

    def _init_embedding_weights(self):
        self.word_embedding.weight.data = self.word_embedding.weight.data.uniform_(
            -0.5, 0.5
        )


class CNNBiLSTMWithCRFForTokenClassification(nn.Module):
    def __init__(
        self,
        char_vocab_size,
        char_max_len,
        char_embedding_dim,
        char_filter_size=3,
        char_num_filters=30,
        char_dropout_p=0.5,
        word_pretrained_embedding=None,
        word_freeze_embedding=False,
        word_vocab_size=None,
        word_embedding_dim=300,
        # filter_sizes=[3, 4, 5],
        # num_filters=[100, 100, 100],
        n_layers=1,
        hidden_dim=32,
        dropout_p=0.5,
        fc=[],
        out_size=2,
    ):
        super(CNNBiLSTMWithCRFForTokenClassification, self).__init__()

        self.out_size = out_size

        self.char_embedding = CharacterLevelCNNEmbedding(
            char_vocab_size,
            char_max_len,
            char_embedding_dim,
            char_filter_size,
            char_num_filters,
            char_dropout_p,
        )

        if word_pretrained_embedding is not None:
            self.word_vocab_size, self.word_embedding_dim = word_pretrained_embedding.shape
            self.word_embedding = nn.Embedding.from_pretrained(
                word_pretrained_embedding,
                freeze=word_freeze_embedding,
            )
        else:
            self.vocab_size = word_vocab_size
            self.word_embedding_dim = word_embedding_dim
            self.word_embedding = nn.Embedding(
                num_embeddings=word_vocab_size,
                embedding_dim=word_embedding_dim,
                padding_idx=1,
            )

            self._init_embedding_weights()

        # self.filter_sizes = filter_sizes
        # self.num_filters = num_filters
        # self.convs = nn.ModuleList(
        #     nn.Conv1d(
        #         in_channels=self.embed_dim,
        #         out_channels=num_filters[i],
        #         kernel_size=filter_sizes[i],
        #         padding=(filter_sizes[i] - 1) // 2,
        #     ) for i in range(len(filter_sizes))
        # )

        # self.fc = sequential_fully_connected(
        #     np.sum(num_filters), out_size, fc, dropout_p,
        # )

        self.lstm = nn.LSTM(
            char_num_filters + self.word_embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=True,
            batch_first=True,
        )

        # self.fc = nn.Linear(np.sum(num_filters), out_size)
        self.classifier = sequential_fully_connected(
            2 * hidden_dim, out_size, fc, dropout_p,
        )
        # self.dropout = nn.Dropout(dropout_p)

        self.crf = CRF(out_size, batch_first=True)

        # self._compute_output_dim()

    @property
    def output_dim(self):
        return self._output_dim

    def freeze_transformer_layer(self):
        pass

    def unfreeze_transformer_layer(self):
        pass

    def forward(
        self,
        input_ids,
        attention_mask,
        char_input_ids,
        char_attention_mask,
        device,
        labels=None,
    ):
        # input_ids: (batch_size, max_seq_len, max_char_len)
        # print(f"input_ids.shape: {input_ids.shape}")

        char_embeddings = self.char_embedding(
            char_input_ids,
            char_attention_mask,
        )

        # input_ids.shape: (batch_size, max_seq_len)
        word_embeddings = self.word_embedding(input_ids)
        # embeddings = embeddings.permute(0, 2, 1)
        # print(f"embeddings.shape: {embeddings.shape}")

        embeddings = torch.cat(
            [char_embeddings, word_embeddings],
            dim=-1,
        )
        # print(f"embeddings.shape: {embeddings.shape}")

        lengths = attention_mask.sum(dim=1)
        # print(f"lengths.shape: {lengths.shape}")

        packed_embeddings = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # print(f"packed_embeddings.data.shape: {packed_embeddings.data.shape}")

        packed_output, (_, _) = self.lstm(packed_embeddings)
        # print(f"packed_output.data.shape: {packed_output.data.shape}")

        output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=embeddings.shape[1],
        )
        # print(f"output.shape: {output.shape}")

        # output = self.dropout(output)
        logits = self.classifier(output)

        mask = attention_mask.bool()

        loss = None
        if labels is not None:
            # print("YAY1")
            log_likelihood = self.crf(logits, labels, mask=mask)
            logits = self.crf.decode(logits, mask=mask)

            loss = 0 - log_likelihood
        else:
            # print("YAY2")
            logits = self.crf.decode(logits, mask=mask)

        for i in range(len(logits)):
            if len(logits[i]) < len(attention_mask[i]):
                logits[i] = (
                    [-100]
                    + logits[i]
                    + [-100] * (len(attention_mask[i]) - len(logits[i]) - 1)
                )

        # print(len(logits))
        # print(logits)
        logits = torch.Tensor(logits).to(device)
        # print(logits.shape)

        return loss, logits

        # x_conv.shape: (batch_size, num_filters[i], L_out)
        # x_conv = [
        #     F.relu(conv1d(embeddings)) for conv1d in self.convs
        # ]
        # for i in range(len(x_conv)):
        #     print(f"x_conv[{i}].shape: {x_conv[i].shape}")
        # print(f"x_conv.shape: {x_conv[0].shape}")
        # print(f"x_conv: {x_conv[0]}")

        # x_max_pool.shape: (batch_size, num_filters[i], 1)
        # x_max_pool = [
        #     F.max_pool1d(
        #         conv,
        #         kernel_size=self.filter_sizes[i],
        #         stride=1,
        #         padding=self.filter_sizes[i] // 2,
        #     ) for i, conv in enumerate(x_conv)
        # ]
        # for i in range(len(x_max_pool)):
        #     print(f"x_max_pool[{i}].shape: {x_max_pool[i].shape}")
        # print(f"x_max_pool.shape: {x_max_pool[0].shape}")
        # print(f"x_max_pool: {x_max_pool[0]}")

        # x_fc = torch.cat(
        #     x_max_pool,
        #     dim=1,
        # )
        # x_fc = x_fc.permute(0, 2, 1)
        # print(f"x_fc.shape: {x_fc.shape}")

        # logits = self.fc(self.dropout(x_fc))
        # logits = self.fc(x_fc)

        # print(f"logits.shape: {logits.shape}")

        # return logits

    def get_predictions_from_logits(self, logits, labels=None, corresponding_word=None):
        # batch_size = logits.shape[0]

        # logits: (batch_size, max_seq_len, out_size)
        # labels: (batch_size, max_seq_len)
        # corresponding_word: (batch_size, max_seq_len)

        # print(f"logits.shape: {logits.shape}")
        # print(f"logits: {logits}")

        # preds: (batch_size, max_seq_len)
        # preds = torch.argmax(logits, dim=-1)
        preds = logits.clone()

        # print(f"preds.shape: {preds.shape}")
        # print(f"preds: {preds}")

        if labels is not None:
            # print(f"labels.shape: {labels.shape}")
            # print(f"labels: {labels}")

            # Keep only predictions where labels are not -100
            # clean_preds = preds[labels != -100].reshape(batch_size, -1)
            # clean_labels = labels[labels != -100].reshape(batch_size, -1)

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_labels.shape: {clean_labels.shape}")
            # print(f"clean_labels: {clean_labels}")

            # Get the index of the first machine text word
            # predicted_positions = clean_preds.argmax(dim=-1)
            # true_positions = clean_labels.argmax(dim=-1)

            predicted_positions = []
            true_positions = []
            for p, l in zip(preds, labels):
                mask = l != -100

                clean_pred = p[mask]
                clean_label = l[mask]

                # print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_label.shape: {clean_label.shape}")
                # print(f"clean_label: {clean_label}")

                predicted_position = clean_pred.argmax(dim=-1)
                true_position = clean_label.argmax(dim=-1)

                # print(f"predicted_position: {predicted_position}")
                # print(f"true_position: {true_position}")

                predicted_positions.append(predicted_position.item())
                true_positions.append(true_position.item())

            # print(f"predicted_positions.shape: {predicted_positions.shape}")
            # print(f"predicted_positions: {predicted_positions}")

            # print(f"true_positions.shape: {true_positions.shape}")
            # print(f"true_positions: {true_positions}")

            # print(f"predicted_positions type: {type(predicted_positions)}")
            # print(f"true_positions type: {type(true_positions)}")

            return torch.Tensor(predicted_positions), torch.Tensor(true_positions)
        elif corresponding_word is not None:
            # print(f"corresponding_word.shape: {corresponding_word.shape}")
            # print(f"corresponding_word: {corresponding_word}")

            # Keep only predictions where corresponding_word are not -100
            # clean_preds = preds[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()
            # clean_corresponding_word = corresponding_word[corresponding_word != -100].reshape(
            #     batch_size, -1
            # ).detach().cpu().numpy()

            # print(f"clean_preds.shape: {clean_preds.shape}")
            # print(f"clean_preds: {clean_preds}")

            # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
            # print(f"clean_corresponding_word: {clean_corresponding_word}")

            predicted_positions = []
            for p, w in zip(preds, corresponding_word):
                mask = w != -100

                clean_pred = p[mask]
                clean_corresponding_word = w[mask]

                # print(f"clean_pred.shape: {clean_pred.shape}")
                # print(f"clean_pred: {clean_pred}")
                # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
                # print(f"clean_corresponding_word: {clean_corresponding_word}")

                # Get the index of the first machine text word
                index = torch.where(clean_pred == 1)[0]
                value = index[0] if index.size else len(clean_pred) - 1
                position = clean_corresponding_word[value]

                # print(f"index: {index}")
                # print(f"value: {value}")
                # print(f"position: {position}")

                predicted_positions.append(position.item())
            #     # pred = pred.detach().cpu().numpy()

            #     index = np.where(pred == 1)[0]
            #     value = index[0] if index.size else len(pred) - 1
            #     position = clean_corresponding_word[idx][value]

            #     predicted_positions.append(position.item())

            # print(f"predicted_positions: {predicted_positions}")

            return torch.Tensor(predicted_positions), torch.Tensor([-1] * len(predicted_positions))
        else:
            raise ValueError("Either labels or corresponding_word must be provided")

    def _init_embedding_weights(self):
        self.word_embedding.weight.data = self.word_embedding.weight.data.uniform_(
            -0.5, 0.5
        )

In [57]:
# char_cnn_embedding = CharacterLevelCNNEmbedding(
#     vocab_size=char_vocabulary.vocab_size(),
#     max_len=char_max_len,
#     embedding_dim=10,
#     filter_size=3,
#     num_filters=30,
#     dropout_p=0.5,
# ).to(DEVICE)
model = CNNBiLSTMWithCRFForTokenClassification(
    char_vocab_size=char_vocabulary.vocab_size(),
    char_max_len=char_max_len,
    char_embedding_dim=10,
    char_filter_size=3,
    char_num_filters=30,
    char_dropout_p=0.5,
    word_pretrained_embedding=None,
    word_freeze_embedding=False,
    word_vocab_size=word_vocabulary.vocab_size(),
    word_embedding_dim=50,
    n_layers=1,
    hidden_dim=32,
    dropout_p=0.5,
    fc=[16],
    out_size=2,
).to(DEVICE)

for i, batch in enumerate(train_dataloader):
    print(f"Batch=[{i + 1}/{len(train_dataloader)}]")
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    char_input_ids = batch["char_input_ids"].to(DEVICE)
    char_attention_mask = batch["char_attention_mask"].to(DEVICE)
    labels = batch["target"].to(DEVICE)

    # print(f"char_input_ids.shape: {char_input_ids.shape}")
    loss, output = model(
        input_ids,
        attention_mask,
        char_input_ids,
        char_attention_mask,
        DEVICE,
        labels=labels,
    )

    print(f"loss: {loss}")

    break

Batch=[1/305]
loss: 2199.6484375


# Train model

In [11]:
import pandas as pd
from tqdm import tqdm
# from time import time
from torch.autograd import Variable
from collections import defaultdict


def train_epoch(
    model,
    dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    metric_fn,
    print_freq=10,
):
    model.train()

    losses = []

    all_predictions = []
    all_true = []
    all_ids = []

    for i, batch in enumerate(dataloader):
        ids = batch["id"]
        input_ids = batch["input_ids"].to(device)
        char_input_ids = batch["char_input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        char_attention_mask = batch["char_attention_mask"].to(device)
        targets = batch["target"].to(device)
        corresponding_word = batch["corresponding_word"].to(device)

        loss, logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            char_input_ids=char_input_ids,
            char_attention_mask=char_attention_mask,
            device=device,
            labels=targets,
        )

        predictions, true_predictions = model.get_predictions_from_logits(
            logits=logits,
            labels=targets,
            corresponding_word=corresponding_word
        )

        # loss = loss_fn(predictions, true_predictions)
        # loss = Variable(loss, requires_grad=True)

        # print(f"predictions: {predictions}")
        # print(f"true_predictions: {true_predictions}")

        losses.append(loss.item())

        all_predictions.extend(predictions.tolist())
        all_true.extend(true_predictions.tolist())
        all_ids.extend(ids)

        if i % print_freq == 0:
            print(
                f"Batch [{i + 1}/{len(dataloader)}]; "
                f"Loss: {loss.item():.5f}; "
                f"Mean absolute error: {metric_fn(true_predictions, predictions):.5f}"
            )

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses), (all_ids, all_true, all_predictions)


def validation_epoch(
    model,
    dataloader,
    loss_fn,
    device,
    metric_fn,
):
    model.eval()

    losses = []
    all_predictions = []
    all_true = []
    all_ids = []

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            ids = batch["id"]
            input_ids = batch["input_ids"].to(device)
            char_input_ids = batch["char_input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            char_attention_mask = batch["char_attention_mask"].to(device)
            targets = batch["target"].to(device)
            corresponding_word = batch["corresponding_word"].to(device)

            loss, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                char_input_ids=char_input_ids,
                char_attention_mask=char_attention_mask,
                device=device,
                labels=targets,
            )

            predictions, true_predictions = model.get_predictions_from_logits(
                logits=logits,
                labels=targets,
                corresponding_word=corresponding_word
            )

            # loss = loss_fn(predictions, true_predictions)
            # loss = Variable(loss, requires_grad=True)

            losses.append(loss.item())

            all_predictions.extend(predictions.tolist())
            all_true.extend(true_predictions.tolist())
            all_ids.extend(ids)

    return np.mean(losses), (all_ids, all_true, all_predictions)


def training_loop(
    model,
    num_epochs,
    train_dataloader,
    dev_dataloader,
    loss_fn,
    optimizer_config,
    scheduler_config,
    device,
    metric_fn,
    is_better_metric_fn,
    num_epochs_before_finetune,
    results_dir,
):
    history = defaultdict(list)
    best_metric = None
    best_model_state = None

    optimizer = get_optimizer(model, optimizer_config, finetune=False)
    scheduler = None

    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}/{num_epochs}")
        if epoch <= num_epochs_before_finetune:
            print("Freeze transformer")
        else:
            print("Finetune transformer")
        print("-" * 10)

        if epoch == num_epochs_before_finetune + 1:
            model.unfreeze_transformer_layer()
            optimizer = get_optimizer(model, optimizer_config, finetune=True)
            scheduler = get_scheduler(
                optimizer,
                num_training_steps=len(train_dataloader) * num_epochs,
                **scheduler_config,
            )

        train_loss, (train_ids, train_true, train_predict) = train_epoch(
            model,
            train_dataloader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            metric_fn,
        )

        train_metric = metric_fn(train_true, train_predict)

        print(f"Train Loss: {train_loss:.5f}; Train Metric: {train_metric:.5f}")

        dev_loss, (dev_ids, dev_true, dev_predict) = validation_epoch(
            model,
            dev_dataloader,
            loss_fn,
            device,
            metric_fn,
        )

        dev_metric = metric_fn(dev_true, dev_predict)

        print(
            f"Validation Loss: {dev_loss:.5f}; "
            f"Validation Metric: {dev_metric:.5f}"
        )

        history["train_metric"].append(train_metric)
        history["train_loss"].append(train_loss)
        history["dev_metric"].append(dev_metric)
        history["dev_loss"].append(dev_loss)

        if best_metric is None or is_better_metric_fn(train_metric, best_metric):
            best_metric = train_metric
            best_model_state = model.state_dict()
            
            if results_dir is not None:
                torch.save(
                    best_model_state,
                    os.path.join(results_dir, "best_model.bin"),
                )

                df_train_predictions = pd.DataFrame(
                    {
                        "id": train_ids,
                        "true": train_true,
                        "predict": train_predict,
                    }
                )
                df_train_predictions.to_csv(
                    os.path.join(results_dir, "best_model_train_predict.csv"),
                    index=False
                )

                df_dev_predictions = pd.DataFrame(
                    {
                        "id": dev_ids,
                        "true": dev_true,
                        "predict": dev_predict,
                    }
                )
                df_dev_predictions.to_csv(
                    os.path.join(results_dir, "best_model_dev_predict.csv"),
                    index=False
                )

    df_history = pd.DataFrame(history)
    if results_dir is not None:
        df_history.to_csv(os.path.join(results_dir, "history.csv"), index=False)

        model.load_state_dict(torch.load(os.path.join(results_dir, "best_model.bin")))
    else:
        model.load_state_dict(best_model_state)

    return model, df_history

In [12]:
import torch

torch.cuda.empty_cache()

In [13]:
config["model_config"]["word_embedding_dim"] = embeddings.shape[1]

In [14]:
char_vocabulary.save_vocabulary(results_dir)
word_vocabulary.save_vocabulary(results_dir)

In [15]:
with open(results_dir + "/config.json", "w") as f:
    json.dump(config, f, indent=4)

In [16]:
config["model_config"]["word_pretrained_embedding"] = embeddings
config["model_config"]["word_freeze_embedding"] = False

In [17]:
num_epochs = config["training"]["num_epochs"]
# model = CNNBiLSTMForTokenClassification(
#     **config["model_config"]
# ).to(DEVICE)
model = CNNBiLSTMWithCRFForTokenClassification(
    **config["model_config"]
).to(DEVICE)
loss_fn = get_loss_fn(config["training"]["loss"], DEVICE)
optimizer_config = config["training"]["optimizer"]
scheduler_config = config["training"]["scheduler"]
metric_fn, is_better_metric_fn = get_metric(config["training"]["metric"])
num_epochs_before_finetune = config["training"]["num_epochs_before_finetune"]

best_model, df_history = training_loop(
    model,
    num_epochs,
    train_dataloader,
    dev_dataloader,
    loss_fn,
    optimizer_config,
    scheduler_config,
    DEVICE,
    metric_fn,
    is_better_metric_fn,
    num_epochs_before_finetune,
    results_dir,
)

Epoch 1/4
Freeze transformer
----------
Batch [1/229]; Loss: 2970.34375; Mean absolute error: 47.75000
Batch [11/229]; Loss: 2958.27417; Mean absolute error: 96.62500
Batch [21/229]; Loss: 1233.69031; Mean absolute error: 37.68750
Batch [31/229]; Loss: 976.74646; Mean absolute error: 29.56250
Batch [41/229]; Loss: 1364.63062; Mean absolute error: 39.37500
Batch [51/229]; Loss: 972.86224; Mean absolute error: 24.50000
Batch [61/229]; Loss: 867.24030; Mean absolute error: 26.25000
Batch [71/229]; Loss: 392.81506; Mean absolute error: 11.12500
Batch [81/229]; Loss: 499.95718; Mean absolute error: 17.06250
Batch [91/229]; Loss: 265.18158; Mean absolute error: 9.31250
Batch [101/229]; Loss: 352.47333; Mean absolute error: 9.81250
Batch [111/229]; Loss: 221.82578; Mean absolute error: 10.06250
Batch [121/229]; Loss: 336.50708; Mean absolute error: 8.93750
Batch [131/229]; Loss: 225.04614; Mean absolute error: 8.93750
Batch [141/229]; Loss: 391.61224; Mean absolute error: 23.81250
Batch [151/

# Make predictions

In [None]:
import pandas as pd


def make_predictions(
    model,
    dataloader,
    device,
    results_dir,
    label_column,
    file_format="csv",
):
    model.eval()

    all_predictions = []
    all_true = []
    all_ids = []

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            # print(batch)
            ids = batch["id"]
            input_ids = batch["input_ids"].to(device)
            char_input_ids = batch["char_input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            char_attention_mask = batch["char_attention_mask"].to(device)
            targets = batch["target"].to(device)
            corresponding_word = batch["corresponding_word"].to(device)

            _, logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                char_input_ids=char_input_ids,
                char_attention_mask=char_attention_mask,
                device=device,
                labels=None,
            )

            # print(f"logits.shape: {logits.shape}")
            # print(f"targets.shape: {targets.shape}")

            predictions, true_predictions = get_predictions_from_logits(
                logits=logits,
                labels=None,
                corresponding_word=corresponding_word
            )

            all_predictions.extend(predictions.tolist())
            all_true.extend(true_predictions.tolist())
            all_ids.extend(ids)

    df_predictions = pd.DataFrame(
        {
            "id": all_ids,
            "true": all_true,
            label_column: all_predictions,
        }
    )

    if results_dir is not None:
        if file_format == "csv":
            df_predictions.to_csv(
                os.path.join(results_dir, "submission.csv"),
                index=False,
            )
        elif file_format == "jsonl":
            df_predictions.to_json(
                os.path.join(results_dir, "submission.jsonl"),
                orient="records",
                lines=True,
            )
        else:
            raise ValueError(f"Unknown file format: {file_format}")
    else:
        print("Missing results_dir, not saving predictions to file!")

    return df_predictions

In [19]:
# results_dir = "../runs/12-01-2024_19:43:36-SubtaskC-cnn_bilstm_for_token_classification"

In [20]:
# char_vocabulary, word_vocabulary = None, None
# char_max_len, word_max_len = None, config["data"]["max_len"]
# if dataset_type == DatasetType.TokenClassificationDataset:
#     if dataset_type_settings is not None:
#         if "chars" in dataset_type_settings:
#             char_vocabulary = CharacterVocabulary()
#             char_vocabulary.load_vocabulary(results_dir)

#             char_max_len = dataset_type_settings["chars"]["max_len"]

#         if "words" in dataset_type_settings:
#             word_vocabulary = WordVocabulary()
#             word_vocabulary.load_vocabulary(results_dir)

#             word_max_len = dataset_type_settings["words"]["max_len"]
#     else:
#         word_vocabulary = WordVocabulary()
#         word_vocabulary.load_vocabulary(results_dir)

#         word_max_len = config["data"]["max_len"]

In [21]:
# with open(f"{results_dir}/config.json", "r") as config_file:
#     config = json.load(config_file)

In [22]:
# best_model = CNNBiLSTMForTokenClassification(**config["model_config"]).to(DEVICE)
# best_model.load_state_dict(
#     torch.load(os.path.join(results_dir, "best_model.bin"))
# )
# best_model.to(DEVICE)

In [23]:
# from tqdm import tqdm

# test_dataloader = build_data_loader(
#     df_test,
#     tokenizer,
#     max_len=word_max_len,
#     batch_size=config["data"]["batch_size"],
#     label_column=config["data"]["label_column"],
#     has_targets=False,
#     dataset_type=dataset_type,
#     dataset_type_settings=dataset_type_settings,
#     char_vocabulary=char_vocabulary,
#     char_max_len=char_max_len,
#     word_vocabulary=word_vocabulary,
#     device=DEVICE,
# )

# make_predictions(
#     best_model,
#     test_dataloader,
#     DEVICE,
#     results_dir,
#     label_column=config["data"]["label_column"],
#     file_format="csv",
# )

In [None]:
def get_predictions_from_logits(logits, labels=None, corresponding_word=None):
    # batch_size = logits.shape[0]

    # logits: (batch_size, max_seq_len, out_size)
    # labels: (batch_size, max_seq_len)
    # corresponding_word: (batch_size, max_seq_len)

    # print(f"logits.shape: {logits.shape}")
    # print(f"logits: {logits}")

    # preds: (batch_size, max_seq_len)
    # preds = torch.argmax(logits, dim=-1)
    preds = logits.clone()

    # print(f"preds.shape: {preds.shape}")
    # print(f"preds: {preds}")

    if labels is not None:
        # print(f"labels.shape: {labels.shape}")
        # print(f"labels: {labels}")

        # Keep only predictions where labels are not -100
        # clean_preds = preds[labels != -100].reshape(batch_size, -1)
        # clean_labels = labels[labels != -100].reshape(batch_size, -1)

        # print(f"clean_preds.shape: {clean_preds.shape}")
        # print(f"clean_preds: {clean_preds}")

        # print(f"clean_labels.shape: {clean_labels.shape}")
        # print(f"clean_labels: {clean_labels}")

        # Get the index of the first machine text word
        # predicted_positions = clean_preds.argmax(dim=-1)
        # true_positions = clean_labels.argmax(dim=-1)

        predicted_positions = []
        true_positions = []
        for p, l in zip(preds, labels):
            mask = l != -100

            # print(f"mask: {mask}")

            clean_pred = p[mask]
            clean_label = l[mask]

            # print(f"clean_pred.shape: {clean_pred.shape}")
            # print(f"clean_pred: {clean_pred}")
            # print(f"clean_label.shape: {clean_label.shape}")
            # print(f"clean_label: {clean_label}")

            predicted_position = clean_pred.argmax(dim=-1)
            true_position = clean_label.argmax(dim=-1)

            # print(f"predicted_position: {predicted_position}")
            # print(f"true_position: {true_position}")

            predicted_positions.append(predicted_position.item())
            true_positions.append(true_position.item())

        # print(f"predicted_positions.shape: {predicted_positions.shape}")
        # print(f"predicted_positions: {predicted_positions}")

        # print(f"true_positions.shape: {true_positions.shape}")
        # print(f"true_positions: {true_positions}")

        # print(f"predicted_positions type: {type(predicted_positions)}")
        # print(f"true_positions type: {type(true_positions)}")

        return torch.Tensor(predicted_positions), torch.Tensor(true_positions)
    elif corresponding_word is not None:
        # print(f"corresponding_word.shape: {corresponding_word.shape}")
        # print(f"corresponding_word: {corresponding_word}")

        # Keep only predictions where corresponding_word are not -100
        # clean_preds = preds[corresponding_word != -100].reshape(
        #     batch_size, -1
        # ).detach().cpu().numpy()
        # clean_corresponding_word = corresponding_word[corresponding_word != -100].reshape(
        #     batch_size, -1
        # ).detach().cpu().numpy()

        # print(f"clean_preds.shape: {clean_preds.shape}")
        # print(f"clean_preds: {clean_preds}")

        # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
        # print(f"clean_corresponding_word: {clean_corresponding_word}")

        predicted_positions = []
        for p, w in zip(preds, corresponding_word):
            mask = w != -100

            # print(f"mask.shape: {mask.shape}")
            # print(f"mask: {mask}")

            clean_pred = p[mask]
            clean_corresponding_word = w[mask]

            # print(f"clean_pred.shape: {clean_pred.shape}")
            # print(f"clean_pred: {clean_pred}")
            # print(f"clean_corresponding_word.shape: {clean_corresponding_word.shape}")
            # print(f"clean_corresponding_word: {clean_corresponding_word}")

            # Get the index of the first machine text word
            index = torch.where(clean_pred == 1)[0]
            # print(f"index: {index}")
            # print(f"index.size: {index.size}")
            value = index[0] if len(index) > 0 else len(clean_pred) - 1
            position = clean_corresponding_word[value]

            # print(f"index: {index}")
            # print(f"value: {value}")
            # print(f"position: {position}")

            predicted_positions.append(position.item())
        #     # pred = pred.detach().cpu().numpy()

        #     index = np.where(pred == 1)[0]
        #     value = index[0] if index.size else len(pred) - 1
        #     position = clean_corresponding_word[idx][value]

        #     predicted_positions.append(position.item())

        # print(f"predicted_positions: {predicted_positions}")

        return torch.Tensor(predicted_positions), torch.Tensor([-1] * len(predicted_positions))
    else:
        raise ValueError("Either labels or corresponding_word must be provided")

In [None]:
predictions = make_predictions(
    best_model,
    test_dataloader,
    DEVICE,
    results_dir,
    config["data"]["label_column"],
    file_format="csv",
)

100%|██████████| 1113/1113 [1:01:47<00:00,  3.33s/it]


In [None]:
results_dir

'../runs/17-01-2024_08:49:45-SubtaskC-cnn_bilstm_with_crf_for_token_classification'

In [91]:
!python ../scores_and_plots.py --results-dir "../runs/15-01-2024_19:43:53-SubtaskC-cnn_bilstm_with_crf_for_token_classification"

Results on validation
MAE: 8.34653
--------------------
Results on test
MAE: 157.67752
--------------------
