In [2]:
import os
import sys
import json
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from enum import Enum
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizer, BertModel, BatchEncoding


In [51]:
sys.path.insert(0,  "../")

import importlib
from lib.utils import get_current_date
from lib.utils.constants import Subtask, Track, PreprocessTextLevel, PoolingStrategy, DatasetType
from lib.utils.models import sequential_fully_connected
from lib.data.loading import load_train_dev_test_df, build_data_loader
from lib.data.tokenizer import get_tokenizer
from lib.models import get_model
from lib.training.loss import get_loss_fn
from lib.training.metric import get_metric
from lib.training.loops import training_loop, make_predictions
%load_ext autoreload
%autoreload 2

In [4]:
# Location of the shared JSON configuration, relative to the notebook's working directory.
CONFIG_FILE = os.path.relpath("../config.json")

# Start from an empty mapping, then replace it with the parsed file contents.
CONFIG = {}
with open(CONFIG_FILE) as f:
    CONFIG = json.load(f)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")

Using device: cpu


In [6]:
# Show the configuration dictionary loaded from CONFIG_FILE.
CONFIG

{'task': 'SubtaskA',
 'track': 'monolingual',
 'submission_format': 'csv',
 'model': 'bert',
 'tokenizer': {'model_name': 'bert', 'pretrained_name': 'bert-base-uncased'},
 'data': {'dataset_type': 'transformer_truncation_dataset',
  'dataset_type_settings': {'truncation_strategy': 'head_only'},
  'data_dir': './data/original_data',
  'label_column': 'label',
  'max_len': 128,
  'batch_size': 8,
  'test_size': 0.2,
  'preprocess_text_level': 0},
 'model_config': {'pretrained_model_name': 'bert-base-uncased',
  'out_size': 1,
  'dropout_p': 0.5,
  'fc': [128],
  'out_activation': 'sigmoid'},
 'training': {'num_epochs': 3,
  'num_epochs_before_finetune': 2,
  'optimizer': {'AdamW': {'freeze_lr': 0.001, 'finetune_lr': 2e-05}},
  'scheduler': {'num_warmup_steps': 0},
  'loss': 'bce',
  'metric': 'accuracy'}}

In [7]:
# CONFIG["data"]["data_dir"] appears to be written relative to the directory that
# holds config.json, so resolve it against that directory instead of repeating a
# hard-coded copy of the path here. For the current config this yields
# "../data/original_data", the value that was previously hard-coded.
data_dir = os.path.normpath(
    os.path.join(os.path.dirname(CONFIG_FILE), CONFIG["data"]["data_dir"])
)

# Load the train / dev / test frames for the configured subtask and track.
df_train, df_dev, df_test = load_train_dev_test_df(
    task=Subtask(CONFIG["task"]),
    track=Track(CONFIG["track"]),
    data_dir=data_dir,
    label_column=CONFIG["data"]["label_column"],
    test_size=CONFIG["data"]["test_size"],
    preprocess_text_level=PreprocessTextLevel(
        CONFIG["data"]["preprocess_text_level"]
    ),
)

print(f"df_train.shape: {df_train.shape}")
print(f"df_dev.shape: {df_dev.shape}")
print(f"df_test.shape: {df_test.shape}")

Loading train data...
Train/dev split... (df_train.shape: (119757, 5))
Loading test data...
df_train.shape: (95805, 5)
df_dev.shape: (23952, 5)
df_test.shape: (5000, 5)


In [8]:
# Maximum sequence length taken from the "data" section of the config.
max_seq_len = CONFIG["data"]["max_len"]
# Build the tokenizer from the "tokenizer" section of the config
# (its keys are passed straight through as keyword arguments).
tokenizer = get_tokenizer(**CONFIG["tokenizer"])

tokenizer_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 57.9kB/s]
vocab.txt: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 1.51MB/s]
tokenizer.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 2.94MB/s]
config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 1.20MB/s]


In [9]:
# Show the configured maximum sequence length.
max_seq_len

128

In [10]:
# Clean CUDA memory
torch.cuda.empty_cache()

In [29]:
# Small loader over the first ten training rows, used by the smoke-test cells below.
train_dataloader = build_data_loader(
    df_train[:10],
    tokenizer,
    shuffle=True,
    # The remaining settings are copied verbatim from the "data" section of the config.
    **{
        setting: CONFIG["data"][setting]
        for setting in ("max_len", "batch_size", "label_column")
    },
)

In [30]:
# Standalone BERT encoder, used in the following cells to inspect parameter names
# and the module structure. BertModel is already imported in the first cell, so it
# is not re-imported here.
#   return_dict=False         -> the forward pass returns a plain tuple
#   output_hidden_states=True -> that tuple also carries the per-layer hidden states
bert = BertModel.from_pretrained(
    CONFIG["model_config"]["pretrained_model_name"],
    return_dict=False,
    output_hidden_states=True,
)
# Number of transformer blocks in the encoder.
bert_num_layers = len(bert.encoder.layer)

In [13]:
# List the name of every parameter of the standalone BERT encoder.
for param_name, _param in bert.named_parameters():
    print(param_name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [14]:
# Show the module structure of the standalone BERT encoder.
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
class BERTWithLayerSelection(nn.Module):
    def __init__(
        self,
        pretrained_model_name: str,
        out_size: int = 1,
        dropout_p: float = 0.5,
        selected_layers: [int] = [-1],
        fc: [int] = [],
        out_activation: str | None = None,
    ):
        super().__init__()
        self.selected_layers = selected_layers

        self.bert = BertModel.from_pretrained(
            pretrained_model_name, return_dict=False, output_hidden_states=True
        )
        # self.drop_bert = nn.Dropout(dropout_p)

        input_size = len(selected_layers) * self.bert.config.hidden_size
        self.out = sequential_fully_connected(input_size, out_size, fc, dropout_p)

        self.out_activation = None
        if out_activation == "sigmoid":
            self.out_activation = nn.Sigmoid()

        self.freeze_transformer_layer()

    def forward(self, input_ids, attention_mask):
        bert_outputs = self.bert(input_ids, attention_mask)
        hidden_states = bert_outputs[2]
        bert_cls_features = torch.cat(
            [hidden_states[i][:, 0, :] for i in self.selected_layers],
            dim=1,
        )
        # output = self.drop_bert(pooled_output)
        output = self.out(bert_cls_features)

        if self.out_activation is not None:
            output = self.out_activation(output)

        return output

    def freeze_transformer_layer(self):
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_transformer_layer(self):
        # BERT used only for feature extraction
        pass

    def get_predictions_from_outputs(self, outputs):
        if self.out_activation is None:
            return outputs.flatten().tolist()
        else:
            return torch.round(outputs).flatten().tolist()


In [31]:
# Hidden-state indices to concatenate, counted from the end of the hidden-states tuple.
bert_layers_to_use = [-1, -2, -3, -4]
# Record the selection in the shared model config, then build the model from it.
CONFIG["model_config"].update(selected_layers=bert_layers_to_use)
bert_with_layer_selection = BERTWithLayerSelection(**CONFIG["model_config"])
bert_with_layer_selection = bert_with_layer_selection.to(DEVICE)

In [32]:
# Bert with layer selection: smoke test that pushes the sample batches through
# the model and prints its predictions next to the true labels.

# Inference only, so no autograd graph is needed. The model is deliberately left
# in whatever train/eval mode it is already in, so its outputs are unchanged.
with torch.no_grad():
    for batch in train_dataloader:
        # Inputs must be on the same device as the model; without this the
        # forward pass fails as soon as DEVICE is "cuda".
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        targets = batch["target"].to(DEVICE)

        outputs = bert_with_layer_selection(
            input_ids=input_ids, attention_mask=attention_mask
        )

        predictions = bert_with_layer_selection.get_predictions_from_outputs(outputs)
        true = targets.flatten().tolist()

        print(f"predictions = {predictions}")
        print(f"true = {true}")


predictions = [0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
true = [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0]
predictions = [1.0, 0.0]
true = [0.0, 0.0]


In [64]:
# Candidate layer selections to compare. Each key becomes the suffix of the run
# directory name, and each value is stored as
# CONFIG["model_config"]["selected_layers"] for that run.
# Commented-out entries are selections that are currently disabled.
selected_layers_options = {
    "1-1_layers": [1],
    "1-2_layers": [1, 2],
    "1-3_layers": [1, 2, 3],
    # "1-4_layers": [1, 2, 3, 4],
    # "1-5_layers": [1, 2, 3, 4, 5],
    # "1-6_layers": [1, 2, 3, 4, 5, 6],
    # "1-7_layers": [1, 2, 3, 4, 5, 6, 7],
    # "1-8_layers": [1, 2, 3, 4, 5, 6, 7, 8],
    # "1-9_layers": [1, 2, 3, 4, 5, 6, 7, 8, 9],
    # "1-10_layers": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    # "1-11_layers": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    # "1-12_layers": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    # "first_4_layers": [1, 2, 3, 4],
    # "last_4_layers": [-1, -2, -3, -4],
}

# Number of rows taken from the head of each split for this experiment.
rows_per_split = 100

# Loader settings shared by the train, dev and test loaders.
loader_kwargs = {
    "max_len": CONFIG["data"]["max_len"],
    "batch_size": CONFIG["data"]["batch_size"],
    "label_column": CONFIG["data"]["label_column"],
}

# Train, evaluate and predict once per candidate layer selection; every run gets
# its own results directory containing the exact config that was used.
for selection_name, selected_layers in selected_layers_options.items():
    print(f"##### Layer selection: {selection_name} #####\n")

    CONFIG["model_config"]["selected_layers"] = selected_layers

    # Run directory name: <date>-<task>[-<track>]-<model>_<selection>.
    run_name_parts = [get_current_date(), CONFIG["task"]]
    if CONFIG["track"] is not None:
        run_name_parts.append(CONFIG["track"])
    run_name_parts.append(f"{CONFIG['model']}_{selection_name}")
    results_path = "../runs/" + "-".join(str(part) for part in run_name_parts)

    print(f"Will save results to: {results_path}\n")
    # makedirs also creates ../runs when it does not exist yet. An already
    # existing run directory still raises, so earlier results are not overwritten.
    os.makedirs(results_path)

    with open(os.path.join(results_path, "config.json"), "w") as f:
        json.dump(CONFIG, f, indent=4)

    train_dataloader = build_data_loader(
        df_train[:rows_per_split],
        tokenizer,
        shuffle=True,
        **loader_kwargs,
    )
    dev_dataloader = build_data_loader(
        df_dev[:rows_per_split],
        tokenizer,
        **loader_kwargs,
    )
    test_dataloader = build_data_loader(
        df_test[:rows_per_split],
        tokenizer,
        # Targets are requested only when a test_size is configured.
        has_targets=CONFIG["data"]["test_size"] is not None,
        **loader_kwargs,
    )

    num_epochs = CONFIG["training"]["num_epochs"]
    # NOTE(review): the last recorded run of this cell stopped after training had
    # started with "NotImplementedError: No such selected layers merge strategy:
    # None". Presumably the model built by get_model expects a merge-strategy
    # setting in CONFIG["model_config"] -- confirm the expected key in lib.models.
    model = get_model(CONFIG["model"], CONFIG["model_config"]).to(DEVICE)
    loss_fn = get_loss_fn(CONFIG["training"]["loss"], DEVICE)
    optimizer_config = CONFIG["training"]["optimizer"]
    scheduler_config = CONFIG["training"]["scheduler"]
    metric_fn, is_better_metric_fn = get_metric(CONFIG["training"]["metric"])
    num_epochs_before_finetune = CONFIG["training"]["num_epochs_before_finetune"]

    best_model = training_loop(
        model,
        num_epochs,
        train_dataloader,
        dev_dataloader,
        loss_fn,
        optimizer_config,
        scheduler_config,
        DEVICE,
        metric_fn,
        is_better_metric_fn,
        results_path,
        num_epochs_before_finetune,
    )

    make_predictions(
        best_model,
        test_dataloader,
        DEVICE,
        results_path,
        label_column=CONFIG["data"]["label_column"],
        file_format=CONFIG["submission_format"],
    )

    print("-" * 50)
    print()

##### Layer selection: 1-1_layers #####

Will save results to: ../runs/15-12-2023_15:57:01-SubtaskA-monolingual-bert_1-1_layers

da
Starting training loop...
2
1
Epoch 1/3
Freeze transformeer
--------------------
dap


NotImplementedError: No such selected layers merge strategy: None

In [65]:
# Show CONFIG again; the layer-selection loop writes
# CONFIG["model_config"]["selected_layers"] in place.
CONFIG

{'task': 'SubtaskA',
 'track': 'monolingual',
 'submission_format': 'csv',
 'model': 'bert',
 'tokenizer': {'model_name': 'bert', 'pretrained_name': 'bert-base-uncased'},
 'data': {'dataset_type': 'transformer_truncation_dataset',
  'dataset_type_settings': {'truncation_strategy': 'head_only'},
  'data_dir': './data/original_data',
  'label_column': 'label',
  'max_len': 128,
  'batch_size': 8,
  'test_size': 0.2,
  'preprocess_text_level': 0},
 'model_config': {'pretrained_model_name': 'bert-base-uncased',
  'out_size': 1,
  'dropout_p': 0.5,
  'fc': [128],
  'out_activation': 'sigmoid',
  'selected_layers': [1]},
 'training': {'num_epochs': 3,
  'num_epochs_before_finetune': 2,
  'optimizer': {'AdamW': {'freeze_lr': 0.001, 'finetune_lr': 2e-05}},
  'scheduler': {'num_warmup_steps': 0},
  'loss': 'bce',
  'metric': 'accuracy'}}

In [55]:
# Show the value assigned from CONFIG["training"] inside the layer-selection loop.
num_epochs_before_finetune

2