## Instalacja i dołączenie zależności

In [13]:
!pip install -qqq torch pandas numpy pytorch_lightning sentence-transformers

In [14]:
# Standard libraries
import os
import csv

# Standard data processing libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from pytorch_lightning import LightningDataModule, LightningModule
from torch.utils.data import random_split, DataLoader

# ML libraries
import torch
from torch.utils.data import Dataset
from pytorch_lightning import Trainer

from lib.utils import types_to_int, TYPES_MAP

## Parametry

In [15]:
PROJECT_DIR = "./" # @param {type: "string"}
DATASET = "headlines" # @param {type: "string"}
DATA_DIR = os.path.join(PROJECT_DIR, "data/sem_eval_2016/", DATASET)

SBERT_EMBEDDING_WIDTH = 768

TRAIN_VAL_SPLIT = 0.8 # @param {type: "slider", min:0, max: 1}

TRAIN_BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
NUM_WORKERS = 0 # @param {type: "slider", min:1, max:16}
PERSISTENT_WORKERS = False # TODO param

EPOCHS = 10 # @param {type: "slider", min:1, max:128}
ACCELERATOR = "cpu" # @param ["auto", "gpu", "tpu", "cpu"]

## Moduł ładowania danych

In [16]:
class SBERTDataset(Dataset):
    def __init__(self, file_path: str):
        self._data = pd.read_csv(
            file_path, sep="\t", keep_default_na=False, quoting=csv.QUOTE_NONE
        )

        self._sbert = SentenceTransformer("all-mpnet-base-v2")

        self._types = self._get_encoded_types()
        self._scores = torch.tensor(self._data["y_score"]).float()

    def _get_encoded_types(self):
        types_as_int = types_to_int(self._data["y_type"].tolist())
        encoded_types = torch.nn.functional.one_hot(
            torch.tensor(types_as_int), num_classes=len(TYPES_MAP)
        ).float()
        return encoded_types

    def __getitem__(self, index):
        x1 = self._data["x1"]
        x2 = self._data["x2"]

        a = self._sbert.encode(x1[index])
        b = self._sbert.encode(x2[index])
        c = np.concatenate((a, b))

        x = torch.tensor(c)
        y = (self._types[index], self._scores[index])
        return x, y

    def __len__(self):
        return self._types.shape[0]

In [17]:
class SBERTDataModule(LightningDataModule):
    def __init__(
        self,
        train_path: str,
        test_path: str,
        batch_size: int,
        train_batch_size: int,
        num_workers: int,
        persistent_workers: bool,
    ):
        super().__init__()

        self._train_dataset = None
        self._val_dataset = None
        self._test_dataset = None

        self._batch_size = batch_size
        self._train_batch_size = train_batch_size
        self._num_workers = num_workers
        self._persistent_workers = persistent_workers
        self._train_path = train_path
        self._test_path = test_path
        self._preparte_data_per_node = True

    def _split(self, dataset, proportion):
        a = int(len(dataset) * proportion)
        b = len(dataset) - a
        return random_split(dataset, (a, b))

    def prepare_data(self):
        pass

    def setup(self, stage):
        if self._train_dataset is not None:
            return

        self._train_dataset, self._val_dataset = self._split(
            SBERTDataset(self._train_path), TRAIN_VAL_SPLIT
        )
        self._test_dataset = SBERTDataset(self._test_path)

    def train_dataloader(self):
        return DataLoader(
            self._train_dataset,
            batch_size=self._train_batch_size,
            num_workers=self._num_workers,
            persistent_workers=self._persistent_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self._val_dataset,
            batch_size=self._batch_size,
            num_workers=self._num_workers,
            persistent_workers=self._persistent_workers,
        )

    def test_dataloader(self):
        return DataLoader(
            self._test_dataset,
            batch_size=self._batch_size,
            num_workers=self._num_workers,
            persistent_workers=self._persistent_workers,
        )

    def predict_dataloader(self):
        return DataLoader(
            self._test_dataset,
            batch_size=self._batch_size,
            num_workers=self._num_workers,
            shuffle=False,
            persistent_workers=self._persistent_workers,
        )


## Modele

### Pojedyncza warstwa, typ i dopasowanie uczone razem, model SBERT nie podlega treningowi

In [18]:
class SingleLayeredHeadJointLearningWithSBERTFrozen(LightningModule):
    def __init__(
        self, sbert_model: str = "all-mpnet-base-v2", learning_rate: float = 0.001
    ):
        super().__init__()
        self._scoring_head = torch.nn.Linear(in_features=SBERT_EMBEDDING_WIDTH * 2, out_features=1)
        self._class_head = torch.nn.Linear(
            in_features=SBERT_EMBEDDING_WIDTH * 2, out_features=len(TYPES_MAP)
        )
        self._learning_rate = learning_rate
        self.save_hyperparameters()

    def _step(self, batch, batch_idx, id: str):
        x, y = batch
        y_hat = self.forward(x)
        return self.loss(y, y_hat, id)

    def forward(self, x):
        score = torch.reshape(self._scoring_head(x), (-1,))
        cls = torch.nn.functional.softmax(self._class_head(x), dim=1)

        return cls, score

    def loss(self, y, y_hat, id):
        # Klasa i ocena uczone razem
        scoring_loss = torch.nn.functional.mse_loss(y_hat[1], y[1])
        class_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            y_hat[0], y[0]
        )
        return scoring_loss + class_loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "train")
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "val")
        return loss

    def test_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "test")
        return loss

    def predict_step(self, batch, batch_idx):
        x, y = batch
        types, scores = self.forward(x)

        return torch.argmax(types, dim=1), torch.clamp(
            torch.round(scores).int(), min=0, max=5
        )

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self._learning_rate)


## Trening

In [19]:
model = SingleLayeredHeadJointLearningWithSBERTFrozen()

In [20]:
data = SBERTDataModule(
    f'{DATA_DIR}/train.tsv', 
    f'{DATA_DIR}/test.tsv', 
    batch_size=BATCH_SIZE, 
    train_batch_size=TRAIN_BATCH_SIZE,
    num_workers=NUM_WORKERS,
    persistent_workers=PERSISTENT_WORKERS,
)

In [21]:
trainer = Trainer(accelerator=ACCELERATOR, max_epochs=EPOCHS, strategy="ddp_fork")

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# torch.multiprocessing.set_start_method('spawn')# good solution !!!! https://github.com/pytorch/pytorch/issues/40403

trainer.fit(model, data)

Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=gloo
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------


  | Name          | Type   | Params
-----------------------------------------
0 | _scoring_head | Linear | 1.5 K 
1 | _class_head   | Linear | 12.3 K
-----------------------------------------
13.8 K    Trainable params
0         Non-trainable params
13.8 K    Total params
0.055     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py", line 139, in _wrapping_function
    results = function(*args, **kwargs)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
    self._run(model, ckpt_path=self.ckpt_path)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1098, in _run
    results = self._run_stage()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1177, in _run_stage
    self._run_train()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1190, in _run_train
    self._run_sanity_check()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1255, in _run_sanity_check
    val_loop._reload_evaluation_dataloaders()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 234, in _reload_evaluation_dataloaders
    self.trainer.reset_val_dataloader()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1635, in reset_val_dataloader
    self.num_val_batches, self.val_dataloaders = self._data_connector._reset_eval_dataloader(
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 357, in _reset_eval_dataloader
    dataloaders = self._request_dataloader(mode)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 446, in _request_dataloader
    dataloader = source.dataloader()
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 524, in dataloader
    return method()
  File "/tmp/ipykernel_57917/1552273051.py", line 49, in val_dataloader
    return DataLoader(
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/lightning_lite/utilities/data.py", line 323, in wrapper
    init(obj, *args, **kwargs)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 246, in __init__
    raise ValueError('persistent_workers option needs num_workers > 0')
ValueError: persistent_workers option needs num_workers > 0


## Ewaluacja

In [None]:
trainer.test(model, data)

Testing: 0it [04:21, ?it/s]


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_46965/4014144725.py", line 23, in __getitem__
    a = self._sbert.encode(x1[index])
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/sentence_transformers/SentenceTransformer.py", line 153, in encode
    self.to(device)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 989, in to
    return self._apply(convert)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 641, in _apply
    module._apply(fn)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 641, in _apply
    module._apply(fn)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 641, in _apply
    module._apply(fn)
  [Previous line repeated 1 more time]
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 664, in _apply
    param_applied = fn(param)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 987, in convert
    return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
  File "/home/damiankolaska/Desktop/NLP_SBERT_interpretable_semantic_text_similarity/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 217, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method


In [None]:
predictions = trainer.predict(model, data)

  rank_zero_warn(


Predicting: 199it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 128/128 [02:50<00:00,  1.33s/it]


In [None]:
fields_sep = ' // '

def preds_to_wa(wa_content: str, preds_lines):
    wa_lines = wa_content.splitlines()

    idx = 0
    result = []

    for line in wa_lines:
        line_res = line

        if '<==>' in line:
            fields = line.split(fields_sep)
            preds_fields = preds_lines[idx].split()

            fields[1] = preds_fields[1]
            fields[2] = preds_fields[2]

            line_res = fields_sep.join(fields)
            idx += 1

        result.append(line_res)
    
    return '\n'.join(result)

In [None]:
def flatten(t):
    return [item for sublist in t for item in sublist]

types_inv_map = {v: k for k, v in TYPES_MAP.items()}

types = list(map(lambda t: types_inv_map[t], flatten([t.tolist() for t, s in predictions])))
scores = flatten([s.tolist() for t, s in predictions])

predictions = [
    f"{index}\t{item[0]} {item[1]}\n" for index, item in enumerate(zip(types, scores))
]

In [None]:
wa_file = os.path.join(DATA_DIR, f"STSint.testinput.{DATASET}.wa")
wa_output_file = os.path.join(DATA_DIR, f"STSint.testinput.{DATASET}-predictions.wa")

with open(wa_file) as file:
    wa_test = file.read()

wa_predictions = preds_to_wa(wa_test, predictions)

with open(wa_output_file, "w") as file:
    file.write(wa_predictions)

In [None]:
from subprocess import check_output

cmds = [
    f"perl evalF1_penalty.pl {wa_file} {wa_output_file}",
    f"perl evalF1_no_penalty.pl {wa_file} {wa_output_file}",
]

for cmd in cmds:
    print(f"Executing {cmd}")
    print(check_output(cmd.split(), cwd="./").decode())

Executing perl evalF1_penalty.pl ./data/sem_eval_2016/headlines/STSint.testinput.headlines.wa ./data/sem_eval_2016/headlines/STSint.testinput.headlines-predictions.wa


chunk aligned twice 10 (3 4 5):7 8 9 10 <==> 3 4 5 // OPPO // 4 // instead of releasing them <==> Palestinian prisoner release  at evalF1_penalty.pl line 165, <I> line 322.
chunk aligned twice 177 (1 2 3):1 2 3 <==> 1 2 3 // REL // 4 // Bangladesh collapse search <==> Bangladesh collapse deaths  at evalF1_penalty.pl line 165, <I> line 5913.
chunk aligned twice 230 (1 2 3):1 2 <==> 1 2 3 // REL // 4 // Arsenal stars <==> Arsenal great Rice  at evalF1_penalty.pl line 165, <I> line 7658.
chunk aligned twice 305 (3 4):4 <==> 3 4 // SIMI // 3 // who <==> Christian woman  at evalF1_penalty.pl line 165, <I> line 10154.
chunk aligned twice 362 (6):1 <==> 6 // EQUI // 5 // Gunman <==> Gunman  at evalF1_penalty.pl line 165, <I> line 12076.
chunk aligned twice 10 (3 4 5):7 8 9 10 <==> 3 4 5 // EQUI // 2 // instead of releasing them <==> Palestinian prisoner release  at evalF1_penalty.pl line 165, <I> line 12836.
chunk aligned twice 177 (1 2 3):1 2 3 <==> 1 2 3 // EQUI // 2 // Bangladesh collapse 

 F1 Ali     1.0000
 F1 Type    0.5552
 F1 Score   0.5399
 F1 Typ+Sco 0.3545

Executing perl evalF1_no_penalty.pl ./data/sem_eval_2016/headlines/STSint.testinput.headlines.wa ./data/sem_eval_2016/headlines/STSint.testinput.headlines-predictions.wa
 F1 Ali     1.0000
 F1 Type    0.5552
 F1 Score   0.5399
 F1 Typ+Sco 0.2382

