In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Restrict this kernel to one GPU. With PCI_BUS_ID ordering the index
# refers to devices enumerated by PCI bus id.
# These variables only take effect if set before CUDA is initialised.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "6",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    }
)

In [3]:
import os

import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer

# Local File System
# Local file system locations
file_path = "../data/data.csv"
root_path = "../data/"


# Load the raw rows and shape them into the text-to-text layout used by the
# dataset class: a constant task prefix plus input/target text columns.
df = pd.read_csv(file_path)
df = df.assign(prefix="clsorg").rename(
    columns={"message": "input_text", "label": "target_text"}
)
df.tail()

Unnamed: 0,input_text,target_text,prefix
16604,🪨 Взгляд на компанию: «Мечел» — эффект от отме...,99,clsorg
16605,🪨 Взгляд на компанию: «Мечел»: 3-й кв. 2023 г....,99,clsorg
16606,🪨 Мечел: акции с потенциалом роста свыше 90% д...,99,clsorg
16607,🪨 Мечел: анализ ключевых тем. Взгляд БКС Мы п...,99,clsorg
16608,🫶 АФК Система обещает дивы. Верим? Основатель...,26,clsorg


In [4]:
# Name of the pretrained checkpoint; the same name is used when the model
# weights are loaded in NERModel.
# NOTE(review): the input texts are Russian - confirm that this checkpoint's
# vocabulary covers Cyrillic; otherwise consider a multilingual checkpoint.
m_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(m_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
class NERDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row must provide the columns ``prefix``, ``input_text`` and
    ``target_text``. The text fed to the model is
    ``"<prefix>: <input_text>"``.

    Args:
        data: Frame holding the rows to serve.
        tokenizer: Tokenizer used for both the source and the target text.
        source_max_token_length: Length the encoded input is padded or
            truncated to.
        target_max_token_length: Length the encoded target is padded or
            truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        source_max_token_length: int = 396,
        target_max_token_length: int = 32,
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Encode the row at position ``index``.

        Returns:
            dict with the raw ``input_text`` / ``target_text`` and the 1-D
            tensors ``input_ids``, ``attention_mask`` and ``labels``.
        """
        data_row = self.data.iloc[index]
        input_text = data_row["prefix"] + ": " + data_row["input_text"]

        # Use the tokenizer handed to this dataset, not whatever object
        # happens to be bound to a global name in the notebook.
        source_encoding = self.tokenizer(
            input_text,
            max_length=self.source_max_token_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        target_encoding = self.tokenizer(
            # Targets may be parsed as numbers when every label is numeric.
            str(data_row["target_text"]),
            max_length=self.target_max_token_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # Replace padding with -100, the label value the loss ignores.
        labels = target_encoding["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            input_text=input_text,
            target_text=data_row["target_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
        )

In [6]:
class NERDataModel(pl.LightningDataModule):
    """Lightning data module built from a train frame and a held-out frame.

    The held-out frame (``test_df``) backs both the validation and the test
    dataloader.

    Args:
        train_df: Rows used for training.
        test_df: Rows used for validation and testing.
        tokenizer: Tokenizer forwarded to every ``NERDataset``.
        batch_size: Batch size of the train and validation loaders.
        source_max_token_length: Encoded input length passed to the datasets.
        target_max_token_length: Encoded target length passed to the datasets.
        train_num_workers: Worker processes of the training loader.
        eval_num_workers: Worker processes of the validation/test loaders.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_length: int = 396,
        target_max_token_length: int = 32,
        train_num_workers: int = 10,
        eval_num_workers: int = 16,
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.train_dataset = None
        self.test_dataset = None
        self.tokenizer = tokenizer
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length
        self.train_num_workers = train_num_workers
        self.eval_num_workers = eval_num_workers

    def _build_dataset(self, frame: pd.DataFrame):
        """Wrap ``frame`` in a NERDataset using this module's settings."""
        return NERDataset(
            frame,
            self.tokenizer,
            self.source_max_token_length,
            self.target_max_token_length,
        )

    def setup(self, stage=None):
        """Create the train and held-out datasets (``stage`` is not used)."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.train_num_workers,
        )

    def val_dataloader(self):
        """Unshuffled loader over the held-out dataset."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.eval_num_workers,
        )

    def test_dataloader(self):
        """Loader over the held-out dataset, one sample per batch."""
        return DataLoader(
            self.test_dataset, batch_size=1, num_workers=self.eval_num_workers
        )

In [7]:
# Training configuration
BATCH_SIZE = 128
EPOCHS = 10

# Hold out a quarter of the rows; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

data_module = NERDataModel(
    train_df=train_df,
    test_df=test_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [8]:
class NERModel(pl.LightningModule):
    """Lightning wrapper around a pretrained T5 conditional-generation model.

    The checkpoint named by the module-level ``m_name`` is loaded on
    construction.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(
            m_name, return_dict=True
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        Returns:
            Tuple ``(loss, logits)`` taken from the model output.
        """
        output = self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters (lr=0.001).

        ``transformers.AdamW`` is deprecated in favour of
        ``torch.optim.AdamW``. ``eps`` and ``weight_decay`` are pinned to the
        defaults of the transformers implementation so the training setup
        stays as close as possible to the previous one.
        """
        return torch.optim.AdamW(
            self.parameters(), lr=0.001, eps=1e-6, weight_decay=0.0
        )

In [9]:
# Instantiate the Lightning module; its constructor loads the pretrained
# weights named by m_name.
model = NERModel()

In [11]:
# Keep only the single best checkpoint, judged by the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="ner",
    verbose=True,
)

# Train on CUDA for at most EPOCHS epochs with the callback above attached.
trainer = Trainer(
    accelerator="cuda",
    max_epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/worker/workspace/hakaton-gagarin-sentiment_interface/.conda/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [62]:
# Remove logs left over from previous runs. Unlike `!rm -r`, this does not
# depend on a POSIX shell and does not fail when the directory is absent,
# so the cell stays safe under Restart & Run All.
import shutil

shutil.rmtree("lightning_logs", ignore_errors=True)

In [63]:
# Run training and validation with the dataloaders supplied by data_module.
trainer.fit(model, data_module)

Missing logger folder: /home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [6]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 98: 'val_loss' reached 1.25994 (best 1.25994), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/ner-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 196: 'val_loss' reached 1.04948 (best 1.04948), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/ner-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 294: 'val_loss' reached 0.97783 (best 0.97783), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/ner-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 392: 'val_loss' reached 0.95075 (best 0.95075), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/ner-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 490: 'val_loss' reached 0.95019 (best 0.95019), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/ner-v4.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 588: 'val_loss' was not in top 1
/home/worker/workspace/hakaton-gagarin-sentiment_interface/.conda/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [10]:
# Restore the trained weights from disk and freeze the module for inference.
# NOTE(review): the "-v4" suffix presumably comes from Lightning versioning
# the file name because earlier "ner*.ckpt" files already existed on this
# machine - adjust the path to the checkpoint your own run produced.
trained_model = NERModel.load_from_checkpoint("checkpoints/ner-v4.ckpt")
trained_model.freeze()

In [11]:
def generate_answer(data_row):
    """Generate the model's prediction for a single row.

    Uses the notebook-level ``tokenizer`` and ``trained_model``.

    Args:
        data_row: Row (Series or mapping) with string fields ``prefix`` and
            ``input_text``.

    Returns:
        The decoded prediction as a single string.
    """
    # Follow the device the weights live on instead of assuming CUDA.
    device = next(trained_model.model.parameters()).device

    with torch.no_grad():
        source_encoding = tokenizer(
            data_row["prefix"] + ": " + data_row["input_text"],
            max_length=396,
            padding="max_length",
            # A single sequence has no "second" segment to truncate, so
            # "only_second" is not a meaningful strategy here. Truncate the
            # same way as during training and in the batched helper.
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        generated_ids = trained_model.model.generate(
            input_ids=source_encoding["input_ids"].to(device),
            attention_mask=source_encoding["attention_mask"].to(device),
            num_beams=3,
            max_length=80,
            repetition_penalty=1.0,
            early_stopping=True,
            use_cache=True,
        ).cpu()

        preds = [
            tokenizer.decode(
                generated_id,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for generated_id in generated_ids
        ]

    return "".join(preds)

In [12]:
import numpy as np
from tqdm.notebook import tqdm


def generate_answer_batched(data: pd.DataFrame, batch_size: int = 64):
    """Generate predictions for every row of ``data`` in batches.

    Uses the notebook-level ``tokenizer`` and ``trained_model``.

    Args:
        data: Frame with string columns ``prefix`` and ``input_text``.
        batch_size: Number of rows encoded and generated per step.

    Returns:
        List of decoded prediction strings, one per row, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Follow the device the weights live on instead of assuming CUDA.
    device = next(trained_model.model.parameters()).device

    predictions = []
    with torch.no_grad():
        for start in tqdm(range(0, len(data), batch_size)):
            batch = data.iloc[start : start + batch_size]
            source_encoding = tokenizer(
                (batch["prefix"] + ": " + batch["input_text"]).tolist(),
                max_length=396,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                add_special_tokens=True,
                return_tensors="pt",
            )

            generated_ids = trained_model.model.generate(
                input_ids=source_encoding["input_ids"].to(device),
                attention_mask=source_encoding["attention_mask"].to(device),
                num_beams=3,
                max_length=80,
                repetition_penalty=1.0,
                early_stopping=True,
                use_cache=True,
            ).cpu()

            # extend() keeps accumulation linear; sum(list_of_lists, [])
            # re-copies the growing result on every addition.
            predictions.extend(
                tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            )

    return predictions

In [13]:
# Generate predictions for every row of the held-out frame, 512 rows per batch.
predictions = generate_answer_batched(test_df, batch_size=512)

  0%|          | 0/9 [00:00<?, ?it/s]

In [27]:
# One row per predicted label: attach the raw predictions, split each
# ";"-separated string into a list and explode the list into rows.
res_df = (
    test_df.drop(columns=["target_text"])
    .assign(predictions=predictions)
    .assign(predictions=lambda frame: frame["predictions"].str.split(";"))
    .explode("predictions")
)
res_df.shape

(4232, 3)

In [26]:
# One row per expected label: split the ";"-separated targets and explode.
exp_df = test_df.assign(
    target_text=test_df["target_text"].str.split(";")
).explode("target_text")
exp_df.shape

(4812, 3)

In [32]:
# Pair every expected label with the predictions made for the same text;
# texts without a match get the placeholder "-1".
# NOTE(review): input_text repeats in both frames after explode, so a text
# with several targets and several predictions is joined as a cross product -
# confirm this is the intended pairing.
mrg_df = (
    exp_df[["input_text", "target_text"]]
    .merge(res_df[["input_text", "predictions"]], how="left", on="input_text")
    .fillna("-1")
)

In [53]:
# Compare only the first label of each row: keep the part before the first
# ";" of both the prediction and the target. Values are assigned
# positionally, so predictions must be in test_df row order.
ldf = test_df.assign(
    predictions=pd.Series(predictions).str.split(";", expand=True)[0].to_numpy(),
    target_text=test_df["target_text"].str.split(";", expand=True)[0].to_numpy(),
)
ldf.sample(20)

Unnamed: 0,input_text,target_text,prefix,predictions
11387,По BANEP кста почти закрыли дивгэп. Ни на что...,25,clsorg,25
6076,#риком_обзоры 📂 «Лента» — крупнейшая сеть гип...,229,clsorg,218
15554,"📆 #Календарь недели: 24-28 июля Понедельник,...",231,clsorg,235
5723,#VTBR ⚡ Чистая прибыль ВТБ за 1кв2023 оценивае...,7,clsorg,7
8193,????????#ipo #FLOT ВЬЮГИН: СОВКОМФЛОТ ОЦЕНЕН 2...,157,clsorg,157
2788,"""Льготная ипотека живет! МОСКВА, 4 сентября...",218,clsorg,230
7762,????#брокерырф #фининдустриярф #VTBR В 2018 го...,7,clsorg,7
14183,🇷🇺#MTSS #ipo Акционеры МТС-банка воздержались ...,100,clsorg,100
9492,TRCN - ДИВИДЕНДЫ СОВЕТ ДИРЕКТОРОВ ТРАНСКОНТЕЙН...,190,clsorg,190
14189,🇷🇺#MTSS #отчетность 19 мая - МТС операционны...,100,clsorg,100


In [54]:
import evaluate

# Weighted F1 between the first predicted label and the first expected label.
metric = evaluate.load("f1")
final_score = metric.compute(
    references=ldf["target_text"].tolist(),
    predictions=ldf["predictions"].tolist(),
    average="weighted",
)
final_score

{'f1': 0.6451461823244122}