In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

In [3]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [4]:
import random
import numpy as np
import torch
import re

def seed_everything(seed=10):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

In [5]:
# [pin]
file_path = "../data/data-hard.csv"
root_path = "../data/"


df = pd.read_csv(file_path)
df["prefix"] = "clsorg"
df = df.rename({"message": "input_text", "label": "target_text"}, axis=1)

# df["input_text"] = df["input_text"].apply(preprocess_text)
df.drop(2764)
df.sample(20)

Unnamed: 0,input_text,target_text,prefix
5222,üá∑üá∫#TCSG #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ö–û–ù–°–ï–ù–°–£–°: TCS Group –≤–æ I...,225-4,clsorg
4135,‚è∞ –î–æ–±—Ä–æ–µ —É—Ç—Ä–æ! 16 –º–∞—Ä—Ç–∞ üåç –ù–æ—á–Ω–æ–µ –¥–µ–∂—É—Ä—Å—Ç–≤–æ (–∑...,228-3;251-3,clsorg
3130,–°–ü–ë –ë–∏—Ä–∂–∞ –Ω–∞—á–Ω–µ—Ç —Ç–æ—Ä–≥–∏ —Ü–µ–Ω–Ω—ã–º–∏ –±—É–º–∞–≥–∞–º–∏ –≤–æ—Å—å–º–∏...,255-4,clsorg
1629,#SMLT –û—Ç–∫—É–ø –≤ –∞–∫—Ü–∏—è—Ö –°–∞–º–æ–ª—ë—Ç–∞: –≥—ç–ø –Ω–∞ –æ—Ç–∫—Ä—ã—Ç–∏...,56-3,clsorg
3917,‚Äã‚Äãüü¢ –ò–¢–û–ì–ò –î–ù–Ø. –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∞–∫—Ü–∏–∏ –Ω–µ–º–Ω–æ–≥–æ –ø–æ–¥—Ä–æ—Å...,90-2;152-2,clsorg
4392,‚õîÔ∏è –†–æ—Å—Å–∏—è –∑–∞–ø—Ä–µ—â–∞–µ—Ç —ç–∫—Å–ø–æ—Ä—Ç –±–µ–Ω–∑–∏–Ω–∞. –ö–∞–∫–∏–µ –∫–æ–º...,25-2,clsorg
5193,üá∑üá∫#SPBE #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ò—Ç–æ–≥–∏ —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –°–ü–ë –ë–∏—Ä–∂–µ...,255-3,clsorg
2996,–ü—Ä–æ—Å—Ç–æ –≤—Å–ø–æ–º–Ω–∏—Ç–µ –∫–∞–∫ –ø—Ä–æ—Å—Ä–∞–ª—Å—è –°–µ–≤–∫–∞ –Ω–∞ –æ–∂–∏–¥–∞–Ω...,90-5,clsorg
5391,üá∑üá∫#–∞–≤–∏–∞ #—Ä–æ—Å—Å–∏—è –†–æ—Å—Å–∏—è –º–æ–∂–µ—Ç –æ—Ç–∫—Ä—ã—Ç—å –ø—Ä—è–º—ã–µ –∞...,32-4,clsorg
4306,‚ö°Ô∏è –°–±–µ—Ä (SBER) –æ—Ç—ã–≥—Ä–∞–ª –≤—Å—ë –ø–∞–¥–µ–Ω–∏–µ –Ω–∞ –°–í–û. #—Ö–≤...,150-3,clsorg


In [6]:
m_name = "cointegrated/rut5-small"
tokenizer = T5Tokenizer.from_pretrained(m_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
whitelist = [str(num) for num in range(280)]
whitelist_ids = [tokenizer.encode(word)[0] for word in whitelist]
bad_words_ids=[[id] for id in range(tokenizer.vocab_size) if id not in whitelist_ids]

In [8]:
import sys

sys.path.append("/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/src")

from t5.dataset import NERDataModel
from t5.model import NERModel

BATCH_SIZE = 128
EPOCHS = 4
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)
data_module = NERDataModel(train_df, test_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [10]:
model = NERModel(m_name, lr=0.0007)

checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints2",
    filename="ner",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=EPOCHS,
    accelerator="cuda",
)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


T5ForConditionalGeneration(
  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/.conda/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [11]:
!rm -r lightning_logs

In [12]:
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/lightning_logs
/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/.conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [6]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 64.6 M
---------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/.conda/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (43) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 43: 'val_loss' reached 1.63819 (best 1.63819), saving model to '/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2/ner-v21.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 86: 'val_loss' reached 1.05920 (best 1.05920), saving model to '/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2/ner-v21.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 129: 'val_loss' reached 0.91543 (best 0.91543), saving model to '/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2/ner-v21.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 172: 'val_loss' reached 0.89289 (best 0.89289), saving model to '/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2/ner-v21.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=4` reached.


In [13]:
# import torch
# checkpoint_path = "best_model.pth"

# # Save the entire model state dictionary
# torch.save(model.state_dict(), checkpoint_path)

In [14]:
# import torch
# trained_model = NERModel(m_name)
# trained_model.load_state_dict(torch.load("model.pth"))

In [15]:
# trained_model = NERModel.load_from_checkpoint("/home/worker/workspace/hakatons/hakaton-gagarin-sentiment_interface/pybooks/checkpoints2/ner-v2.ckpt")
# trained_model.freeze()

In [9]:
model = torch.load('final_model.pth')


In [10]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

NERModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(20100, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(20100, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=384, bias=False)
                (k): Linear(in_features=512, out_features=384, bias=False)
                (v): Linear(in_features=512, out_features=384, bias=False)
                (o): Linear(in_features=384, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 6)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=512, out_features=1024, bias=False)
                (wi_1): Linear(in_fe

In [19]:
from t5.utils import evaluate_metric, generate_answer_batched

predictions = generate_answer_batched(
    trained_model=model, tokenizer=tokenizer, data=test_df, batch_size=64
)

  0%|          | 0/26 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26/26 [00:35<00:00,  1.38s/it]


In [20]:
len(test_df)

1797

In [12]:
ldf = test_df.copy()
ldf["predictions"] = predictions

In [13]:
ldf

Unnamed: 0,input_text,target_text,prefix,predictions
3337,"–¢—Ä–µ–Ω–¥—ã, —Ü–∏—Ñ—Ä—ã, —Ñ–∞–∫—Ç—ã: —Ä—ã–Ω–æ–∫ 14 –º–∞—Ä—Ç–∞ üìâ –í —Å—Ä–µ...",111-4;241-3;160-4,clsorg,160-5
5144,üá∑üá∫#SBER #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –°–ë–ï–†–ë–ê–ù–ö –í –Ø–ù–í–ê–†–ï 2023–ì –£...,150-5,clsorg,150-4
5062,üá∑üá∫#POSI #–¥–∏–≤–∏–¥–µ–Ω–¥ –°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ Positive Te...,241-4,clsorg,241-4
4417,‚ú¥Ô∏è#MATIC #–∫—Ä–∏–ø—Ç–æ Polygon —Å—Ç–∞–ª —Å–≤–æ–µ–≥–æ —Ä–æ–¥–∞ –º–∞—è...,235-0,clsorg,235-0
1615,#SBER –°–±–µ—Ä–±–∞–Ω–∫ üî∑ –ê–Ω–æ–º–∞–ª—å–Ω—ã–π –æ–±—ä—ë–º –ò–∑–º–µ–Ω–µ–Ω–∏–µ —Ü–µ...,150-5,clsorg,150-5
...,...,...,...,...
5871,üí•üá∑üá∫#GMKN = –º–∞–∫—Å –∑–∞ 1 –º–µ—Å,53-3,clsorg,53-3
4325,‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏è –ü–æ –∏—Ç–æ–≥–∞–º 2023 –≥–æ–¥–∞ –¥–∏–≤–∏–¥–µ–Ω–¥ –ú–ú–ö (MAGN)...,90-4,clsorg,90-4
1027,"""üèÅ –ò—Ç–æ–≥–∏ –¥–Ω—è: 4 –∞–ø—Ä–µ–ª—è üìà –ü—Ä–∏–≤–∏–ª–µ–≥–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –∞–∫...",36-4;160-4;187-3;48-3,clsorg,160-4
7032,üü¢ –ù–æ–≤—ã–π –≤—ã–ø—É—Å–∫ –µ–∂–µ–Ω–µ–¥–µ–ª—å–Ω–æ–≥–æ –≤–∏–¥–µ–æ Top News –æ—Ç...,32-3;227-3;111-3;90-3,clsorg,90-3;111-3;111-3


In [14]:

ldf[["tcomp", "tsent"]] = (
    ldf["target_text"].str.split(";", expand=True)[0].str.split("-", expand=True)
)
ldf[["pcomp", "psent"]] = (
    ldf["predictions"].str.split(";", expand=True)[0].str.split("-", expand=True)
)

In [15]:
import re

for index, row in ldf.iterrows():
    pcomp_digits = re.sub(r'\D', '', str(row['pcomp']))
  
    if pcomp_digits == '':
        ldf.at[index, 'pcomp'] = '0' 
    else:
        ldf.at[index, 'pcomp'] = pcomp_digits 
        # try:
        #     i = int(row['pcomp'])
        # except ValueError:
        #     print(row)
        #     ldf.at[index, 'pcomp'] = '0'

In [16]:
# [pin]

evaluate_metric(
    company_predictions=ldf["pcomp"].tolist(),
    company_labels=ldf["tcomp"].tolist(),
    sentiment_predictions=ldf["psent"].tolist(),
    sentiment_labels=ldf["tsent"].tolist(),
)

{'total': 66.44414139494042,
 'f1': 0.7178644639589083,
 'accuracy': 0.6110183639398998}

In [23]:
model

NERModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(20100, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(20100, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=384, bias=False)
                (k): Linear(in_features=512, out_features=384, bias=False)
                (v): Linear(in_features=512, out_features=384, bias=False)
                (o): Linear(in_features=384, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 6)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=512, out_features=1024, bias=False)
                (wi_1): Linear(in_fe

In [None]:
entities_found = ldf["target_text"].values.tolist()

results = []

for row in entities_found:
    for entity in row.split(";"):
        t = []
        tup = entity.split('-')
        entity_id, entity_score = tup
        t.append((entity_id, entity_score))
    results.append(t)
    


    

In [None]:
results

[[('160', '4')],
 [('150', '5')],
 [('241', '4')],
 [('235', '0')],
 [('150', '5')],
 [('241', '4')],
 [('160', '5')],
 [('112', '4')],
 [('56', '4')],
 [('204', '4')],
 [('72', '2')],
 [('90', '4')],
 [('221', '4')],
 [('90', '4')],
 [('56', '3')],
 [('235', '0')],
 [('265', '3')],
 [('36', '4')],
 [('220', '3')],
 [('47', '2')],
 [('48', '3')],
 [('227', '3')],
 [('127', '4')],
 [('111', '4')],
 [('157', '4')],
 [('152', '4')],
 [('56', '3')],
 [('127', '3')],
 [('127', '3')],
 [('224', '3')],
 [('175', '2')],
 [('100', '3')],
 [('103', '4')],
 [('157', '3')],
 [('235', '4')],
 [('254', '4')],
 [('7', '4')],
 [('241', '4')],
 [('56', '3')],
 [('112', '2')],
 [('223', '4')],
 [('150', '4')],
 [('225', '4')],
 [('236', '4')],
 [('111', '5')],
 [('236', '3')],
 [('228', '4')],
 [('33', '3')],
 [('129', '4')],
 [('53', '3')],
 [('236', '3')],
 [('89', '5')],
 [('236', '3')],
 [('7', '2')],
 [('235', '3')],
 [('235', '3')],
 [('235', '0')],
 [('109', '3')],
 [('225', '5')],
 [('160', '5')

In [25]:

# torch.save(model, 'final_model.pth')