In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the parent directory on the import path so that the `src` package used
# by the imports further down can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
import os

# Restrict this process to one GPU. With PCI_BUS_ID ordering the index "5" is
# meant to match the bus-order numbering (presumably the one `nvidia-smi`
# shows; confirm on the target machine). Set here, before the cell that
# imports the training libraries.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "5",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    }
)

In [4]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from src.bert.dataset import BERTClassificationDataModule
from src.bert.model import BERTClassificationModel

In [5]:
# CSV holding the messages together with their issuer id and sentiment score;
# the path is relative to the notebook's working directory.
DATA_PATH = "../data/sentiment_data.csv"

In [6]:
# Load the whole dataset into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [7]:
# Shift every sentiment score down by one (presumably turning 1-based scores
# into 0-based class indices; confirm against the raw CSV).
# The frame is rebuilt from the file instead of being decremented in place, so
# re-executing this cell always yields the same labels; an in-place `-= 1`
# would shift them again on every run.
data = pd.read_csv(DATA_PATH).assign(
    SentimentScore=lambda frame: frame["SentimentScore"] - 1
)

In [8]:
# Display the frame (pandas truncates the rendering to the first/last rows).
data

Unnamed: 0,issuerid,MessageText,SentimentScore
0,153,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 20...,1
1,230,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...,3
2,118,‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë ...,3
3,220,‚Äã–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë ...,4
4,89,‚Äã‚ÄãWindfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ ...,1
...,...,...,...
9316,157,#FLOT #–î–∏–≤–∏–¥–µ–Ω–¥—ã üí∞ 7% ‚Äî –≤–æ–∑–º–æ–∂–Ω–∞—è –¥–∏–≤–¥–æ—Ö–æ–¥–Ω–æ—Å—Ç...,3
9317,157,üá∑üá∫#FLOT #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ß–ò–°–¢–ê–Ø –ü–†–ò–ë–´–õ–¨ –°–û–í–ö–û–ú–§–õ–û–¢...,3
9318,225,‚Äã‚Äã–ö–ª—é—á–µ–≤–æ–π –ø—Ä–∏–Ω—Ü–∏–ø —Å–æ–∑–¥–∞–Ω–∏—è –ø–æ—Ä—Ç—Ñ–µ–ª—è üîπ–î–∏–≤–µ—Ä—Å–∏...,2
9319,127,"""üí•üá∑üá∫#PLZL #–ª–∏—Å—Ç–∏–Ω–≥ #—Ç–æ—Ä–≥–∏ """"–ü–æ–ª—é—Å"""" –≤–µ–¥–µ—Ç –¥–∏–∞...",2


In [9]:
# Hold out a quarter of the rows for evaluation; the fixed random_state makes
# the split reproducible across runs.
train_df, test_df = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Pretrained checkpoint identifier, passed to both the tokenizer and the model.
MODEL_NAME = "cointegrated/rubert-tiny2"
# Batch size handed to the data module.
BATCH_SIZE = 64
# Upper bound on the number of training epochs (Trainer's max_epochs).
NUM_EPOCHS = 10

In [12]:
# Derive the size of each classification head from the full dataset rather
# than from the training split alone: a label that lands only in the test
# split would otherwise exceed the model's output range. `max() + 1` is used
# for both heads (instead of `nunique()`) so the count stays valid even when
# the ids are not contiguous. Assumes both columns hold non-negative integer
# class indices.
num_issuers_classes = int(data["issuerid"].max()) + 1
num_sentiment_classes = int(data["SentimentScore"].max()) + 1
num_issuers_classes, num_sentiment_classes

(275, 5)

In [13]:
# Load the tokenizer published under the same name as the pretrained model.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [14]:
# Bundle both splits, the tokenizer and the label-space sizes into the
# project's data module.
data_module = BERTClassificationDataModule(
    train_df=train_df,
    test_df=test_df,
    tokenizer=tokenizer,
    num_issuers_classes=num_issuers_classes,
    num_sentiment_classes=num_sentiment_classes,
    batch_size=BATCH_SIZE,
)
# Called explicitly here, ahead of training.
data_module.setup()

In [15]:
# Instantiate the classifier with one output head per task, sized by the
# class counts computed above.
model = BERTClassificationModel(
    model_name=MODEL_NAME,
    num_issuers_classes=num_issuers_classes,
    num_sentiment_classes=num_sentiment_classes,
)

In [16]:
# Keep only the single best checkpoint, where "best" means the lowest value
# of the logged "loss/val" metric.
checkpoint_callback = ModelCheckpoint(
    monitor="loss/val",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="sentiment",
    verbose=True,
)

# Train on the CUDA device for at most NUM_EPOCHS epochs.
trainer = Trainer(
    accelerator="cuda",
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
import shutil
from pathlib import Path

# Remove logs left over from earlier runs. Done in Python instead of shelling
# out to `rm -r`, so a missing directory (first run) is not reported as an
# error and the cell does not depend on a POSIX shell. Genuine failures such
# as permission errors still raise.
logs_dir = Path("lightning_logs")
if logs_dir.exists():
    shutil.rmtree(logs_dir)

In [18]:
# Run training, drawing batches from the data module. The trainer was
# configured with max_epochs=NUM_EPOCHS and the checkpoint callback.
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/lightning_logs
/home/worker/workspace/ctc-ocr-torch/venv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [5]

  | Name           | Type       | Params
----------------------------------------------
0 | bert           | BertModel  | 29.2 M
1 | issuers_head   | Sequential | 150 K 
2 | sentiment_head | Sequential | 865 K 
------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 87: 'loss/val' reached 0.02543 (best 0.02543), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 174: 'loss/val' reached 0.01717 (best 0.01717), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 261: 'loss/val' reached 0.01452 (best 0.01452), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 348: 'loss/val' reached 0.01354 (best 0.01354), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 435: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 522: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 609: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 696: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 783: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 870: 'loss/val' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


In [19]:
# Reload the best weights recorded by the checkpoint callback during this run.
# Lightning appends a version suffix (e.g. "sentiment-v3.ckpt") when the
# target file already exists, so a hard-coded file name would silently point
# at a stale checkpoint after the next training run.
best_checkpoint_path = checkpoint_callback.best_model_path
if not best_checkpoint_path:
    # best_model_path is an empty string until a checkpoint has been written.
    raise RuntimeError("No checkpoint was saved; run trainer.fit(...) first.")

trained_model = BERTClassificationModel.load_from_checkpoint(
    best_checkpoint_path,
    model_name=MODEL_NAME,
    num_issuers_classes=num_issuers_classes,
    num_sentiment_classes=num_sentiment_classes,
)

In [20]:
from src.bert.utils import evaluate_metric, generate_answer_batched

In [21]:
# Run the reloaded model over the held-out split. The helper returns two
# results, unpacked here as issuer predictions and sentiment predictions.
issuer_preds, sentiment_preds = generate_answer_batched(trained_model, tokenizer, test_df)

  0%|          | 0/37 [00:00<?, ?it/s]

In [22]:
# Display the held-out split that the predictions are scored against.
test_df

Unnamed: 0,issuerid,MessageText,SentimentScore
1512,90,üü¢ –¢–æ–ø —Ä–æ—Å—Å–∏–π—Å–∫–∏—Ö –∞–∫—Ü–∏–π: –∏—Å–∫–ª—é—á–µ–Ω—ã –∞–∫—Ü–∏–∏ –¢–∞—Ç–Ω–µ—Ñ...,2
5981,185,üá∑üá∫#HYDR #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ß–ò–°–¢–ê–Ø –ü–†–ò–ë–´–õ–¨ –†–£–°–ì–ò–î–†–û –ü...,3
7105,227,"‚Äã‚Äã–ò–Ω–¥–µ–∫—Å –ú–æ—Å–ë–∏—Ä–∂–∏ –ø–æ –∏—Ç–æ–≥–∞–º –Ω–µ–¥–µ–ª–∏: +2,48% –ò—Ç...",2
960,36,üèÅ –ò—Ç–æ–≥–∏ –¥–Ω—è: 9 –º–∞—Ä—Ç–∞ ü•É –ê–∫—Ü–∏–∏ Beluga Group —Å–µ...,2
2771,48,üíµ –¢–µ–∫—É—â–∞—è –∫–∞—Ä—Ç–∏–Ω–∞ –æ—Å–Ω–æ–≤–Ω—ã—Ö –æ—Ç—Å–µ—á–µ–∫ –≤ —ç—Ç–æ–º –¥–∏–≤–∏...,3
...,...,...,...
1535,111,#LKOH –õ–£–ö–û–ô–õ üî¥ –ê–Ω–æ–º–∞–ª—å–Ω–∞—è –ª–∏–º–∏—Ç–∫–∞ –Ω–∞ –ø—Ä–æ–¥–∞–∂—É ...,1
6497,236,–ü—Ä–æ—Å—Ç–æ –∏–¥–µ–º —Å–Ω–∏–∑—É –≤–≤–µ—Ä—Ö: PLZL ‚Äî —Ä–µ—Ö–∞–π –≤ –∑–æ–ª–æ...,2
7623,11,üîç –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: –£ –†—É—Å–∞–ª–∞ –Ω–µ—Ç –ø—Ä–æ–±–ª–µ–º —Å–æ ...,3
5967,231,‚Ä¢ –ü–µ—Ä–µ–µ–∑–¥ –≤ —Ä–æ—Å—Å–∏–π—Å–∫—É—é —é—Ä–∏—Å–¥–∏–∫—Ü–∏—é –∏ –≤–æ–∑–≤—Ä–∞—â–µ–Ω–∏...,3


In [23]:
# [pin]

# Ground-truth labels of the held-out split, in row order, as plain lists.
issuer_labels = test_df["issuerid"].tolist()
sentiment_labels = test_df["SentimentScore"].tolist()

# Score both prediction sets against their labels; the returned metrics are
# the cell's displayed output.
evaluate_metric(
    company_predictions=issuer_preds,
    company_labels=issuer_labels,
    sentiment_predictions=sentiment_preds,
    sentiment_labels=sentiment_labels,
)

{'total': 50.62072020966396,
 'f1': 0.5658249576038326,
 'accuracy': 0.4465894465894466}