In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../")

In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "5"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

In [4]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from src.bert.dataset import BERTClassificationDataModule
from src.bert.model import BERTClassificationModel

In [5]:
DATA_PATH = "../data/sentiment_data.csv"

In [6]:
data = pd.read_csv(DATA_PATH)

In [7]:
data.SentimentScore -= 1

In [8]:
data

Unnamed: 0,issuerid,MessageText,SentimentScore
0,153,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,1
1,230,Ozon продолжает развивать специализированные ф...,3
2,118,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,3
3,220,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,4
4,89,​​Windfall Tax — налог на сверхприбыль. Какие ...,1
...,...,...,...
9316,157,#FLOT #Дивиденды 💰 7% — возможная дивдоходност...,3
9317,157,🇷🇺#FLOT #отчетность ЧИСТАЯ ПРИБЫЛЬ СОВКОМФЛОТ...,3
9318,225,​​Ключевой принцип создания портфеля 🔹Диверси...,2
9319,127,"""💥🇷🇺#PLZL #листинг #торги """"Полюс"""" ведет диа...",2


In [9]:
train_df, test_df = train_test_split(data, test_size=0.25, random_state=42)

In [11]:
MODEL_NAME = "cointegrated/rubert-tiny2"
BATCH_SIZE = 64
NUM_EPOCHS = 10

In [12]:
num_issuers_classes = train_df.issuerid.max() + 1
num_sentiment_classes = train_df.SentimentScore.nunique()
num_issuers_classes, num_sentiment_classes

(275, 5)

In [13]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [14]:
data_module = BERTClassificationDataModule(train_df=train_df,
                                           test_df=test_df,
                                           tokenizer=tokenizer,
                                           num_issuers_classes=num_issuers_classes,
                                           num_sentiment_classes=num_sentiment_classes,
                                           batch_size=BATCH_SIZE)
data_module.setup()

In [15]:
model = BERTClassificationModel(model_name=MODEL_NAME,
                                num_issuers_classes=num_issuers_classes,
                                num_sentiment_classes=num_sentiment_classes)

In [16]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="sentiment",
    save_top_k=1,
    verbose=True,
    monitor="loss/val",
    mode="min",
)

trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=NUM_EPOCHS,
    accelerator="cuda",
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
!rm -r lightning_logs

In [18]:
trainer.fit(model, data_module)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/lightning_logs
/home/worker/workspace/ctc-ocr-torch/venv/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [5]

  | Name           | Type       | Params
----------------------------------------------
0 | bert           | BertModel  | 29.2 M
1 | issuers_head   | Sequential | 150 K 
2 | sentiment_head | Sequential | 865 K 
------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 87: 'loss/val' reached 0.02543 (best 0.02543), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 174: 'loss/val' reached 0.01717 (best 0.01717), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 261: 'loss/val' reached 0.01452 (best 0.01452), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 348: 'loss/val' reached 0.01354 (best 0.01354), saving model to '/home/worker/workspace/hakaton-gagarin-sentiment_interface/pybooks/checkpoints/sentiment-v3.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 435: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 522: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 609: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 696: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 783: 'loss/val' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 870: 'loss/val' was not in top 1
`Trainer.fit` stopped: `max_epochs=10` reached.


In [19]:
trained_model = BERTClassificationModel.load_from_checkpoint("checkpoints/sentiment-v3.ckpt",
                                                             model_name=MODEL_NAME,
                                                             num_issuers_classes=num_issuers_classes,
                                                             num_sentiment_classes=num_sentiment_classes)

In [20]:
from src.bert.utils import evaluate_metric, generate_answer_batched

In [21]:
issuer_preds, sentiment_preds = generate_answer_batched(trained_model, tokenizer, test_df)

  0%|          | 0/37 [00:00<?, ?it/s]

In [22]:
test_df

Unnamed: 0,issuerid,MessageText,SentimentScore
1512,90,🟢 Топ российских акций: исключены акции Татнеф...,2
5981,185,🇷🇺#HYDR #отчетность ЧИСТАЯ ПРИБЫЛЬ РУСГИДРО П...,3
7105,227,"​​Индекс МосБиржи по итогам недели: +2,48% Ит...",2
960,36,🏁 Итоги дня: 9 марта 🥃 Акции Beluga Group се...,2
2771,48,💵 Текущая картина основных отсечек в этом диви...,3
...,...,...,...
1535,111,#LKOH ЛУКОЙЛ 🔴 Аномальная лимитка на продажу ...,1
6497,236,Просто идем снизу вверх: PLZL — рехай в золо...,2
7623,11,🔍 Взгляд на компанию: У Русала нет проблем со ...,3
5967,231,• Переезд в российскую юрисдикцию и возвращени...,3


In [23]:
# [pin]

evaluate_metric(
    company_predictions=issuer_preds,
    company_labels=test_df.issuerid.tolist(),
    sentiment_predictions=sentiment_preds,
    sentiment_labels=test_df.SentimentScore.tolist(), 
)

{'total': 50.62072020966396,
 'f1': 0.5658249576038326,
 'accuracy': 0.4465894465894466}