In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Put the parent directory on the import path so the `src` package used by
# the imports further down can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer piles up duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [None]:
import os

# Pin the process to a single GPU (index 2, with devices numbered by PCI bus
# id). These variables are set before the CUDA-using libraries are imported
# in the next cell.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "2",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    }
)

In [None]:
import pandas as pd
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

from src.bert.dataset import BERTClassificationDataModule
from src.bert.model import BERTClassificationModel

In [None]:
# Location of the sentiment CSV read by pd.read_csv below; the path is
# relative, so it resolves against the notebook's working directory.
DATA_PATH = "../data/sentiment_data.csv"

In [None]:
# Load the CSV and shift SentimentScore down by one in a single expression.
# Keeping the load and the shift together makes the cell idempotent: when the
# shift lived in its own cell as an in-place `-= 1`, every re-run of that cell
# moved the labels down by one more.
# NOTE(review): the shift presumably turns 1-based scores into 0-based class
# indices -- confirm against the dataset.
data = pd.read_csv(DATA_PATH).assign(
    SentimentScore=lambda frame: frame["SentimentScore"] - 1
)

In [None]:
# Show the frame as the cell output to inspect it after the SentimentScore shift.
data

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Pretrained checkpoint id, used for both the tokenizer and the model.
MODEL_NAME = "cointegrated/rubert-tiny2"
# Passed to the data module as `batch_size`.
BATCH_SIZE = 64
# Passed to the Trainer as `max_epochs`.
NUM_EPOCHS = 5

In [None]:
# Size both classification heads from the FULL dataset rather than from the
# training split alone: an issuer id or sentiment class that only lands in the
# test split would otherwise be out of range for the heads.
# Issuer ids are used directly as class indices, hence `max + 1`; the numpy
# scalar is cast to a plain int so downstream code receives built-in ints.
num_issuers_classes = int(data["issuerid"].max()) + 1
num_sentiment_classes = int(data["SentimentScore"].nunique())
num_issuers_classes, num_sentiment_classes

In [None]:
# Load the tokenizer published under MODEL_NAME; the same instance is handed
# to the data module for training and to the inference helper afterwards.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before the first stochastic step
# (augmentation here, then weight initialisation and training in later cells)
# so a Restart & Run All gives repeatable results. The value matches the
# random_state used for the train/test split.
SEED = 42
seed_everything(SEED, workers=True)

# Wrap the train/test frames in the project's data module. Texts are tokenized
# with `tokenizer`, limited to 256 tokens, and augmentation is switched on.
data_module = BERTClassificationDataModule(
    train_df=train_df,
    test_df=test_df,
    tokenizer=tokenizer,
    num_issuers_classes=num_issuers_classes,
    num_sentiment_classes=num_sentiment_classes,
    batch_size=BATCH_SIZE,
    max_length=256,
    use_aug=True,
)
# Build the datasets eagerly so problems surface here rather than in fit().
data_module.setup()

In [None]:
# Instantiate the classifier on top of the MODEL_NAME backbone, with one
# output size for the issuer head and one for the sentiment head.
model = BERTClassificationModel(
    num_sentiment_classes=num_sentiment_classes,
    num_issuers_classes=num_issuers_classes,
    model_name=MODEL_NAME,
)

In [None]:
# Keep only the single best checkpoint, where "best" means the lowest value of
# the "loss/val" metric; it is written under checkpoints/ with the base name
# "sentiment".
checkpoint_callback = ModelCheckpoint(
    monitor="loss/val",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="sentiment",
    verbose=True,
)

# Train on CUDA for NUM_EPOCHS epochs with the checkpoint callback attached.
trainer = Trainer(
    accelerator="cuda",
    max_epochs=NUM_EPOCHS,
    callbacks=[checkpoint_callback],
)

In [None]:
# Train `model` on the data served by `data_module`; the trainer carries the
# checkpoint callback configured when it was created.
trainer.fit(model, data_module)

In [None]:
# Reload the best weights from the path the checkpoint callback actually
# wrote. If checkpoints/sentiment.ckpt is already on disk from an earlier run,
# ModelCheckpoint saves the new file under a versioned name (for example
# sentiment-v1.ckpt), so a hard-coded path would silently load the stale
# checkpoint. `best_model_path` is an empty string when no checkpoint was
# saved in this session; in that case fall back to the default location.
best_checkpoint_path = (
    checkpoint_callback.best_model_path or "checkpoints/sentiment.ckpt"
)
trained_model = BERTClassificationModel.load_from_checkpoint(
    best_checkpoint_path,
    model_name=MODEL_NAME,
    num_issuers_classes=num_issuers_classes,
    num_sentiment_classes=num_sentiment_classes,
)

In [None]:
from src.bert.utils import evaluate_metric, generate_answer_batched

In [None]:
# Run batched inference over the held-out frame; the helper yields the issuer
# predictions first and the sentiment predictions second.
predictions = generate_answer_batched(trained_model, tokenizer, test_df)
issuer_preds, sentiment_preds = predictions

In [None]:
# [pin]

# Score both heads against the ground-truth columns of the test split. The
# call is the last expression so its result is shown as the cell output.
evaluate_metric(
    company_labels=test_df["issuerid"].tolist(),
    company_predictions=issuer_preds,
    sentiment_labels=test_df["SentimentScore"].tolist(),
    sentiment_predictions=sentiment_preds,
)

{'total': 51.058618307236145,
 'f1': 0.5675530631956612,
 'accuracy': 0.45361930294906166}