In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's own
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Put the parent directory on the import path so that the `src.*` modules
# imported further down can be resolved from this notebook's location.
sys.path += ["../"]

In [None]:
import os

# Restrict this process to GPU 0 and number devices by PCI bus id.
# These variables are only read when CUDA is initialised, so this cell has to
# run before any library touches the GPU.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    }
)

In [None]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [None]:
# [pin]
# Load the labelled messages and reshape them into the
# (input_text, target_text, prefix) layout used by the rest of the notebook.
file_path = "../data/data-hard.csv"
root_path = "../data/"

df = pd.read_csv(file_path)

# Every row gets the same task prefix.
df["prefix"] = "clsorg"
df = df.rename(columns={"message": "input_text", "label": "target_text"})

# `rename` silently skips columns that are absent, so check the resulting
# schema here instead of failing later inside the data module.
missing_columns = {"input_text", "target_text"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{file_path} does not provide the expected columns: {sorted(missing_columns)}"
    )

# Seeded preview so the displayed rows are the same on every run; the sample
# size is capped so small files do not raise.
df.sample(n=min(20, len(df)), random_state=42)

Unnamed: 0,input_text,target_text,prefix
3755,‚Äã‚Äã–ü—Ä–æ–±–ª–µ–º–∞ –∑–∞—Ç–æ–≤–∞—Ä–∏–≤–∞–Ω–∏—è —Ä—ã–Ω–∫–∞ –Ω–µ–¥–≤–∏–∂–∏–º–æ—Å—Ç–∏ –±—É...,111-4;100-5,clsorg
6654,üóì–ö–ê–õ–ï–ù–î–ê–†–¨ –ù–ê –°–ï–ì–û–î–ù–Ø ‚Äî 2023.07.24 üá¶üá∫–ê–≤—Å—Ç—Ä–∞–ª–∏...,53-3;231-3,clsorg
1587,#ROSN ROSN –°–î –†–û–°–ù–ï–§–¢–ò 10 –ù–û–Ø–ë–†–Ø –†–ê–°–°–ú–û–¢–†–ò–¢ –†–ï...,112-4,clsorg
7171,üß† –ó–∞–∫—Ä—ã—Ç—ã–µ –∏–¥–µ–∏ –ú–æ–∑–≥–æ–≤–æ–≥–æ —Ü–µ–Ω—Ç—Ä–∞ –°–∏–≥–Ω–∞–ª–æ–≤ –†–¶–ë ...,152-3;111-3;127-3;89-3;160-3;163-3;254-3;221-3...,clsorg
3678,"‚Äã‚Äã–ò–Ω–¥–µ–∫—Å –ú–æ—Å–ë–∏—Ä–∂–∏ –ø–æ –∏—Ç–æ–≥–∞–º –Ω–µ–¥–µ–ª–∏: +2,04% –ò—Ç...",100-3,clsorg
914,"""üá∑üá∫#YNDX """"–Ø–Ω–¥–µ–∫—Å"""" –æ–±—ä–µ–¥–∏–Ω–∏—Ç –±–∏–∑–Ω–µ—Å —Å–æ–±—Å—Ç–≤–µ–Ω...",236-3,clsorg
6814,üõ¢üá∑üá∫#—Å–ø–≥ #—Ä–æ—Å—Å–∏—è #NVTK –°–§ –æ–¥–æ–±—Ä–∏–ª –∑–∞–∫–æ–Ω –æ–± —ç–∫—Å–ø...,115-4,clsorg
1819,#—Å—Ç—Ä–∞—Ç–µ–≥–∏—è—Ç–∞–∫—Ç–∏–∫–∞ #–∫–æ–ª—É–º–Ω–∏—Å—Ç–∏–∫–∞ –ú–∏–Ω—É—Å 39 –º–ª—Ä–¥...,150-2;7-2,clsorg
5913,üí•üá∑üá∫#MAGN = +5% = –º–∞–∫—Å –∑–∞ 1 –≥–æ–¥,90-3,clsorg
4607,‚¨ÜÔ∏è #YNDX 101.4M‚ÇΩ +0.06% üî∑ –ê–Ω–æ–º–∞–ª—å–Ω—ã–π –æ–±—ä–µ–º Y...,236-4,clsorg


In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
# NOTE(review): the dataset preview appears to contain non-English text;
# confirm that the t5-small vocabulary covers it, otherwise many tokens may
# map to <unk>. Also verify that NERModel loads weights matching this name.
m_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(m_name)

In [None]:
from src.t5.dataset import NERDataModel
from src.t5.model import NERModel

# Training hyper-parameters: batch size for the data module and the epoch
# limit passed to the Trainer.
BATCH_SIZE = 64
EPOCHS = 10
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)
data_module = NERDataModel(train_df, test_df, tokenizer, batch_size=BATCH_SIZE)
# Called explicitly here; presumably this builds the underlying datasets
# (NERDataModel is defined in src.t5.dataset) - TODO confirm.
data_module.setup()

In [None]:
# Model to be fine-tuned, built with its default constructor arguments.
model = NERModel()

# Checkpointing: monitor "val_loss" (lower is better) and keep only the single
# best checkpoint, written to ./checkpoints under the base name "ner".
# NOTE(review): this relies on NERModel logging a metric called "val_loss";
# that is not visible from this notebook - confirm.
checkpoint_options = {
    "dirpath": "checkpoints",
    "filename": "ner",
    "monitor": "val_loss",
    "mode": "min",
    "save_top_k": 1,
    "verbose": True,
}
checkpoint_callback = ModelCheckpoint(**checkpoint_options)

# Train on CUDA for at most EPOCHS epochs, with checkpointing attached.
trainer = Trainer(
    max_epochs=EPOCHS,
    accelerator="cuda",
    callbacks=[checkpoint_callback],
)

In [None]:
# Remove logs left by earlier runs so this session starts from a clean log
# directory. Pure Python instead of `!rm -r`: it works on any platform and
# stays silent when the directory does not exist yet (e.g. fresh checkout).
import shutil

shutil.rmtree("lightning_logs", ignore_errors=True)

In [None]:
# Run training: fits `model` on the batches provided by `data_module`, with
# the checkpoint callback attached to `trainer` saving the best weights.
trainer.fit(model, data_module)

In [None]:
# Load the best checkpoint produced by this session's training run.
# `best_model_path` is filled in by the ModelCheckpoint callback during
# `trainer.fit`; a hard-coded "ner-vN" name depends on how many earlier runs
# wrote into the checkpoint directory and would silently load stale weights.
# When no training happened in this kernel the attribute is an empty string,
# so fall back to the previously used checkpoint file.
best_checkpoint_path = checkpoint_callback.best_model_path or "checkpoints/ner-v5.ckpt"
trained_model = NERModel.load_from_checkpoint(best_checkpoint_path)
# Freeze the weights: the model is only used for inference from here on.
trained_model.freeze()

In [None]:
from src.t5.utils import evaluate_metric, generate_answer_batched

# Run the frozen model over the held-out split in batches of 64.
# NOTE(review): the result is later assigned as a column of a copy of
# `test_df`, so it presumably holds one prediction per row in row order -
# confirm against generate_answer_batched.
predictions = generate_answer_batched(
    trained_model=trained_model, tokenizer=tokenizer, data=test_df, batch_size=64
)

In [None]:
def _first_company_sentiment(labels: pd.Series) -> pd.DataFrame:
    """Split the first ``company-sentiment`` pair out of each label string.

    Each value is expected to look like ``"111-4;100-5"``: pairs separated by
    ``;`` with the two parts of a pair separated by ``-``. Only the first pair
    of every value is used.

    Args:
        labels: String series of label or prediction texts.

    Returns:
        A frame indexed like ``labels`` with exactly two columns, ``0`` (the
        part before the first ``-``) and ``1`` (the rest of the first pair).
        Values that lack a ``-`` get a missing second part.
    """
    # Take the text before the first ";" without expanding every pair into
    # its own column.
    first_pair = labels.str.split(";", n=1).str[0]
    # n=1 caps the split at two parts, so a malformed generated value with
    # extra dashes cannot widen the result beyond two columns.
    parts = first_pair.str.split("-", n=1, expand=True)
    # Guarantee both columns exist even when no value contained a "-".
    return parts.reindex(columns=[0, 1])


# Evaluation frame: held-out rows plus the generated predictions, with the
# first company/sentiment pair of both the target and the prediction split
# into separate columns.
ldf = test_df.copy()
ldf["predictions"] = predictions
ldf[["tcomp", "tsent"]] = _first_company_sentiment(ldf["target_text"])
ldf[["pcomp", "psent"]] = _first_company_sentiment(ldf["predictions"])

In [None]:
# [pin]

# Score predictions against labels. Each keyword argument of the metric
# receives the matching `ldf` column converted to a plain Python list.
metric_columns = {
    "company_predictions": "pcomp",
    "company_labels": "tcomp",
    "sentiment_predictions": "psent",
    "sentiment_labels": "tsent",
}
evaluate_metric(**{arg: ldf[col].tolist() for arg, col in metric_columns.items()})

{'total': 57.80824034396828,
 'f1': 0.6391920745477017,
 'accuracy': 0.5169727323316639}