In [None]:
# Enable IPython's autoreload extension in mode 2: every imported module is
# re-imported before a cell runs, so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Put the parent directory on the import path so that the `src.*` imports used
# later in this notebook resolve. The path is relative, so it depends on the
# kernel's working directory being the notebook's own folder.
# The membership check keeps the cell idempotent: re-running it no longer
# appends another duplicate "../" entry each time.
if "../" not in sys.path:
    sys.path.append("../")

In [None]:
import os

# Pin this kernel to a single physical GPU. These variables are only read when
# CUDA is initialised, so this cell has to run before any CUDA work starts.
# PCI_BUS_ID ordering makes the device index follow the PCI bus order instead
# of CUDA's default ordering.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "6",
    }
)

In [None]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [None]:
# [pin]
file_path = "../data/data-hard.csv"
root_path = "../data/"

# Read the CSV, add a constant "prefix" column with the value "clsorg", and
# rename message -> input_text and label -> target_text, which are the column
# names the rest of the notebook works with.
df = (
    pd.read_csv(file_path)
    .assign(prefix="clsorg")
    .rename(columns={"message": "input_text", "label": "target_text"})
)
# Show a random sample of rows (unseeded, so the rows differ between runs).
df.sample(20)

Unnamed: 0,input_text,target_text,prefix
2095,–ê–∫—Ü–∏—è ¬´–ü–æ–ø—Ä–æ–±—É–π—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏!¬ª —Å –±—Ä–æ–∫–µ—Ä–æ–º –¢–∏–Ω—å...,236-3,clsorg
6778,üõ¢üá∑üá∫#GAZP ¬´–ì–ê–ó–ü–†–û–ú¬ª –ú–û–ñ–ï–¢ –ù–ê–ß–ê–¢–¨ –ü–û–°–¢–ê–í–ö–ò –ì–ê–ó–ê ...,48-4,clsorg
2798,–û–∂–∏–¥–∞–µ–º—ã–µ —Å–æ–±—ã—Ç–∏—è –Ω–∞ 15 –º–∞—Ä—Ç–∞ üá∑üá∫ –í –†–æ—Å—Å–∏–∏ –ü–æ...,225-3,clsorg
6387,üìä –ü–ê–û ¬´–ù–ö –†–æ—Å–Ω–µ—Ñ—Ç—å¬ª –¢–∏–∫–µ—Ä: ROSN –ò–¥–µ—è: Long ‚¨ÜÔ∏è ...,112-5,clsorg
5512,üåé#–∞–ª—é–º–∏–Ω–∏–π #–º–µ—Ç–∞–ª–ª—ã –º–∏—Ä–æ–≤—ã–µ –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç–µ–ª–∏ –∞–ª—é...,11-4,clsorg
2225,–í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é üìë –í –ø—è—Ç–Ω–∏—Ü—É –ù–æ—Ä–Ω–∏–∫–µ–ª—å –æ—Ç—á–∏...,53-3,clsorg
6772,üõ¢üá∞üáøüá∑üá∫#TRNFP –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω –≤ 2024 –≥–æ–¥—É –º–æ–∂–µ—Ç –Ω–∞—Ä–∞—Å...,175-4,clsorg
3294,–¢–∏–Ω—å–∫–æ—Ñ—Ñ $TCS $TCSG –ø—Ä–æ–¥–æ–ª–∂–∏—Ç –∞–∫—Ü–∏—é –ø–æ –∫–æ–º–ø–µ–Ω—Å...,225-4,clsorg
5192,üá∑üá∫#SPBE #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å,255-3,clsorg
5934,üí•üá∑üá∫#MTLRp = –º–∞–∫—Å –∑–∞ 11 –º–µ—Å –ö–∏—Ç–∞–π —É–≤–µ–ª–∏—á–∏–ª –∏–º...,99-4;129-4,clsorg


In [None]:
# Name of the pretrained T5 checkpoint whose tokenizer is loaded.
m_name = "t5-small"
# The same tokenizer object is later handed to the data module and to the
# batched generation helper.
tokenizer = T5Tokenizer.from_pretrained(m_name)

In [None]:
from src.t5.dataset import NERDataModel
from src.t5.model import NERModel

BATCH_SIZE = 64
EPOCHS = 10

# Hold out 25% of the rows; the fixed random_state keeps the split identical
# across runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.25)

# Wrap both splits in the project's data module. The held-out split is passed
# as the second positional argument.
data_module = NERDataModel(
    train_df,
    test_df,
    tokenizer,
    source_max_token_length=512,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [None]:
model = NERModel()

# Keep only the single best checkpoint, ranked by the lowest logged "val_loss".
# NOTE(review): this relies on NERModel logging a metric named "val_loss" -
# confirm in src/t5/model.py.
# Files are written as checkpoints/ner.ckpt; Lightning appends a -vN suffix
# when that name is already taken by an earlier run.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="ner",
    verbose=True,
)

# Train on CUDA for at most EPOCHS epochs.
trainer = Trainer(
    accelerator="cuda",
    max_epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

In [None]:
import shutil

# Start from an empty log directory so logs from earlier runs do not mix with
# this one. Unlike the `!rm -r` shell escape, this works on any OS and stays
# silent when the directory does not exist yet (e.g. on a fresh checkout).
shutil.rmtree("lightning_logs", ignore_errors=True)

In [None]:
# Run training with the trainer configured above (at most EPOCHS epochs, with
# the checkpoint callback attached); batches come from data_module.
trainer.fit(model, data_module)

In [None]:
# Load the checkpoint that the ModelCheckpoint callback ranked best during the
# training run in this session. The previously hard-coded "ner-v6.ckpt" only
# exists as a by-product of earlier runs (Lightning adds a -vN suffix when the
# file name is taken), so a fresh Restart & Run All would load a stale model or
# fail. best_model_path is an empty string when no training happened in this
# kernel; in that case fall back to the previously used file.
best_ckpt_path = checkpoint_callback.best_model_path or "checkpoints/ner-v6.ckpt"
trained_model = NERModel.load_from_checkpoint(best_ckpt_path)
# freeze() disables gradients and puts the module in eval mode for inference.
trained_model.freeze()

In [None]:
import torch

from src.t5.utils import evaluate_metric, generate_answer_batched

# inference_mode() turns off autograd tracking; autocast lets eligible CUDA ops
# run in reduced precision. torch.autocast(device_type="cuda") replaces the
# deprecated torch.cuda.amp.autocast(), which emits a FutureWarning on recent
# PyTorch releases.
with torch.inference_mode(), torch.autocast(device_type="cuda"):
    # Only the first 1200 held-out rows are scored; the post-processing step
    # must slice test_df the same way so predictions line up with their rows.
    predictions = generate_answer_batched(
        trained_model=trained_model.model,
        tokenizer=tokenizer,
        data=test_df[:1200],
        batch_size=64,
        max_length=256,
    )

In [None]:
# Number of held-out rows to score. It must equal the size of the test_df slice
# that was used to generate `predictions`, otherwise the column assignment
# below fails on a length mismatch.
EVAL_ROWS = 1200


def _first_company_sentiment(labels):
    """Split the first ``company-sentiment`` entry of each label into two columns.

    Labels are strings such as ``"99-4;129-4"``: entries separated by ``;``,
    each entry being ``<company>-<sentiment>``. Only the first entry is kept.

    Args:
        labels: pandas Series of label strings.

    Returns:
        DataFrame with exactly two columns (0 = company, 1 = sentiment), indexed
        like ``labels``. A missing part is left empty (None/NaN) rather than
        changing the number of columns.
    """
    first_entry = labels.str.split(";", n=1).str[0]
    # n=1 caps the split at two parts and reindex pads a missing second column,
    # so one malformed model output (no "-" at all, or extra "-") can no longer
    # make the two-column assignment raise.
    return first_entry.str.split("-", n=1, expand=True).reindex(columns=[0, 1])


# Slice first, then copy: this copies only the rows that are scored and yields
# an independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning.
ldf = test_df.iloc[:EVAL_ROWS].copy()
ldf["predictions"] = predictions
ldf[["tcomp", "tsent"]] = _first_company_sentiment(ldf["target_text"])
ldf[["pcomp", "psent"]] = _first_company_sentiment(ldf["predictions"])

In [None]:
# [pin]

# Compare the predicted company / sentiment columns with the label columns.
# The call is kept as the last expression so its result is shown as the cell
# output.
evaluate_metric(
    **{
        argument: ldf[column].tolist()
        for argument, column in (
            ("company_predictions", "pcomp"),
            ("company_labels", "tcomp"),
            ("sentiment_predictions", "psent"),
            ("sentiment_labels", "tsent"),
        )
    }
)

{'total': 55.85067410545177,
 'f1': 0.618680148775702,
 'accuracy': 0.49833333333333335}