In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell is executed — presumably so edits to the local `t5src` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Pin this process to one GPU. With PCI_BUS_ID ordering, devices are enumerated by
# PCI bus position, so index "6" is interpreted against that ordering.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "6",
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    }
)

In [None]:
import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [None]:
# [pin]
file_path = "../data/data-hard.csv"
root_path = "../data/"

# Load the raw CSV, rename the text/label columns to `input_text`/`target_text`,
# and tag every row with the constant task prefix "clsorg".
column_aliases = {"message": "input_text", "label": "target_text"}
df = pd.read_csv(file_path).rename(columns=column_aliases).assign(prefix="clsorg")

# Display 20 randomly chosen rows as a quick sanity check of the result.
df.sample(20)

Unnamed: 0,input_text,target_text,prefix
2595,–ö—Ä–∏–ø—Ç–æ–±–∏—Ä–∂–∞ Binance –≤–µ—Ä–Ω—É–ª–∞ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –æ–ø–ª–∞—Ç—ã...,228-4,clsorg
6231,üìÇ –ö–∞–∫ –æ—Ç—á–∏—Ç–∞–ª–∞—Å—å –ê–§–ö ¬´–°–∏—Å—Ç–µ–º–∞¬ª –∑–∞ IV –∫–≤–∞—Ä—Ç–∞–ª ...,100-3,clsorg
3281,–¢–∞–∫ –∑–∞—á–µ–º –¢–∏–Ω—å–∫–æ—Ñ—Ñ –ö–•–õ? –•–æ–∫–∫–µ–π –∞–±—Å–æ–ª—é—Ç–Ω–æ –∏—Å–∫—Ä–µ...,115-3,clsorg
452,"""‚ö†Ô∏èüá∑üá∫#CHMF #–¥–∏–≤–∏–¥–µ–Ω–¥ –∞–∫—Ü–∏–æ–Ω–µ—Ä—ã """"–°–µ–≤–µ—Ä—Å—Ç–∞–ª—å""...",152-2,clsorg
4661,üá∑üá∫#ABIO –ü–æ—Å–ª–µ –∏–Ω—Ç–µ—Ä–≤—å—é –ú–∞—Ä–∏–∏ –í–æ—Ä–æ–Ω—Ü–æ–≤–æ–π –æ —Ä–∞–∑...,266-4,clsorg
5204,üá∑üá∫#TATN #–¥–∏–≤–∏–¥–µ–Ω–¥—ã –ö–û–ù–°–ï–ù–°–£–°: –¢–∞—Ç–Ω–µ—Ñ—Ç—å –º–æ–∂–µ—Ç ...,163-4,clsorg
3540,‚Äã–û—Å–ª–∞–±–ª–µ–Ω–∏–µ —Ä—É–±–ª—è. –ö–∞–∫ –∑–∞—â–∏—Ç–∏—Ç—å—Å—è? üî∂–î–∏–∞–ø–∞–∑–æ–Ω ...,160-4;53-4,clsorg
3990,‚Äã‚Äãüü¢ –ò–¢–û–ì–ò –î–ù–Ø. –†–æ—Å—Å–∏–π—Å–∫–∏–π —Ä—ã–Ω–æ–∫ –∞–∫—Ü–∏–π –ø—Ä–æ–¥–æ–ª–∂–∏...,22-4,clsorg
843,"""üá∑üá∫#SNGS #–¥–∏–≤–∏–¥–µ–Ω–¥ –î–∏–≤–¥–æ—Ö–æ–¥–Ω–æ—Å—Ç—å """"–ø—Ä–µ—Ñ–æ–≤"""" –°—É...",160-4,clsorg
2798,–û–∂–∏–¥–∞–µ–º—ã–µ —Å–æ–±—ã—Ç–∏—è –Ω–∞ 15 –º–∞—Ä—Ç–∞ üá∑üá∫ –í –†–æ—Å—Å–∏–∏ –ü–æ...,225-3,clsorg


In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded here.
# NOTE(review): NERModel() is constructed without arguments later in this notebook, so it
# presumably selects its own checkpoint internally — confirm it matches `m_name`.
# NOTE(review): the sampled rows above look non-English; confirm this tokenizer's
# vocabulary actually covers the script used in the data.
m_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(m_name)

In [None]:
from t5src.dataset import NERDataModel
from t5src.model import NERModel

# Training hyper-parameters used by the data module and the trainer.
EPOCHS = 10
BATCH_SIZE = 64

# Hold out a quarter of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.25,
)

# Wrap both frames in the project's data module and prepare it right away.
data_module = NERDataModel(
    train_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [None]:
# Seed the Python, NumPy and PyTorch random generators before the model is built, so
# weight initialisation and batch shuffling are repeatable between runs.
# `workers=True` also derives seeds for dataloader worker processes.
from pytorch_lightning import seed_everything

SEED = 42
seed_everything(SEED, workers=True)

model = NERModel()

# Keep only the single best checkpoint, ranked by the lowest monitored "val_loss".
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="ner",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

# Train for at most EPOCHS epochs on a CUDA device, saving through the callback above.
trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=EPOCHS,
    accelerator="cuda",
)

In [None]:
# Remove the logging directory left by earlier runs. Unlike `rm -r`, this does not
# error when the directory is absent (e.g. on a fresh checkout) and does not depend on
# a Unix shell being available.
import shutil

shutil.rmtree("lightning_logs", ignore_errors=True)

In [None]:
# Run training: fit `model` on the data supplied by `data_module`, using the epoch
# limit and checkpoint callback configured on `trainer`.
trainer.fit(model, data_module)

In [None]:
# Load the checkpoint that this session's ModelCheckpoint callback ranked best, rather
# than a hard-coded "-vN" file name that changes from run to run.
# `best_model_path` is an empty string until a checkpoint has been saved, in which case
# the previously hard-coded file is used as a fallback.
best_ckpt_path = checkpoint_callback.best_model_path or "checkpoints/ner-v5.ckpt"
trained_model = NERModel.load_from_checkpoint(best_ckpt_path)

# Freeze the loaded model before using it for inference.
trained_model.freeze()

In [None]:
from t5src.utils import evaluate_metric, generate_answer_batched

# Generate answers for the held-out frame with the frozen model, 512 rows per batch.
# Presumably yields one generated string per row of `test_df` (the result is attached
# to a copy of that frame as a column afterwards) — confirm against t5src.utils.
predictions = generate_answer_batched(
    data=test_df,
    tokenizer=tokenizer,
    trained_model=trained_model,
    batch_size=512,
)

In [None]:
def _split_first_label(labels: pd.Series) -> pd.DataFrame:
    """Split the first ``company-sentiment`` entry of each label into two columns.

    Labels may hold several ``;``-separated entries; only the first one is used.
    The entry is split on its first ``-`` only, and the result is padded to exactly
    two columns, so free-form text containing no dash or several dashes cannot
    change the column count.

    Args:
        labels: String series such as ``"160-4;53-4"``.

    Returns:
        A frame with columns ``0`` (company part) and ``1`` (sentiment part);
        missing parts are left as missing values.
    """
    first_entry = labels.str.split(";", n=1).str[0]
    # n=1 caps the result at two columns; reindex guarantees both are present.
    parts = first_entry.str.split("-", n=1, expand=True)
    return parts.reindex(columns=[0, 1])


# Work on a copy so the held-out frame itself stays untouched.
ldf = test_df.copy()
ldf["predictions"] = predictions
ldf[["tcomp", "tsent"]] = _split_first_label(ldf["target_text"])
ldf[["pcomp", "psent"]] = _split_first_label(ldf["predictions"])

In [None]:
# [pin]

# Collect the four parallel lists (predicted/true company, predicted/true sentiment)
# and pass them to the project's metric helper; its return value is the cell output.
metric_inputs = {
    "company_predictions": ldf["pcomp"].tolist(),
    "company_labels": ldf["tcomp"].tolist(),
    "sentiment_predictions": ldf["psent"].tolist(),
    "sentiment_labels": ldf["tsent"].tolist(),
}
evaluate_metric(**metric_inputs)

{'total': 57.80824034396828,
 'f1': 0.6391920745477017,
 'accuracy': 0.5169727323316639}