In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Make the parent directory importable so the `src.*` imports used later in
# this notebook resolve. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate "../" entry to sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [None]:
import os

# Pin this process to a single GPU. CUDA_DEVICE_ORDER=PCI_BUS_ID makes CUDA
# enumerate devices by PCI bus id, and CUDA_VISIBLE_DEVICES then selects
# index 6 in that ordering. Both must be set before any library initializes
# CUDA, which is why this cell runs ahead of the ML imports.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "6",
    }
)

In [None]:
import shutil

import pandas as pd
from pytorch_lightning import Trainer
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [None]:
# [pin]
file_path = "../data/data.csv"
root_path = "../data/"

# Read the CSV and bring it into the column layout the rest of the notebook
# uses: a constant "prefix" column plus "message"/"label" renamed to
# "input_text"/"target_text".
df = pd.read_csv(file_path)
df["prefix"] = "clsorg"
df = df.rename(columns={"message": "input_text", "label": "target_text"})

# Preview a random sample; the fixed seed makes the displayed rows identical
# on every Restart & Run All instead of changing with each execution.
df.sample(20, random_state=42)

Unnamed: 0,input_text,target_text,prefix
7507,????#GLTR #buyback –ê–ö–¶–ò–û–ù–ï–†–´ GLOBALTRANS –£–¢–í–ï–†...,220,clsorg
12541,‚Äã–ù–ê–ü–†–ê–í–õ–ï–ù–ò–Ø –õ–û–ö–ê–õ–¨–ù–´–• –¢–†–ï–ù–î–û–í –ü–û –ë–ê–ó–û–í–´–ú –ò–ù–°–¢...,127;150;7;99;152,clsorg
10206,"–ì–û–°–ê - –§–æ—Å–ê–≥—Ä–æ: –î–ò–í–ò–î–ï–ù–î–´ = 465 —Ä—É–±–ª–µ–π, –î–î: ...",187,clsorg
2446,"""TRYRUB –ú–æ—Å–∫–æ–≤—Å–∫–∞—è –±–∏—Ä–∂–∞ –∑–∞–ø—É—Å—Ç–∏–ª–∞ —Ç–æ—Ä–≥–∏ –≤–∞–ª—é—Ç...",103,clsorg
4161,"""üõ¢üá∑üá∫#–Ω–µ—Ñ—Ç—å #–≥–∞–∑ #—Å–ø–≥ #—Ä–æ—Å—Å–∏—è –∑–∞—è–≤–ª–µ–Ω–∏—è –ê–ª–µ–∫—Å–∞...",48,clsorg
9156,"GMKN - RUAL - –û–§–ï–†–¢–ê ¬´–†—É—Å–∞–ª¬ª, –ø–æ–¥—Ç–≤–µ—Ä–¥–∏–≤—à–∏–π –≥–æ...",11,clsorg
3865,"""üèÅ –ò—Ç–æ–≥–∏ –¥–Ω—è: 24 –º–∞—è üì¶ Ozon —Ä–∞—Å–∫—Ä—ã–ª —Å–∏–ª—å–Ω—ã–µ —Ä...",204;26,clsorg
14394,üá∑üá∫#SMLT #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –°–∞–º–æ–ª–µ—Ç ‚Äî –†–°–ë–£ 9–º 2023–≥,56,clsorg
15858,üîç –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: –ú–ú–ö ‚Äì –æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã–µ —Ä–µ–∑—É–ª...,89;116;90,clsorg
15497,"üí∏ –ú–æ—Å–±–∏—Ä–∂–∞ –æ–±—ä—è–≤–∏–ª–∞ –¥–∏–≤–∏–¥–µ–Ω–¥—ã –∑–∞ 2023 17,35 —Ä...",103,clsorg


In [None]:
# Load the tokenizer of the pretrained "t5-small" checkpoint; it is passed to
# both the data module and the generation helper further down.
m_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=m_name)

In [None]:
from src.t5.dataset import NERDataModel
from src.t5.model import NERModel

BATCH_SIZE = 64
EPOCHS = 10

# Hold out 25% of the rows; the fixed random_state keeps the split stable
# across runs.
# NOTE(review): test_df is handed to the data module here and is also the
# frame the final F1 is computed on. If NERDataModel uses it as the
# validation set, the val_loss-based checkpoint selection has already seen
# the evaluation data — confirm, and consider a separate validation split.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.25)
data_module = NERDataModel(
    train_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
)
data_module.setup()

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before the model is built so that
# repeated runs of the notebook are reproducible; workers=True additionally
# derives seeds for dataloader worker processes.
seed_everything(42, workers=True)

model = NERModel()

# Keep only the single best checkpoint, where "best" is the lowest logged
# "val_loss". Files are written to checkpoints/ner*.ckpt.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="ner",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

# Train for at most EPOCHS epochs on a CUDA device.
trainer = Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=EPOCHS,
    accelerator="cuda",
)

In [None]:
# Remove logs left behind by earlier runs. ignore_errors=True makes this a
# silent no-op when the directory does not exist (the shell `rm -r` printed an
# error on a fresh checkout), and it does not depend on a POSIX shell.
shutil.rmtree("lightning_logs", ignore_errors=True)

In [None]:
# Run training: fit `model` on the data supplied by `data_module`, for up to
# EPOCHS epochs (the Trainer's max_epochs). The ModelCheckpoint callback
# registered on the trainer writes checkpoints while this runs.
trainer.fit(model, data_module)

In [None]:
# Load the checkpoint chosen by the ModelCheckpoint callback instead of a
# hard-coded file name. ModelCheckpoint appends a "-vN" suffix whenever
# "ner.ckpt" already exists, so the suffix changes from run to run and a
# fixed "ner-v4" silently evaluates a stale model after retraining.
# best_model_path is an empty string when no training happened in this
# kernel; in that case fall back to the previously used file.
best_ckpt_path = checkpoint_callback.best_model_path or "checkpoints/ner-v4.ckpt"
trained_model = NERModel.load_from_checkpoint(best_ckpt_path)
# Put the model in inference mode: no gradients, eval() behaviour.
trained_model.freeze()

In [None]:
from src.t5.utils import evaluate_f1, generate_answer_batched

# Run generation over the held-out frame with the frozen model. The result is
# later attached to a copy of test_df as a column, so it is expected to hold
# one prediction per row.
predictions = generate_answer_batched(
    trained_model=trained_model.model,
    tokenizer=tokenizer,
    data=test_df,
    batch_size=512,
)

In [None]:
# Build the evaluation frame on a copy of test_df (assign never mutates its
# input): attach the predictions, then keep only the first ";"-separated
# component of the gold and predicted strings. n=1 stops splitting after the
# first separator, which leaves column 0 unchanged.
ldf = test_df.assign(predictions=predictions).assign(
    tcomp=lambda frame: frame["target_text"].str.split(";", n=1, expand=True)[0],
    pcomp=lambda frame: frame["predictions"].str.split(";", n=1, expand=True)[0],
)

In [None]:
# [pin]

# Score the first predicted component against the first gold component using
# the project's evaluate_f1 helper; the value is displayed as the cell output.
evaluate_f1(
    labels=ldf["tcomp"].to_list(),
    predictions=ldf["pcomp"].to_list(),
)

0.6451461823244122