In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys

# Make the parent directory importable so the `src.*` imports used further
# down resolve when the notebook is run from its own folder. The membership
# check keeps the cell idempotent: re-running it no longer stacks duplicate
# "../" entries onto sys.path.
if "../" not in sys.path:
    sys.path.append("../")

In [None]:
import os

# Pin the process to GPU 0 and enumerate devices in PCI bus order.
# These variables are only honoured if they are set before CUDA is
# initialised, which is why this cell precedes the torch import.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer

In [None]:
import random

import numpy as np
import torch


def seed_everything(seed=10):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every visible CUDA device), and configures cuDNN for
    deterministic algorithm selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 10.
    """
    random.seed(seed)
    # The running interpreter fixed its hash seed at start-up; this only takes
    # effect for child processes spawned after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current device. Safe to call when
    # CUDA is unavailable (the call is then a no-op).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmarking enabled cuDNN may autotune to a different kernel on
    # each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
# [pin]
file_path = "../data/data-hard.csv"
root_path = "../data/"

# Read the CSV, tag every row with the constant "clsorg" prefix and rename
# the raw columns to input_text / target_text.
# NOTE(review): presumably this is the column layout the src.t5 dataset code
# expects — confirm against NERDataModel.
column_map = {"message": "input_text", "label": "target_text"}
df = pd.read_csv(file_path).assign(prefix="clsorg").rename(columns=column_map)

# Peek at a random handful of rows.
df.sample(20)

Unnamed: 0,input_text,target_text,prefix
5222,üá∑üá∫#TCSG #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ö–û–ù–°–ï–ù–°–£–°: TCS Group –≤–æ I...,225-4,clsorg
4135,‚è∞ –î–æ–±—Ä–æ–µ —É—Ç—Ä–æ! 16 –º–∞—Ä—Ç–∞ üåç –ù–æ—á–Ω–æ–µ –¥–µ–∂—É—Ä—Å—Ç–≤–æ (–∑...,228-3;251-3,clsorg
3130,–°–ü–ë –ë–∏—Ä–∂–∞ –Ω–∞—á–Ω–µ—Ç —Ç–æ—Ä–≥–∏ —Ü–µ–Ω–Ω—ã–º–∏ –±—É–º–∞–≥–∞–º–∏ –≤–æ—Å—å–º–∏...,255-4,clsorg
1629,#SMLT –û—Ç–∫—É–ø –≤ –∞–∫—Ü–∏—è—Ö –°–∞–º–æ–ª—ë—Ç–∞: –≥—ç–ø –Ω–∞ –æ—Ç–∫—Ä—ã—Ç–∏...,56-3,clsorg
3917,‚Äã‚Äãüü¢ –ò–¢–û–ì–ò –î–ù–Ø. –†–æ—Å—Å–∏–π—Å–∫–∏–µ –∞–∫—Ü–∏–∏ –Ω–µ–º–Ω–æ–≥–æ –ø–æ–¥—Ä–æ—Å...,90-2;152-2,clsorg
4392,‚õîÔ∏è –†–æ—Å—Å–∏—è –∑–∞–ø—Ä–µ—â–∞–µ—Ç —ç–∫—Å–ø–æ—Ä—Ç –±–µ–Ω–∑–∏–Ω–∞. –ö–∞–∫–∏–µ –∫–æ–º...,25-2,clsorg
5193,üá∑üá∫#SPBE #–æ—Ç—á–µ—Ç–Ω–æ—Å—Ç—å –ò—Ç–æ–≥–∏ —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –°–ü–ë –ë–∏—Ä–∂–µ...,255-3,clsorg
2996,–ü—Ä–æ—Å—Ç–æ –≤—Å–ø–æ–º–Ω–∏—Ç–µ –∫–∞–∫ –ø—Ä–æ—Å—Ä–∞–ª—Å—è –°–µ–≤–∫–∞ –Ω–∞ –æ–∂–∏–¥–∞–Ω...,90-5,clsorg
5391,üá∑üá∫#–∞–≤–∏–∞ #—Ä–æ—Å—Å–∏—è –†–æ—Å—Å–∏—è –º–æ–∂–µ—Ç –æ—Ç–∫—Ä—ã—Ç—å –ø—Ä—è–º—ã–µ –∞...,32-4,clsorg
4306,‚ö°Ô∏è –°–±–µ—Ä (SBER) –æ—Ç—ã–≥—Ä–∞–ª –≤—Å—ë –ø–∞–¥–µ–Ω–∏–µ –Ω–∞ –°–í–û. #—Ö–≤...,150-3,clsorg


In [None]:
# Checkpoint that supplies the tokenizer for the rest of the notebook.
# Alternative (disabled): m_name = "t5-small"
m_name = "cointegrated/rut5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=m_name)

In [None]:
from src.t5.dataset import NERDataModel

BATCH_SIZE = 128
EPOCHS = 10
num_workers = 12

# Hold out a quarter of the rows for evaluation; the fixed random_state keeps
# the split identical between runs.
split_options = {"test_size": 0.25, "random_state": 42}
train_df, test_df = train_test_split(df, **split_options)

# Wrap both splits in the project's data module and prepare it.
data_module = NERDataModel(
    train_df,
    test_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
)
data_module.setup()

In [None]:
from transformers import T5ForConditionalGeneration

In [None]:
# Convert a training checkpoint into a Hugging Face `save_pretrained` folder.
# NOTE(review): the architecture instantiated here is "t5-small", while the
# tokenizer cell loads "cointegrated/rut5-small" — confirm that ner-v8.ckpt
# really was trained from t5-small before relying on this cell.
m_name = "t5-small"
trained_model = T5ForConditionalGeneration.from_pretrained(m_name, return_dict=True)

# torch.load unpickles the file, so only open checkpoints from a trusted
# source. map_location="cpu" avoids restoring tensors onto whichever device
# they were saved from; load_state_dict copies them into the model anyway.
checkpoint = torch.load("./checkpoints/ner-v8.ckpt", map_location="cpu")

# Presumably the keys carry a leading "model." because the network was stored
# as an attribute of a training wrapper — verify against the training code.
# Strip that prefix only when it is actually at the start of the key, and
# leave any other key untouched instead of collapsing it to "".
wrapper_prefix = "model."
state_dict = {
    (k[len(wrapper_prefix):] if k.startswith(wrapper_prefix) else k): v
    for k, v in checkpoint["state_dict"].items()
}
trained_model.load_state_dict(state_dict)
trained_model.save_pretrained("./pretrained")

In [None]:
# Load the model saved under ../pretrained-rut5-2 with float16 weights and
# write it back out to a separate "-fp16" folder.
m_name = "../pretrained-rut5-2"
fp16_load_kwargs = dict(return_dict=True, torch_dtype=torch.float16)
trained_model = T5ForConditionalGeneration.from_pretrained(m_name, **fp16_load_kwargs)
trained_model.save_pretrained("../pretrained-rut5-2-fp16")

In [None]:
# Load the float16 export and move it onto the GPU for inference.
m_name = "../pretrained-rut5-2-fp16"
trained_model = T5ForConditionalGeneration.from_pretrained(
    m_name,
    torch_dtype=torch.float16,
    return_dict=True,
)
# .cuda() returns the module itself; rebinding avoids echoing its repr.
trained_model = trained_model.cuda()

In [None]:
import torch

from src.t5.utils import evaluate_metric, generate_answer_batched

In [None]:
# [pin]
# Generate predictions for the whole held-out split with autograd disabled
# and CUDA mixed precision enabled. torch.autocast(device_type="cuda") is the
# supported spelling of the deprecated torch.cuda.amp.autocast().
with torch.inference_mode(), torch.autocast(device_type="cuda"):
    predictions = generate_answer_batched(
        trained_model=trained_model,
        tokenizer=tokenizer,
        # Pass a slice so the helper never receives test_df itself.
        data=test_df[:],
        batch_size=128,
        num_beams=1,
        max_source_length=396,
        max_target_length=40,
        verbose=False,
    )

100%|██████████| 15/15 [00:15<00:00,  1.02s/it]


In [None]:
# Work on an independent copy so test_df keeps its original columns.
ldf = test_df.copy()

# A target may list several ";"-separated pairs; only the first one is kept
# and split on "-" into its two parts (tcomp, tsent).
first_pair = ldf["target_text"].str.split(";", expand=True)[0]
ldf[["tcomp", "tsent"]] = first_pair.str.split("-", expand=True)

In [None]:
from src.t5.utils import postprocess_predictions

# Keep the first parsed pair per prediction; when nothing could be parsed,
# fall back to the placeholder pair [0, 1].
orgsent = [
    pairs[0] if len(pairs) else [0, 1]
    for pairs in postprocess_predictions(predictions)
]

# Separate each pair into its two components.
org = [pair[0] for pair in orgsent]
sent = [pair[1] for pair in orgsent]

In [None]:
# Sanity check: the two counts should match before scoring.
len(org), len(ldf["tcomp"])

In [None]:
# [pin]

# Ground-truth columns as plain lists, scored against the parsed predictions.
company_labels = ldf["tcomp"].tolist()
sentiment_labels = ldf["tsent"].tolist()

evaluate_metric(
    company_predictions=org,
    company_labels=company_labels,
    sentiment_predictions=sent,
    sentiment_labels=sentiment_labels,
)

{'total': 74.53947501804372,
 'f1': 0.7940727502217538,
 'accuracy': 0.6967167501391207}