In [1]:
import sys
import os
# Make "src.*" imports (e.g. "src.data") resolvable when the notebook is launched
# directly from the notebooks directory (e.g. in VS Code): the project root is
# taken to be two levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Guard the append so that re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import transformers

# Seed PyTorch's global random number generator so repeated runs start from the same state.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f08f04401d0>

In [2]:
# Load the tokenizer and the model weights (weights are quantized to 8 bit on load).
model_name = "AnatoliiPotapov/T-lite-instruct-0.1"

# The tokenizer is configured to pad on the left side of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

# bitsandbytes 8-bit quantization settings handed to `from_pretrained`.
qconf = transformers.BitsAndBytesConfig(load_in_8bit=True)

# Everything except the checkpoint name is collected here and unpacked below.
model_load_kwargs = {
    "device_map": "cuda:0",
    "torch_dtype": "auto",
    "quantization_config": qconf,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from src.data.classification import SST2Dataset

# Arguments for the SST-2 evaluation dataset: the tokenizer, the parquet file to read,
# the directory holding the dataset config, and the device the model lives on.
sst2_dataset_kwargs = {
    "tokenizer": tokenizer,
    "data_path": "../../data/sst-2/test-00000-of-00001.parquet",
    "config_path": "../../data/",
    "device": model.device,
}
test_sst2_ds = SST2Dataset(**sst2_dataset_kwargs)

# Show the prompt string the dataset exposes.
test_sst2_ds.prompt

'Please perform Sentiment Classification task\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\n<INPUT>\n\nResponse:\n'

In [5]:
from src.evaluation.evaluator import TextClassificationEvaluator

# Token ids that end generation: the tokenizer's EOS token and "<|eot_id|>".
# This terminator list was taken from the HF model page (T-lite 0.1).
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

model_generate_params = {
    "max_new_tokens": 50,
    "eos_token_id": terminators,
    # Pass the pad token explicitly. Without it, each batch logs
    # "Setting `pad_token_id` to `eos_token_id`:... for open-end generation."
    # and the library falls back to the first EOS id anyway, so results are unchanged.
    # NOTE(review): assumes the evaluator forwards these args to `model.generate` — confirm.
    "pad_token_id": tokenizer.eos_token_id,
}

# Run the classification evaluation over the SST-2 dataset and display the returned score.
evaluator = TextClassificationEvaluator()
macro_f1 = evaluator.evaluate(
    model=model,
    tokenizer=tokenizer,
    eval_ds=test_sst2_ds,
    batch_size=64,
    model_generate_args=model_generate_params
)
macro_f1

  0%|          | 0/29 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/29 [00:02<01:20,  2.89s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/29 [00:05<01:13,  2.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/29 [00:08<01:09,  2.66s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 14%|█▍        | 4/29 [00:10<01:05,  2.63s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 17%|█▋        | 5/29 [00:13<01:02,  2.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 21%|██        | 6/29 [00:15<01:00,  2.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 24%|██▍       | 7/29 [00:18<00:57,  2.60s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 28%|██▊       | 8/29 [00:21<00:54,  2.60s/it]Setting `p

0.5565279674105245

In [6]:
# Ask PyTorch to release its cached, currently unused CUDA memory before the generation evaluation.
torch.cuda.empty_cache()

# Generation

In [7]:
from src.data.generation import SamsumDataset

# Locations of the SAMSum parquet file and of the directory holding the dataset config.
samsum_data_path = "../../data/samsum/test-00000-of-00001.parquet"
samsum_config_path = "../../data"

# Build the generation dataset with the shared tokenizer, placed on the model's device.
gen_ds = SamsumDataset(
    tokenizer=tokenizer,
    data_path=samsum_data_path,
    config_path=samsum_config_path,
    device=model.device,
)

# Show the prompt string the dataset exposes.
gen_ds.prompt

'INSTRUCTION:\nSummarize the following text\n\nINPUT:\n<INPUT>\n\nRESPONSE:\n'

In [8]:
from src.evaluation.evaluator import GenerationEvaluator

# Token ids that end generation: the tokenizer's EOS token and "<|eot_id|>".
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

model_generate_params = {
    "max_new_tokens": 256,
    "eos_token_id": terminators,
    # Pass the pad token explicitly. Without it, each batch logs
    # "Setting `pad_token_id` to `eos_token_id`:... for open-end generation."
    # and the library falls back to the first EOS id anyway, so results are unchanged.
    # NOTE(review): assumes the evaluator forwards these args to `model.generate` — confirm.
    "pad_token_id": tokenizer.eos_token_id,
}

# Run the generation evaluation over the SAMSum dataset and display the returned metrics.
evaluator = GenerationEvaluator()
metrics = evaluator.evaluate(
    model=model,
    tokenizer=tokenizer,
    eval_ds=gen_ds,
    batch_size=32,
    model_generate_args=model_generate_params
)
metrics

[nltk_data] Downloading package wordnet to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 1/26 [00:51<21:21, 51.24s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 2/26 [01:35<18:45, 46.90s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 12%|█▏        | 3/26 [02:07<15:27, 40.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 15%|█▌        | 4/26 [03:03<17:02, 46.48s/it]Setting `pad_token_id` to `eos_token_id`:128001 for o

{'bleu': 0.0983179127962664,
 'rouge': 0.3313492109838392,
 'meteor': 0.4484997598087172}