# Dataset usage example

### This notebook shows an example of using our custom dataset classes

In [1]:
import sys
import os
# Make the repository root importable so that "src.data" style imports resolve
# when the notebook is launched directly from the notebooks directory (e.g. in VS Code).
# The root is taken to be two levels above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Only register the path once: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import transformers
from src.data.classification import SST2Dataset
from src.data.generation import SamsumDataset
from src.data.multi_task import BBHDataset
from src.evaluation.evaluator import TextClassificationEvaluator, GenerationEvaluator

# Seed PyTorch's global random number generator.
# The seed lives in a named constant so it is easy to locate and change.
SEED = 42
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f7f3e6ba7f0>

In [2]:
# Load the tokenizer and the model weights (8-bit quantized).
model_name = "AnatoliiPotapov/T-lite-instruct-0.1"

# The tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# Quantization settings: request 8-bit loading.
qconf = transformers.BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=qconf,
    torch_dtype="auto",
    device_map="cuda:0",
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it]


In [42]:
# Show which device the loaded model reports; the same value is handed to the datasets below.
model.device

device(type='cuda', index=0)

In [47]:
# Create the SST-2 dataset with the shared tokenizer, passing along the model's device.
sst2_ds = SST2Dataset(tokenizer=tokenizer, device=model.device)

In [48]:
# Number of samples in the dataset, as reported by len().

print(len(sst2_ds))

1821


In [49]:
# The prompt template used by the dataset is available through its `prompt` attribute.

sst2_ds.prompt

'Please perform Sentiment Classification task.\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\n<INPUT>\n\nResponse:\n'

In [50]:
# Take the first sample from the dataset; it unpacks into three tensors.
input_ids, attention_mask, label = next(iter(sst2_ds))

# Print the shape of each tensor on a single line.
print(*(tensor.shape for tensor in (input_ids, attention_mask, label)))

torch.Size([99]) torch.Size([99]) torch.Size([])


In [51]:
# Stop tokens taken from the model's Hugging Face page (T-lite 0.1):
# the tokenizer's EOS token plus the "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Generate an answer for the single sample fetched above.
# unsqueeze(0) adds the leading batch dimension (a DataLoader would add it automatically).
# pad_token_id is passed explicitly: without it, `generate` falls back to the EOS id
# by itself and logs "Setting `pad_token_id` to `eos_token_id`" on every call.
outputs = model.generate(
    input_ids=input_ids.unsqueeze(0),
    attention_mask=attention_mask.unsqueeze(0),
    max_new_tokens=50,
    eos_token_id=terminators,
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [52]:
# Decode the first generated sequence back to text, skipping special tokens.
# The decoded string contains the prompt followed by the model's response.

ans = tokenizer.decode(outputs[0], skip_special_tokens=True)
ans

'Please perform Sentiment Classification task.\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\nno movement, no yuks, not much of anything.\n\nResponse:\n<ans>negative</ans>'

In [54]:
# Drop the echoed prompt: keep only the text from the "Response:\n" marker onwards.
# str.find returns -1 when the marker is absent, and ans[-1:] would then silently
# yield just the last character, so fall back to the full decoded text instead.
pos = ans.find("Response:\n")
ans[pos:] if pos != -1 else ans

'Response:\n<ans>negative</ans>'

In [55]:
# Generation settings handed to the evaluator.
model_generate_params = dict(max_new_tokens=50, eos_token_id=terminators)

# Evaluate the model on the SST-2 dataset in batches of 128.
evaluator = TextClassificationEvaluator()
metrics = evaluator.evaluate(
    eval_ds=sst2_ds,
    model=model,
    tokenizer=tokenizer,
    model_generate_args=model_generate_params,
    batch_size=128,
)

  0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 1/15 [00:04<01:04,  4.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 13%|█▎        | 2/15 [00:09<00:59,  4.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 3/15 [00:13<00:55,  4.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 27%|██▋       | 4/15 [00:18<00:50,  4.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 5/15 [00:22<00:45,  4.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 6/15 [00:27<00:41,  4.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 47%|████▋     | 7/15 [00:32<00:36,  4.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 53%|█████▎    | 8/15 [00:36<00:32,  4.59s/it]Setting `p

In [56]:
# Display the metrics returned by evaluator.evaluate for the default prompt.
metrics

{'f1': 0.5751775831182536, 'accuracy': 0.8594179022515102}

In [57]:
# A custom prompt can be supplied in place of the dataset's default one.
my_prompt = "You will be given movie reviews. Determine if the given review has negative or positive sentiment."

prompted_sst2_ds = SST2Dataset(tokenizer=tokenizer, device=model.device, prompt=my_prompt)

In [59]:
# Run the same evaluation on the dataset built with the custom prompt and show the result.
metrics = evaluator.evaluate(
    eval_ds=prompted_sst2_ds,
    model=model,
    tokenizer=tokenizer,
    model_generate_args=model_generate_params,
    batch_size=128,
)
metrics

  0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 1/15 [00:05<01:10,  5.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 13%|█▎        | 2/15 [00:10<01:05,  5.04s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 3/15 [00:15<01:03,  5.27s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 27%|██▋       | 4/15 [00:20<00:57,  5.18s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 5/15 [00:25<00:51,  5.13s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 6/15 [00:31<00:47,  5.27s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 47%|████▋     | 7/15 [00:36<00:42,  5.37s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 53%|█████▎    | 8/15 [00:41<00:36,  5.27s/it]Setting `p

{'f1': 0.6361686997429025, 'accuracy': 0.9505766062602965}

In [60]:
# Generation datasets are created the same way; here, the Samsum dataset.
sds = SamsumDataset(tokenizer=tokenizer, device=model.device)

In [61]:
# Number of samples in the generation dataset, as reported by len().
print(len(sds))

819


In [62]:
# Take the first sample from the generation dataset; it unpacks into three tensors.
input_ids, attention_mask, label = next(iter(sds))

# Print the shape of each tensor on a single line.
print(*(tensor.shape for tensor in (input_ids, attention_mask, label)))

torch.Size([712]) torch.Size([712]) torch.Size([84])


In [65]:
# Generation settings for this run: up to 256 new tokens, same stop tokens as before.
model_generate_params = dict(max_new_tokens=256, eos_token_id=terminators)

# Evaluate with the generation evaluator in batches of 32 and show the metrics.
evaluator = GenerationEvaluator()
metrics = evaluator.evaluate(
    eval_ds=sds,
    model=model,
    tokenizer=tokenizer,
    model_generate_args=model_generate_params,
    batch_size=32,
)
metrics

Downloading builder script: 100%|██████████| 7.02k/7.02k [00:00<00:00, 16.5MB/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     /nfs/home/vzhuravlev/nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /nfs/home/vzhuravlev/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /nfs/home/vzhuravlev/nltk_data...
  0%|          | 0/26 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 1/26 [00:42<17:45, 42.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 2/26 [01:22<16:19, 40.81s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 12%|█▏        | 3/26 [01:59<15:06, 39.43s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 15%|█▌        | 4/26 [02:44<15:13, 41.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 

{'bleu': 0.08601431946357946,
 'rouge': np.float64(0.3092796788507406),
 'meteor': np.float64(0.44558376449318987)}

In [3]:
# Multi-task dataset example: build the BBH dataset and select a single task from it.
ds = BBHDataset(tokenizer, device=model.device).task('boolean_expressions')

# Stop tokens: the tokenizer's EOS token plus the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Generation settings handed to the evaluator.
model_generate_params = dict(max_new_tokens=50, eos_token_id=terminators)

# Evaluate the selected task with the text classification evaluator.
evaluator = TextClassificationEvaluator()
metrics = evaluator.evaluate(
    eval_ds=ds,
    model=model,
    tokenizer=tokenizer,
    model_generate_args=model_generate_params,
    batch_size=128,
)

  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:01<00:00,  1.92s/it]


In [4]:
# Display the metrics returned by evaluator.evaluate for the selected BBH task.
metrics

{'f1': 0.4848915748311411, 'accuracy': 0.6349206349206349}