# Datasets using example

### This notebook will show an example of using our custmom dataset classes

In [None]:
import sys
import os
# This code enables using of "src.data" imports in vs code (when you're launching it directly from notebooks directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(project_root)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import DataLoader
import transformers
from tqdm import tqdm
from src.data.classification import SST2Dataset, SamsumDataset
from src.data.base.datasets import BaseClassificationDataset

torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x7f635819e950>

In [2]:
# Loading model weights
qconf = transformers.BitsAndBytesConfig(load_in_8bit=True)

model_name = "t-bank-ai/T-lite-instruct-0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto",
    torch_dtype="auto",
    quantization_config=qconf,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.48s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at t-bank-ai/T-lite-instruct-0.1 and are newly initialized: ['model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.l

In [3]:
model.device

device(type='cuda', index=0)

In [4]:
# initializing dataset

sst2_ds = SST2Dataset(
    tokenizer=tokenizer,
    data_path="../data/sst-2/test-00000-of-00001.parquet",
    config_path="../data/",
    device=model.device
)

In [5]:
# data length

print(len(sst2_ds))

1821


In [6]:
# you can get your prompt like that

sst2_ds.prompt

'Please perform Sentiment Classification task\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\n<INPUT>\n\nResponse:\n'

In [7]:
# getting first data sample

input_ids, attention_mask, label = next(iter(sst2_ds))
print(input_ids.shape, attention_mask.shape, label.shape)

torch.Size([99]) torch.Size([99]) torch.Size([])


In [8]:
# terminators were taken from hf model page (t-lite 0.1)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# generating answer for our sample 
# unsqueeze(0) - to make to necessary shape (when using DataLoader it'll be done automatically)
outputs = model.generate(
    input_ids=input_ids.unsqueeze(0),
    attention_mask = attention_mask.unsqueeze(0),
    max_new_tokens=50,
    eos_token_id=terminators,
)



In [9]:
# decoding the answer

ans = tokenizer.decode(outputs[0], skip_special_tokens=True)
ans

'Please perform Sentiment Classification task\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\nno movement, no yuks, not much of anything.\n\nResponse:\n<ans>negative</ans>'

In [10]:
pos = ans.find("Response:\n")
ans[pos:]   

'Response:\n<ans>negative</ans>'

In [11]:
def find_second_position(input: str, substr: str) -> int:
    first_position = input.find(substr)
    return input.find(substr, first_position + 1)

def parse_answer(model_output: str) -> str:
    left_bracket_pos = find_second_position(model_output, "<ans>")
    right_bracket_pos = find_second_position(model_output, "</ans>")
    
    if left_bracket_pos == -1 or right_bracket_pos == -1:
        return ""
    
    return model_output[left_bracket_pos + len("<ans>"):right_bracket_pos]

def evaluate_accuracy(ds: BaseClassificationDataset, batch_size: int = 64) -> float:
    # creating data loadet
    dl = DataLoader(sst2_ds, batch_size=batch_size)

    # keeping history of answers and labels
    results = []
    all_labels = []
    
    label_mapping = ds.get_labels_mapping()

    for input_ids, attention_mask, labels in tqdm(dl):
        # generating batch output
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask = attention_mask,
            max_new_tokens=50,
            eos_token_id=terminators,
        )
        
        # decoding answers
        answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
        # parsing answers        
        answers = [parse_answer(answer) for answer in answers]
        # convert string answers to labels
        labeled_answers = [label_mapping.get(answer, -1) for answer in answers]

        res = torch.Tensor(labeled_answers).type(torch.long)
        results.append(res)
        all_labels.append(labels)
        
        # torch.cuda.empty_cache()
        
    # concatenating all steps together
    results = torch.cat(results).cpu()
    all_labels = torch.cat(all_labels).cpu()
    
    accuracy = torch.mean((results == all_labels).type(torch.float))
    return accuracy.item()

In [12]:
evaluate_accuracy(sst2_ds, batch_size=128)

100%|██████████| 15/15 [17:52<00:00, 71.49s/it]


0.6990664601325989

In [13]:
# You can also use your prompt instead of basic one

my_prompt = "You will be given reviews. Determine if the given review has negative or positive sentiment."

prompted_sst2_ds = SST2Dataset(
    tokenizer=tokenizer,
    data_path="../data/sst-2/test-00000-of-00001.parquet",
    config_path="../data/",
    prompt=my_prompt
)

In [14]:
evaluate_accuracy(prompted_sst2_ds, batch_size=128)

100%|██████████| 15/15 [17:51<00:00, 71.41s/it]


0.6990664601325989

In [None]:
# you can also use generation dataset

sds = SamsumDataset(
    tokenizer=tokenizer,
    data_path="../data/samsum/test-00000-of-00001.parquet",
    config_path='../data'
)

In [16]:
print(len(sds))

819


In [20]:
input_ids, attention_mask, label = next(iter(sds))
print(input_ids.shape, attention_mask.shape, label.shape)

torch.Size([715]) torch.Size([715]) torch.Size([84])


In [22]:
outputs = model.generate(
    input_ids=input_ids.unsqueeze(0),
    attention_mask = attention_mask.unsqueeze(0),
    max_new_tokens=256,
    eos_token_id=terminators,
)

In [23]:
ans = tokenizer.decode(outputs[0], skip_special_tokens=True)
ans

"INSTRUCTION:\nSummarize the following text\n\nINPUT:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n\nRESPONSE:\nHannah asked Amanda for Betty's number. Amanda couldn't find it, but suggested asking Larry, who had called Betty recently. Hannah was hesitant to ask Larry, but Amanda reassured her that Larry was a nice person. Hannah finally agreed to ask Larry for Betty's number."