# Dataset usage example

### This notebook shows an example of using our custom dataset classes and evaluators

In [1]:
import sys
import os
# Make the repository root importable so that "src.*" imports work when the notebook
# is launched directly from the notebooks directory (e.g. in VS Code).
# NOTE: this relies on the current working directory being two levels below the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import transformers
from src.data.classification import SST2Dataset
from src.data.generation import SamsumDataset
from src.data.multi_task import BBHDataset
from src.evaluation.evaluator import TextClassificationEvaluator, GenerationEvaluator

# Fix the global PyTorch RNG seed; the returned Generator is this cell's displayed value
SEED = 42
torch.manual_seed(SEED)

INFO 03-20 11:37:59 __init__.py:190] Automatically detected platform cuda.


<torch._C.Generator at 0x7f0a88348410>

In [2]:
# Load the tokenizer and the model weights

model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
device = "cuda:0"

# The tokenizer is configured to pad on the left
# (presumably so that batched generation continues right after the prompt - TODO confirm)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# Place the half-precision weights directly on the target device
model_load_kwargs = {
    "device_map": device,
    "torch_dtype": "float16",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Build the SST-2 dataset; no prompt is passed, so the dataset's basic prompt is used

sst2_kwargs = dict(tokenizer=tokenizer, device=device)
sst2_ds = SST2Dataset(**sst2_kwargs)

In [4]:
# Number of samples in the dataset

sst2_size = len(sst2_ds)
print(sst2_size)

1821


In [5]:
# The prompt template used by the dataset is available through its `prompt` attribute.
# In the displayed value, <INPUT> appears to be the placeholder that is replaced by each sample's text.

sst2_ds.prompt

'Please perform Sentiment Classification task.\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\n<INPUT>\n\nResponse:\n'

In [6]:
# Fetch the first sample: a (token ids, attention mask, label) triple

first_sample = next(iter(sst2_ds))
input_ids, attention_mask, label = first_sample
print(*(tensor.shape for tensor in first_sample))

torch.Size([99]) torch.Size([99]) torch.Size([])


In [7]:
# Stop tokens were taken from the HF model page (t-lite 0.1)

eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

# Generate an answer for our single sample.
# unsqueeze(0) adds the leading batch dimension (a DataLoader would add it automatically).
single_sample_inputs = {
    "input_ids": input_ids.unsqueeze(0),
    "attention_mask": attention_mask.unsqueeze(0),
}
outputs = model.generate(
    **single_sample_inputs,
    max_new_tokens=50,
    eos_token_id=terminators,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [8]:
# Decode the generated sequence back to text, dropping special tokens.
# The decoded string contains the prompt followed by the model's response.

generated_ids = outputs[0]
ans = tokenizer.decode(generated_ids, skip_special_tokens=True)
ans

'Please perform Sentiment Classification task.\n\nAnswer using the label from [negative, positive].\nGenerate the final answer bracketed with <ans> and </ans>.\n\nThe input:\nno movement, no yuks, not much of anything.\n\nResponse:\n<ans>negative</ans>'

In [9]:
# Keep only the model's reply: everything from the "Response:\n" marker onwards.
# str.find returns -1 when the marker is absent, and ans[-1:] would then silently
# yield just the last character, so fall back to the whole decoded text instead.
marker = "Response:\n"
pos = ans.find(marker)
ans[pos:] if pos != -1 else ans

'Response:\n<ans>negative</ans>'

*Huggingface evaluation*

In [10]:
# Generation settings handed to the evaluator as `model_generate_args`
model_generate_params = dict(max_new_tokens=50, eos_token_id=terminators)

# Evaluate the Hugging Face model on SST-2; the returned metrics are the cell output
evaluator = TextClassificationEvaluator()
hf_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_ds=sst2_ds,
    batch_size=64,
    model_generate_args=model_generate_params,
)
evaluator.evaluate(**hf_eval_kwargs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/29 [00:01<00:52,  1.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/29 [00:03<00:46,  1.72s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/29 [00:05<00:43,  1.67s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 14%|█▍        | 4/29 [00:06<00:41,  1.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 17%|█▋        | 5/29 [00:08<00:39,  1.63s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 21%|██        | 6/29 [00:09<00:37,  1.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 24%|██▍       | 7/29 [00:11<00:35,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 28%|██▊       | 8/29 [00:13<00:33,  1.61s/it]Setting `pad_token_id` to `eos_token_id`:128001 

{'f1': 0.6061193982765608, 'accuracy': 0.9093904448105437}

In [11]:
# A custom prompt can be used instead of the basic one

my_prompt = "You will be given movie reviews. Determine if the given review has negative or positive sentiment."

prompted_sst2_ds = SST2Dataset(tokenizer=tokenizer, prompt=my_prompt, device=device)

# Same evaluation as before, now on the dataset built with the custom prompt
prompted_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_ds=prompted_sst2_ds,
    batch_size=64,
    model_generate_args=model_generate_params,
)
prompted_metrics_hf = evaluator.evaluate(**prompted_eval_kwargs)
prompted_metrics_hf

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 1/29 [00:01<00:53,  1.90s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 2/29 [00:03<00:51,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 10%|█         | 3/29 [00:05<00:49,  1.89s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 14%|█▍        | 4/29 [00:07<00:46,  1.84s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 17%|█▋        | 5/29 [00:09<00:43,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 21%|██        | 6/29 [00:10<00:41,  1.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 24%|██▍       | 7/29 [00:12<00:39,  1.79s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 28%|██▊       | 8/29 [00:14<00:38,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 

{'f1': 0.9609998488752062, 'accuracy': 0.9610104338275672}

*vllm evaluation*

In [12]:
import gc

# Release the Hugging Face model before the vLLM engine is created in the next cell.
del model
# Run the garbage collector first so objects kept alive only by reference cycles are
# destroyed, and only then ask PyTorch to release its cached, now-unused CUDA memory
# (same order as the vLLM teardown cell later in this notebook).
unreachable_objects = gc.collect()
torch.cuda.empty_cache()
# Cell output: the number of unreachable objects found by the collector
unreachable_objects

174

In [13]:
from vllm import LLM

# Load the same checkpoint again, this time as an in-process vLLM engine in float16.
# Note that `model` now refers to a vLLM `LLM` object, not the Hugging Face model.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the checkpoint
# to run locally - confirm it is actually required for this model.
vllm_engine_kwargs = dict(
    model=model_name,
    dtype=torch.float16,
    trust_remote_code=True,
)
model = LLM(**vllm_engine_kwargs)

INFO 03-20 11:40:08 config.py:542] This model supports multiple tasks: {'embed', 'reward', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 03-20 11:40:08 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='AnatoliiPotapov/T-lite-instruct-0.1', speculative_config=None, tokenizer='AnatoliiPotapov/T-lite-instruct-0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=AnatoliiPotapov/T-lite-inst

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 03-20 11:40:19 model_runner.py:1115] Loading model weights took 14.9605 GB
INFO 03-20 11:40:21 worker.py:267] Memory profiling takes 1.64 seconds
INFO 03-20 11:40:21 worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.90) = 21.27GiB
INFO 03-20 11:40:21 worker.py:267] model weights take 14.96GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 1.22GiB; the rest of the memory reserved for KV Cache is 5.07GiB.
INFO 03-20 11:40:22 executor_base.py:110] # CUDA blocks: 2593, # CPU blocks: 2048
INFO 03-20 11:40:22 executor_base.py:115] Maximum concurrency for 8192 tokens per request: 5.06x
INFO 03-20 11:40:25 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:18<00:00,  1.92it/s]

INFO 03-20 11:40:43 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.24 GiB
INFO 03-20 11:40:43 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 23.52 seconds





In [14]:
# Generation settings for vLLM; the keys differ from the Hugging Face ones used earlier
# (max_tokens / stop_token_ids instead of max_new_tokens / eos_token_id)
vllm_generate_args = dict(max_tokens=50, stop_token_ids=terminators)

# Evaluate the custom-prompt SST-2 dataset through the vLLM code path
vllm_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_ds=prompted_sst2_ds,
    batch_size=64,
    model_generate_args=vllm_generate_args,
)
prompted_metrics_vllm = evaluator.evaluate_vllm(**vllm_eval_kwargs)
prompted_metrics_vllm

100%|██████████| 29/29 [00:32<00:00,  1.13s/it]


{'f1': 0.9642991254620392, 'accuracy': 0.9643053267435475}

In [15]:
# Generation datasets are created the same way as the classification one

samsum_kwargs = dict(tokenizer=tokenizer, device=device)
sds = SamsumDataset(**samsum_kwargs)

In [16]:
# Number of samples in the generation dataset
samsum_size = len(sds)
print(samsum_size)

819


In [17]:
# First sample of the generation dataset; here the label is itself a token sequence
samsum_sample = next(iter(sds))
input_ids, attention_mask, label = samsum_sample
print(*(tensor.shape for tensor in samsum_sample))

torch.Size([712]) torch.Size([712]) torch.Size([84])


In [18]:
# vLLM generation settings for the generation task: up to 256 new tokens per sample
model_generate_params = dict(max_tokens=256, stop_token_ids=terminators)

# Switch to the generation evaluator; the cell output shows BLEU / ROUGE / METEOR
evaluator = GenerationEvaluator()
generation_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_ds=sds,
    batch_size=32,
    model_generate_args=model_generate_params,
)
metrics = evaluator.evaluate_vllm(**generation_eval_kwargs)
metrics

[nltk_data] Downloading package wordnet to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /nfs/home/edyagin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|██████████| 26/26 [02:04<00:00,  4.80s/it]


{'bleu': 0.0884306294872872,
 'rouge': 0.3109028862801102,
 'meteor': 0.4495479453172058}

In [19]:
# Multi-task dataset example: build the BBH dataset and select a single task from it

ds = BBHDataset(tokenizer, device=device).task('boolean_expressions')

# Stop tokens (same values as defined earlier in the notebook)
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

model_generate_params = dict(max_tokens=50, stop_token_ids=terminators)

# The selected task is scored with the classification evaluator via vLLM
evaluator = TextClassificationEvaluator()
bbh_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_ds=ds,
    batch_size=128,
    model_generate_args=model_generate_params,
)
metrics = evaluator.evaluate_vllm(**bbh_eval_kwargs)

100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


In [20]:
# Display the metrics computed for the BBH task in the previous cell
metrics

{'f1': 0.5625, 'accuracy': 0.6825396825396826}

*vllm server evaluation*

In [21]:
import ray
import contextlib

from vllm.distributed.parallel_state import (
    destroy_model_parallel,
    destroy_distributed_environment,
)

# Tear down the in-process vLLM engine (the `model` object) and free its memory
# before the next cell evaluates against a separately launched vLLM server.
# NOTE: `gc` and `torch` are imported in earlier cells.
destroy_model_parallel()
destroy_distributed_environment()
# Drop the engine's executor explicitly, then the LLM object itself
del model.llm_engine.model_executor
del model
# destroy_process_group() can raise AssertionError here; that case is deliberately ignored
# (presumably it happens when no process group is initialized - TODO confirm)
with contextlib.suppress(AssertionError):
    torch.distributed.destroy_process_group()
# Collect garbage first, then release PyTorch's cached CUDA memory
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): Ray is shut down as well - confirm vLLM actually started Ray in this setup
ray.shutdown()

In [22]:
# The vLLM server was started beforehand with this command:
# vllm serve "AnatoliiPotapov/T-lite-instruct-0.1" --dtype half

# Evaluate through the server: the model is referenced by name instead of by object
server_eval_kwargs = dict(
    model_name=model_name,
    tokenizer=tokenizer,
    eval_ds=prompted_sst2_ds,
    batch_size=64,
    model_generate_args=vllm_generate_args,
)
metrics_vllm_server = evaluator.evaluate_vllm_server(**server_eval_kwargs)
metrics_vllm_server

  0%|          | 0/29 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

{'f1': 0.6167765560167768, 'accuracy': 0.8934651290499726}