In [1]:
import sys
import os
# This code enables using of "src.data" imports in vs code (when you're launching it directly from notebooks directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import torch

# Loading model weights

model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

torch.manual_seed(42)

<torch._C.Generator at 0x7f3574140e50>

# Generation

In [None]:
project_root = os.path.abspath('/nfs/home/edyagin/CoolPrompt/')
sys.path.append(project_root)

from src.data.generation import gsm8k_dataset, math_dataset, samsum_dataset
from src.data.qa import medqa_dataset, openbookqa_dataset


gen_datasets = [
    math_dataset.MathDataset, 
    gsm8k_dataset.GSM8KDataset, 
    samsum_dataset.SamsumDataset,
    medqa_dataset.MedQADataset,
    openbookqa_dataset.OpenbookQADataset
]

ds_to_mx = {}

for ds in gen_datasets:
    train_set = ds(tokenizer, split='train')
    test_set = ds(tokenizer, split='test')
    
    print(ds.__name__)
    print(train_set.prompt)
    
    full_ds = torch.utils.data.ConcatDataset([train_set, test_set])
    label_tokens = [label_id for _, _, label_id in full_ds]
    labels_unpadded = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)
    label_tokens_unpadded = tokenizer(labels_unpadded)['input_ids']
    
    label_sizes = np.array([len(token) for token in label_tokens_unpadded])
    
    min_size = np.min(label_sizes)
    median_size = np.percentile(label_sizes, 50)
    max_size = np.max(label_sizes)
    
    min_idx = np.argmin(label_sizes)
    max_idx = np.argmax(label_sizes)
    
    ds_to_mx[ds.__name__] = max_size
    
    print(f"Min label length is {min_size} with label: {tokenizer.decode(label_tokens[min_idx], skip_special_tokens=True)}")
    print("Median label length is", median_size)
    print(f"Max label length is {max_size} with label: {tokenizer.decode(label_tokens[max_idx], skip_special_tokens=True)}")
    print("-------------------------------------------------------------")



MathDataset
Solve the math word problem

INPUT:
<INPUT>

RESPONSE:

Min label length is 0 with label: 
Median label length is 1.0
Max label length is 87 with label: \begin{pmatrix} \frac{4}{9} & -\frac{4}{9} & -\frac{2}{9} \\ -\frac{4}{9} & \frac{4}{9} & \frac{2}{9} \\ -\frac{2}{9} & \frac{2}{9} & \frac{1}{9} \end{pmatrix}
-------------------------------------------------------------
GSM8KDataset
Solve the math word problem, giving your answer as an arabic numeral.

INPUT:
<INPUT>

RESPONSE:

Min label length is 1 with label: 72
Median label length is 1.0
Max label length is 3 with label: 192000000
-------------------------------------------------------------
SamsumDataset
Summarize the following text

INPUT:
<INPUT>

RESPONSE:

Min label length is 1 with label: test
Median label length is 23.0
Max label length is 84 with label: Mousie told Joddie about the attack on Paweł Adamowicz, the President of Gdańsk, during Jurek Owsiak's Wielka Orkiestra concert. She is convinced that the ruli

In [25]:
ds_to_mx

{'MathDataset': 87,
 'GSM8KDataset': 3,
 'SamsumDataset': 84,
 'MedQADataset': 1,
 'OpenbookQADataset': 1}

In [27]:
from src.data.multi_task import BBHDataset

BBH_GENERATION_TASKS = set([
    'dyck_languages',
    'multistep_arithmetic_two',
    'object_counting',
    'word_sorting'
])

ds_to_mx = {}

for task in BBH_GENERATION_TASKS:
    train_set = BBHDataset(tokenizer, split='train').task(task)
    test_set = BBHDataset(tokenizer, split='test').task(task)
    
    print(task)
    print(train_set.prompt)
    
    full_ds = torch.utils.data.ConcatDataset([train_set, test_set])
    label_tokens = [label_id for _, _, label_id in full_ds]
    labels_unpadded = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)
    label_tokens_unpadded = tokenizer(labels_unpadded)['input_ids']
    
    label_sizes = np.array([len(token) for token in label_tokens_unpadded])
    
    min_size = np.min(label_sizes)
    median_size = np.percentile(label_sizes, 50)
    max_size = np.max(label_sizes)
    
    min_idx = np.argmin(label_sizes)
    max_idx = np.argmax(label_sizes)
    
    ds_to_mx[task] = max_size
    
    print(f"Min label length is {min_size} with label: {tokenizer.decode(label_tokens[min_idx], skip_special_tokens=True)}")
    print("Median label length is", median_size)
    print(f"Max label length is {max_size} with label: {tokenizer.decode(label_tokens[max_idx], skip_special_tokens=True)}")
    print("-------------------------------------------------------------")


    

word_sorting
Sort a list of words.

INPUT:
<INPUT>

RESPONSE:

Min label length is 3 with label: beth kenya
Median label length is 22.5
Max label length is 49 with label: aeneas colombo foothold fox garry glycerine inviolate lucre magnanimity nevada notoriety plebiscite pompey quagmire satanic scription softball spleenwort tennyson type
-------------------------------------------------------------
object_counting
Questions that involve enumerating objects and asking the model to count them.

INPUT:
<INPUT>

RESPONSE:

Min label length is 1 with label: 7
Median label length is 1.0
Max label length is 1 with label: 7
-------------------------------------------------------------
dyck_languages
 Correctly close a Dyck-n word.

INPUT:
<INPUT>

RESPONSE:

Min label length is 1 with label: ]
Median label length is 2.0
Max label length is 3 with label: } ) >
-------------------------------------------------------------
multistep_arithmetic_two
Solve multi-step arithmetic problems.

INPUT:
<INP

In [28]:
ds_to_mx

{'word_sorting': 49,
 'object_counting': 1,
 'dyck_languages': 3,
 'multistep_arithmetic_two': 3}