In [1]:
import json
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import sys
from datetime import datetime
import random

import numpy as np
import torch
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import set_seed as hf_set_seed


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasets = ["gov_report", "summ_screen_fd", "qmsum", "qasper","narrative_qa", "quality"]

In [3]:
model_to_max_input_tokens = {
    "Qwen/Qwen2.5-1.5B-Instruct": 8192,
    "MBZUAI/LaMini-GPT-1.5B" : 512,
    "/assets/models/meta-llama-2-chat-7b" : 8192,
    "instruction-pretrain/InstructLM-1.3B":2048,
    "nvidia/AceInstruct-1.5B": 8192,
    "/assets/models/meta-llama-3.2-instruct-3b": 8192
    
}

In [4]:
def trim_doc_keeping_suffix(tokenizer, tokenized_input_full, example, suffix_index, max_tokens, device):
    seperator_and_suffix = f"{example['truncation_seperator'].strip()}\n\n{example['input'][suffix_index:].strip()}\n"
    tokenized_seperator_and_suffix = tokenizer(seperator_and_suffix, return_tensors="pt").input_ids.to(device)
    tokenized_input_trimmed = tokenized_input_full[:, :max_tokens - tokenized_seperator_and_suffix.shape[1]]
    tokenized_input = torch.cat([tokenized_input_trimmed, tokenized_seperator_and_suffix], dim=1)
    return tokenized_input

In [5]:
def process_model_input(tokenizer, example, max_tokens, device):
    tokenized_input_full = tokenizer(example["input"], return_tensors="pt").input_ids.to(device)
    if tokenized_input_full.shape[1] <= max_tokens:
        return tokenized_input_full

    seperator_and_query_text = example['truncation_seperator'] + example["input"][example['query_start_index']:]
    tokenized_seperator_and_query = tokenizer(seperator_and_query_text, return_tensors="pt").input_ids.to(device)
    input_without_query = example['input'][:example['query_start_index']]
    tokenized_input_without_query = tokenizer(input_without_query, return_tensors="pt").input_ids.to(device)
    tokenized_input_without_query = tokenized_input_without_query[:,
                                    :max_tokens - tokenized_seperator_and_query.shape[1]]

    tokenized_input = torch.cat([tokenized_input_without_query, tokenized_seperator_and_query], dim=1)
    
    return tokenized_input

In [6]:
model_name = "/assets/models/meta-llama-3.2-instruct-3b"
generations_dir = "generations/ipynb"
max_examples_per_task = -1


In [7]:
seed = 43
random.seed(seed)
np.random.seed(seed)
hf_set_seed(seed)
print("Params:")
print(f"model: {model_name}")
generations_dir = os.path.join(generations_dir, model_name.replace("/", "_").replace("-", "_"))
print(f"generations_dir: {generations_dir}")
print(f"max_examples_per_task: {max_examples_per_task}")
print("=" * 50)
time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
print(f"time as start: {time}")

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
print(f"Loading model: {model_name}")
device = "cuda" if torch.cuda.is_available() else "cpu"

max_input_length = model_to_max_input_tokens[model_name]

model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto",
                                                       torch_dtype=torch.float16)



Params:
model: /assets/models/meta-llama-3.2-instruct-3b
generations_dir: generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
max_examples_per_task: -1
time as start: 02_03_2025_19_21_16
Loading tokenizer
Loading model: /assets/models/meta-llama-3.2-instruct-3b


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]


In [8]:
model = model.eval()

print(f"{model} model loaded!, device:{model.device}")

print("Will write to:", generations_dir)
os.makedirs(generations_dir, exist_ok=True)
for dataset in datasets:
    generations = dict()
    input_task = dict()
    output_task = dict()
    print(f"Processing {dataset}")
    time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    print(f"time as start {dataset}: {time}")
    print(f"Loading {dataset}")
    data = load_dataset("tau/zero_scrolls", dataset, cache_dir="/home/athul/datasets_cache")
    print(f"Loaded {dataset}")

    for i, example in enumerate(data["validation"]):

        if 0 < max_examples_per_task == i:
            print(f"Reached {max_examples_per_task} for {dataset}. Breaking")
            break

        model_input = process_model_input(tokenizer, example, max_input_length, device)

        prediction_token_ids = model.generate(model_input,
                                                  max_new_tokens=512,
                                                  do_sample=False,
                                                  top_p=0,
                                                  top_k=0,
                                                  temperature=1)

        predicted_text = tokenizer.decode(prediction_token_ids[0][model_input.shape[1]:], skip_special_tokens=True)
        generations[example["id"]] = predicted_text
        input_task[example["id"]] = example["input"]
        output_task[example["id"]] = example["output"]

    out_file_path_pred = os.path.join(generations_dir, f"{dataset}.json")
    # out_file_path_input = os.path.join(generations_dir, f"input_{dataset}.json")
    # out_file_path_output = os.path.join(generations_dir, f"output_{dataset}.json")
    # with open(out_file_path_input, 'w') as f_out:
    #     json.dump(input_task, f_out, indent=4)
    # with open(out_file_path_output, 'w') as f_out:
    #     json.dump(output_task, f_out, indent=4)
    with open(out_file_path_pred, 'w') as f_out:
        json.dump(generations, f_out, indent=4)
    


    print(f"Done generating {len(generations)} examples from {dataset}")
    time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    print(f"time at end: {time}")
    print(f"Look for predictions in {generations_dir}")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Loaded gov_report


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 20 examples from gov_report
time at end: 02_03_2025_19_25_34
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
Processing summ_screen_fd
time as start summ_screen_fd: 02_03_2025_19_25_34
Loading summ_screen_fd


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded summ_screen_fd


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 20 examples from summ_screen_fd
time at end: 02_03_2025_19_27_57
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
Processing qmsum
time as start qmsum: 02_03_2025_19_27_57
Loading qmsum


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded qmsum


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 20 examples from qmsum
time at end: 02_03_2025_19_30_06
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
Processing qasper
time as start qasper: 02_03_2025_19_30_06
Loading qasper


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded qasper


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 19 examples from qasper
time at end: 02_03_2025_19_31_21
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
Processing narrative_qa
time as start narrative_qa: 02_03_2025_19_31_21
Loading narrative_qa
Loaded narrative_qa


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 10 examples from narrative_qa
time at end: 02_03_2025_19_31_57
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b
Processing quality
time as start quality: 02_03_2025_19_31_57
Loading quality


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Loaded quality


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Done generating 21 examples from quality
time at end: 02_03_2025_19_32_25
Look for predictions in generations/ipynb/_assets_models_meta_llama_3.2_instruct_3b


In [18]:
2+2

4