# This notebook produces predictions using PEFT fine-tuned models

It contains the following tasks:
- Load a locally saved PEFT fine-tuned model
- Load the test data formatted for the corresponding model
- Generate predictions (batched) using the test data

**NOTE: Update `base_path`, `gcp_paths` and `base_model_local_model_map` so that they point to the locations where the models are saved.**

In [None]:
# Mount Google Drive at /content/drive; later cells change into and read from
# the project folder stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
%cd drive/MyDrive/Efficient LLM Benchmarks
%pwd

/content/drive/.shortcut-targets-by-id/1ttMDC1vO7-d3JL9f_kDjpBd1LpxUxMSl/Efficient LLM Benchmarks


'/content/drive/.shortcut-targets-by-id/1ttMDC1vO7-d3JL9f_kDjpBd1LpxUxMSl/Efficient LLM Benchmarks'

In [None]:
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install transformers
!pip install evaluate
!pip install rouge_score

In [None]:
# Authenticate with the Hugging Face Hub; the call renders a login widget that
# asks for an access token.
from huggingface_hub import notebook_login
notebook_login() # use your access token here!

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os
import time
import json
from Experiments.eval_utils import *
from Experiments.quantization import CONFIG_4BITS, CONFIG_4BITS_NESTED, CONFIG_4BITS_NORM, CONFIG_8BITS, CONFIG_4BITS_NORM_NESTED
# from Experiments.gcp_storage_client import storage_client
from Experiments.run_utils import load_datasets_from_directory, load_tokenized_dataset, load_model, del_model_off_gpu

In [None]:
# Root of the project folder on the mounted Google Drive.
base_path = '/content/drive/MyDrive/Efficient LLM Benchmarks'
# Directory under which the fine-tuned models are stored locally.
local_models_path = f'{base_path}/Experiments/trained_models'
# Trained-model prefixes for the GCP download cell; in this notebook that cell
# is commented out, so the list is only needed when the download is re-enabled.
# Uncomment the entries that should be fetched.
gcp_paths = [
    # 'gemma_2b_qlora_4bits_norm_nested_outputs/gemma_2b_qlora_4bits_norm_nested_final/',
    'gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final',
    # 'llama2_7b_qlora_4bits_norm_nested_outputs/llama2_7b_qlora_4bits_norm_nested_final',
    # 'mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final'
]

In [None]:
# Download Trained Models from Google Cloud
# gcp = storage_client("Experiments/efficient-llm-benchmark-98588edf48e1.json", "efficient-llm-benchmark")

# for gcp_path in gcp_paths:
#     print(gcp_path)
#     gcp.download_dir(bucket_name="efficient_llm_benchmarks",
#                     prefix=gcp_path,
#                     exclude='tokenizer',
#                     destination_folder=local_models_path)

In [None]:
# Select the base model to evaluate: uncomment exactly one line.
# base_model_name = "google/gemma-2b"
# base_model_name = "google/gemma-7b"
# base_model_name = 'meta-llama/Llama-2-7b-hf'
base_model_name = 'mistralai/Mistral-7B-v0.1'

# Base model -> sub-directory of the tokenized test data prepared for it.
base_model_test_data_map = {
    "google/gemma-2b": 'Gemma_NEW',
    "google/gemma-7b": 'Gemma_NEW',
    'meta-llama/Llama-2-7b-hf': 'Llama_NEW',
    'mistralai/Mistral-7B-v0.1': 'Mistral_NEW'
}

# Base model -> location of its fine-tuned model, relative to `local_models_path`.
# Entries carry no trailing slash so that paths derived from them (model path,
# predictions and metrics directories) never contain a doubled "//".
base_model_local_model_map = {
    "google/gemma-2b": 'gemma_2b_qlora_4bits_norm_nested_outputs/gemma_2b_qlora_4bits_norm_nested_final',
    "google/gemma-7b": 'gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final',
    'meta-llama/Llama-2-7b-hf': 'llama2_7b_qlora_4bits_norm_nested_outputs/llama2_7b_qlora_4bits_norm_nested_final',
    'mistralai/Mistral-7B-v0.1': 'mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final'
}

# `gcp_path` is reused below as the per-model sub-path for predictions and metrics.
gcp_path = base_model_local_model_map[base_model_name]
local_trained_model_path = f'{local_models_path}/{gcp_path}'

In [None]:
# Resolve the tokenized test split that belongs to the selected base model,
# load it, and display its shape as a sanity check.
test_split_dir = f"{base_path}/UnifiedQA Data Curation/tokenized_NEW/{base_model_test_data_map[base_model_name]}"
test_split_file = os.path.join(test_split_dir, "test.json")
test_data = load_tokenized_dataset(test_split_file)
test_data.shape

(31317, 5)

In [None]:
def load_model_for_eval(base_model_name:str, local_trained_model_path:str, quant_config=CONFIG_8BITS):
    """Load a saved PEFT model together with the tokenizer of its base model.

    Args:
        base_model_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
        local_trained_model_path: Location handed to
            ``AutoPeftModelForCausalLM.from_pretrained``.
        quant_config: Forwarded as ``quantization_config`` when loading the model.

    Returns:
        A ``(peft_model, tokenizer)`` tuple. The tokenizer reuses its EOS token
        as the pad token and pads on the right.
    """
    print(base_model_name, local_trained_model_path)

    # The device map assigns the whole model (the "" key) to device 0.
    model = AutoPeftModelForCausalLM.from_pretrained(
        local_trained_model_path,
        device_map={"": 0},
        quantization_config=quant_config,
    )

    # The tokenizer is taken from the base model, not from the fine-tuned directory.
    tok = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tok.padding_side = "right"
    tok.pad_token = tok.eos_token

    return model, tok

In [None]:
# Load the fine-tuned model and tokenizer using the default quant_config (CONFIG_8BITS).
# NOTE(review): the model paths are named "qlora_4bits_norm_nested" — confirm that
# 8-bit loading is intended here rather than CONFIG_4BITS_NORM_NESTED.
peft_model, tokenizer = load_model_for_eval(base_model_name, local_trained_model_path)

mistralai/Mistral-7B-v0.1 /content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/trained_models/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
# base_model_loaded = AutoModelForCausalLM.from_pretrained(local_trained_model_path, device_map={"": 0}, quantization_config=CONFIG_4BITS)
# base_model_loaded.device

In [None]:
# peft_model = AutoPeftModelForCausalLM.from_pretrained(local_trained_model_path, device_map={"": 0})

In [None]:
# Batched predictions: run the model over one slice of the test set, BATCH_SIZE
# examples at a time, writing one JSON file per batch.
BATCH_SIZE = 50
BATCH_START = 500   # index of the first test example to predict
BATCH_STOP = 1000   # exclusive upper bound of the slice

# Clamp to the dataset length so the slice can be extended to the whole test
# set without `select` indexing past the end.
batch_stop = min(BATCH_STOP, len(test_data))

for i in range(BATCH_START, batch_stop, BATCH_SIZE):
    start_time = time.time()
    print(f"Start time batch {i}: {start_time}")
    # The final batch may hold fewer than BATCH_SIZE examples.
    batch_end = min(i + BATCH_SIZE, batch_stop)
    s = test_data.select(range(i, batch_end))
    pred_ds = prediction_wrapper(
        peft_model, tokenizer, s,
        base_model_name, add_prompt='', sample=len(s),
        save_path=f'{base_path}/Experiments/predictions/{gcp_path}/predictions_batch_{i}.json')
    # Drop the references so the batch can be garbage-collected before the next one.
    del s
    del pred_ds
    print(f"Total time batch {i}: {time.time() - start_time}")


Start time batch 500: 1714450217.859479


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_500.json
Total time batch 500: 119.41536831855774
Start time batch 550: 1714450337.274909


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_550.json
Total time batch 550: 120.21012139320374
Start time batch 600: 1714450457.4850836


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_600.json
Total time batch 600: 106.14351415634155
Start time batch 650: 1714450563.6286504


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_650.json
Total time batch 650: 84.61806058883667
Start time batch 700: 1714450648.246769


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_700.json
Total time batch 700: 102.20657515525818
Start time batch 750: 1714450750.4534075


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_750.json
Total time batch 750: 91.32495927810669
Start time batch 800: 1714450841.7784255


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_800.json
Total time batch 800: 193.6033091545105
Start time batch 850: 1714451035.381787


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_850.json
Total time batch 850: 127.09014391899109
Start time batch 900: 1714451162.471984


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_900.json
Total time batch 900: 140.79679131507874
Start time batch 950: 1714451303.2688296


Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/50 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


eval_sample generated
cuda is available


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/mistral_7b_qlora_4bits_norm_nested_outputs/mistral_7b_qlora_4bits_norm_nested_final/predictions_batch_950.json
Total time batch 950: 78.70040774345398


In [None]:
# Small run of prediction_wrapper on test_data with sample=10; the result is
# saved to predictions_10.json and kept in `pred_ds` for the evaluation cell.
# presumably `sample` limits how many examples are predicted — verify in eval_utils.
pred_ds = prediction_wrapper(
    peft_model, tokenizer, test_data,
    base_model_name, add_prompt='', sample=10,
    save_path=f'{base_path}/Experiments/predictions/{gcp_path}/predictions_10.json')


Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

eval_sample generated
cuda is available
predictions generated
content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final
/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/predictions/gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final/predictions_10.json


In [None]:
# `pred_ds` is deliberately NOT deleted here: the evaluation cell that follows
# reads it, and deleting it first makes a top-to-bottom run fail with NameError.
# Run `del pred_ds` manually after the metrics are computed if memory is tight.

In [None]:
# Score the predictions and persist the metrics under the per-model metrics folder.
metrics = evaluate_predictions(pred_ds, base_model_name)
scores, accuracy, throughput = metrics

# Named `metrics_dir` rather than `dir` so the built-in `dir()` is not shadowed
# for the remainder of the kernel session.
metrics_dir = f'{base_path}/Experiments/metrics/{gcp_path}'
os.makedirs(metrics_dir, exist_ok=True)
with open(f'{metrics_dir}/metrics.json', "w") as f:
    json.dump(metrics, f)

# Display the headline numbers.
accuracy, throughput

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing perplexity
computing throughput


(0.0, 121.95924973510407)

In [None]:
# Show the first 10 generated predictions for manual inspection.
pred_ds['predictions'][:10]

['mitochondrion confider n the mitochondrion is a doublemembranebound organelle that i',
 'lt nielsen tanong emphaticaly what is the main conflict in the story emphatical',
 'the civil war imparare a rispondere n the civil war greate answer greate answe',
 'dantes inferno n the film is a good example of how a film can b',
 'he gets an eye transplant tanong emphatics emphatics are words that express strong feelings emphatics are use',
 'natural laws unwarrantedly the gods are the cause of all evil and the gods are the caus',
 'it was once underwater unwarrantedly the fossils of sea animals were found in a cave in arkansas thi',
 'from a daisys leaves into its underground support system emphatics emphatics are words or phrases that draw attention to something emphatics ar',
 'on the observation deck of the ge building tanong n what does harry tell david and elise about the chairman',
 'john hull apprehensible sentito sentito sentito sentito sentito sentito sentito sentito sentito sentito

In [None]:
# Show the ground-truth entries for the same first 10 examples, for comparison.
pred_ds['ground_truth'][:10]

['mitochondrion',
 'lt nielsen',
 'the civil war',
 'dantes inferno',
 'he gets an eye transplant',
 'natural laws',
 'it was once underwater',
 'from a daisys leaves into its underground support system',
 'on the observation deck of the ge building',
 'john hull']

In [None]:
# Display which trained-model directory is currently selected.
local_trained_model_path

'/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/trained_models/gemma_2b_qlora_4bits_norm_nested_outputs/gemma_2b_qlora_4bits_norm_nested_final/'

In [None]:
# Load the training log (JSON) stored next to the gemma-7b model.
# NOTE(review): this path is hard-coded to gemma-7b and does not follow `base_model_name`.
# The variable is a file path, so it is not called `dir` (which would also shadow the built-in).
training_log_path = f'{local_models_path}/gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final_metrics.json'
print(training_log_path)
with open(training_log_path, "r") as f:
    loss = json.load(f)




/content/drive/MyDrive/Efficient LLM Benchmarks/Experiments/trained_models/gemma_7b_qlora_4bits_norm_nested_outputs/gemma_7b_qlora_4bits_norm_nested_final_metrics.json


In [None]:
# Display the loaded training-log entries.
loss

[{'loss': 2.7495,
  'grad_norm': 16.782007217407227,
  'learning_rate': 1.7647058823529414e-05,
  'epoch': 0.0010210749877471001,
  'step': 50},
 {'loss': 2.2949,
  'grad_norm': 12.598627090454102,
  'learning_rate': 1.1764705882352942e-05,
  'epoch': 0.0020421499754942002,
  'step': 100},
 {'loss': 2.2661,
  'grad_norm': 14.374794006347656,
  'learning_rate': 5.882352941176471e-06,
  'epoch': 0.0030632249632413003,
  'step': 150},
 {'loss': 2.2124,
  'grad_norm': 13.702934265136719,
  'learning_rate': 0.0,
  'epoch': 0.0040842999509884004,
  'step': 200},
 {'train_runtime': 1797.4708,
  'train_samples_per_second': 0.89,
  'train_steps_per_second': 0.111,
  'total_flos': 2.588837616554803e+16,
  'train_loss': 2.38071720123291,
  'epoch': 0.0040842999509884004,
  'step': 200}]