# This notebook loads predictions and calculates performance metrics and throughput

It expects loaded predictions to be in the same format as is produced in the `Predict_PEFT_Model.ipynb` notebook. I.e. a .json file with one list of predictions, one list of ground truths, one list saying which dataset the sample came from, and one list saying the throughput when generating the prediction.

In [1]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%load_ext autoreload
%autoreload 2
%cd drive/MyDrive/Efficient LLM Benchmarks

/content/drive/.shortcut-targets-by-id/1ttMDC1vO7-d3JL9f_kDjpBd1LpxUxMSl/Efficient LLM Benchmarks


In [3]:
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install transformers
!pip install evaluate
!pip install rouge_score

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-any

In [2]:
import os
import json
import time
import torch
import numpy as np

import datasets

# from Experiments.gcp_storage_client import storage_client
from Experiments.eval_utils import evaluate_predictions, load_saved_predictions

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [4]:
# Project root. '.' relies on the working directory already being the project
# folder; the absolute Colab equivalent is
# '/content/drive/MyDrive/Efficient LLM Benchmarks'.
base_path = '.'
# Folder that is scanned later for per-run `*_final_metrics.json` files.
local_models_path = base_path + '/Experiments/trained_models'

In [5]:
# gcp = storage_client("Experiments/efficient-llm-benchmark-98588edf48e1.json", "efficient-llm-benchmark")
# bucket = gcp.client.get_bucket("efficient_llm_benchmarks")
# blobs = bucket.list_blobs()
# for blob in blobs:
#     if 'final_metrics.json' in blob.name:
#         dest = '/'.join(blob.name.split('/')[:-1])
#         os.makedirs(f'{local_models_path}/{dest}', exist_ok=True)
#         blob.download_to_filename(f'{local_models_path}/{blob.name}')  # Download

In [19]:
def _prediction_dirs(model_tag, peft_methods, variant='4bits_norm_nested_linear'):
    """Build the relative prediction directories for one base model.

    Each entry has the form
    '<model_tag>_<peft>_<variant>_outputs/<model_tag>_<peft>_<variant>_final',
    returned in the same order as `peft_methods`.
    """
    return [
        f'{model_tag}_{peft}_{variant}_outputs/{model_tag}_{peft}_{variant}_final'
        for peft in peft_methods
    ]


# Maps each base model id to the prediction directories of its fine-tuned runs.
base_model_pred_map = {
    "google/gemma-2b": _prediction_dirs(
        'gemma_2b', ['qlora', 'adalora', 'ia3', 'prompt_tuning']
    ),
    # The gemma-7b QLoRA run is the only one whose name lacks the '_linear' part.
    "google/gemma-7b": (
        _prediction_dirs('gemma_7b', ['qlora'], variant='4bits_norm_nested')
        + _prediction_dirs('gemma_7b', ['prompt_tuning'])
    ),
    'meta-llama/Llama-2-7b-hf': _prediction_dirs(
        'llama2_7b', ['adalora', 'qlora', 'ia3', 'prompt_tuning']
    ),
    'mistralai/Mistral-7B-v0.1': _prediction_dirs(
        'mistral_7b', ['adalora', 'qlora', 'ia3', 'prompt_tuning']
    ),
}

In [23]:
# Result container: one initially empty dict per base model id; later cells
# store per-method metric values inside these dicts.
metrics = {
    model_id: {}
    for model_id in (
        "google/gemma-2b",
        "google/gemma-7b",
        'meta-llama/Llama-2-7b-hf',
        'mistralai/Mistral-7B-v0.1',
    )
}

In [31]:
# Select the base model whose prediction runs the evaluation cell processes.
# Keep exactly one line uncommented (presumably this cell and the evaluation
# cell are re-run once per model to fill `metrics` — confirm).
# base_model_name = "google/gemma-2b"
# base_model_name = "google/gemma-7b"
# base_model_name = 'meta-llama/Llama-2-7b-hf'
base_model_name = 'mistralai/Mistral-7B-v0.1'

In [32]:
# Evaluate every saved prediction run of the selected base model and store
# its scores in `metrics[base_model_name]`, keyed by the run's final path part.
for gcp_path in base_model_pred_map[base_model_name]:

    # Prediction files are named by their starting offset:
    # predictions_batch_0.json, predictions_batch_100.json, ..., predictions_batch_400.json
    BATCH_SIZE = 100
    pred_ds_list = [
        load_saved_predictions(
            f'{base_path}/Experiments/predictions/{gcp_path}/predictions_batch_{offset}.json'
        )
        for offset in range(0, 500, BATCH_SIZE)
    ]
    pred_ds = datasets.concatenate_datasets(pred_ds_list)

    m = evaluate_predictions(pred_ds, base_model_name)
    method = gcp_path.rsplit('/', 1)[-1]
    print(method)
    # `m` is indexed rather than unpacked: only its first three entries are used.
    metrics[base_model_name][method] = {
        'avg_score': np.mean(m[0]),
        'accuracy': m[1],
        'throughput': m[2],
    }
metrics

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing perplexity
computing throughput
mistral_7b_adalora_4bits_norm_nested_linear_final


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing perplexity
computing throughput
mistral_7b_qlora_4bits_norm_nested_linear_final


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing perplexity
computing throughput
mistral_7b_ia3_4bits_norm_nested_linear_final


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for summarization
computing similarity for multiple choice
computing similarity for multiple choice
computing similarity for multiple choice
computing perplexity
computing throughput
mistral_7b_prompt_tuning_4bits_norm_nested_linear_final


{'google/gemma-2b': {'gemma_2b_qlora_4bits_norm_nested_linear_final': {'avg_score': 0.1878396775566969,
   'accuracy': 0.0,
   'throughput': 169.6252416735759},
  'gemma_2b_adalora_4bits_norm_nested_linear_final': {'avg_score': 0.21333175247932473,
   'accuracy': 0.0,
   'throughput': 135.01750960657802},
  'gemma_2b_ia3_4bits_norm_nested_linear_final': {'avg_score': 0.2125162886653766,
   'accuracy': 0.0,
   'throughput': 180.27046684573975},
  'gemma_2b_prompt_tuning_4bits_norm_nested_linear_final': {'avg_score': 0.04490500005618935,
   'accuracy': 0.0,
   'throughput': 190.5249001316942}},
 'google/gemma-7b': {'gemma_7b_qlora_4bits_norm_nested_final': {'avg_score': 0.5059944987122345,
   'accuracy': 13.714285714285715,
   'throughput': 59.367752578065854},
  'gemma_7b_prompt_tuning_4bits_norm_nested_linear_final': {'avg_score': 0.013714285714285714,
   'accuracy': 1.3714285714285714,
   'throughput': 0.0}},
 'meta-llama/Llama-2-7b-hf': {'llama2_7b_adalora_4bits_norm_nested_linear_fi

In [33]:
# Write the collected `metrics` dict as JSON to Experiments/metrics/peft_metrics.json.
# The directory variable is named `metrics_dir` because the former name `dir`
# shadowed the builtin dir() for the rest of the session.
metrics_dir = f'{base_path}/Experiments/metrics'
os.makedirs(metrics_dir, exist_ok=True)
# Explicit encoding, matching how the metrics files are read elsewhere in
# this notebook.
with open(f'{metrics_dir}/peft_metrics.json', "w", encoding='utf-8') as f:
    json.dump(metrics, f)

In [123]:
# Load every '*_final_metrics.json' file found one level below
# `local_models_path` into `data_dict`, keyed by file name.
data_dict = {}
for run_name in os.listdir(local_models_path):
    run_path = f"{local_models_path}/{run_name}"
    # Skip stray files next to the run folders; os.listdir() on a non-directory
    # would raise NotADirectoryError.
    if not os.path.isdir(run_path):
        continue
    for file_name in os.listdir(run_path):
        if '_final_metrics.json' in file_name:
            with open(f"{run_path}/{file_name}", 'r', encoding='utf-8') as fp:
                # Parse the whole file in one step. The previous version parsed
                # only the first line, which breaks on pretty-printed JSON.
                data_dict[file_name] = json.load(fp)
data_dict.keys()

dict_keys(['gemma_7b_qlora_4bits_norm_nested_final_metrics.json', 'llama2_7b_qlora_4bits_norm_nested_final_metrics.json', 'mistral_7b_qlora_4bits_norm_nested_final_metrics.json', 'gemma_2b_adalora_4bits_norm_nested_linear_final_metrics.json', 'gemma_2b_ia3_4bits_norm_nested_linear_final_metrics.json', 'gemma_2b_prompt_tuning_4bits_norm_nested_linear_final_metrics.json', 'gemma_2b_qlora_4bits_norm_nested_linear_final_metrics.json', 'gemma_7b_prompt_tuning_4bits_norm_nested_linear_final_metrics.json', 'llama2_7b_adalora_4bits_norm_nested_linear_final_metrics.json', 'llama2_7b_ia3_4bits_norm_nested_linear_final_metrics.json', 'llama2_7b_prompt_tuning_4bits_norm_nested_linear_final_metrics.json', 'llama2_7b_qlora_4bits_norm_nested_linear_final_metrics.json', 'mistral_7b_adalora_4bits_norm_nested_linear_final_metrics.json', 'mistral_7b_ia3_4bits_norm_nested_linear_final_metrics.json', 'mistral_7b_prompt_tuning_4bits_norm_nested_linear_final_metrics.json', 'mistral_7b_qlora_4bits_norm_nested

In [125]:
# Display the parsed training logs for inspection.
data_dict

{'gemma_7b_qlora_4bits_norm_nested_final_metrics.json': [{'loss': 2.7495,
   'grad_norm': 16.782007217407227,
   'learning_rate': 1.7647058823529414e-05,
   'epoch': 0.0010210749877471001,
   'step': 50},
  {'loss': 2.2949,
   'grad_norm': 12.598627090454102,
   'learning_rate': 1.1764705882352942e-05,
   'epoch': 0.0020421499754942002,
   'step': 100},
  {'loss': 2.2661,
   'grad_norm': 14.374794006347656,
   'learning_rate': 5.882352941176471e-06,
   'epoch': 0.0030632249632413003,
   'step': 150},
  {'loss': 2.2124,
   'grad_norm': 13.702934265136719,
   'learning_rate': 0.0,
   'epoch': 0.0040842999509884004,
   'step': 200},
  {'train_runtime': 1797.4708,
   'train_samples_per_second': 0.89,
   'train_steps_per_second': 0.111,
   'total_flos': 2.588837616554803e+16,
   'train_loss': 2.38071720123291,
   'epoch': 0.0040842999509884004,
   'step': 200}],
 'llama2_7b_qlora_4bits_norm_nested_final_metrics.json': [{'loss': 2.1258,
   'grad_norm': 2.347311496734619,
   'learning_rate': 

In [122]:
# Merge each run's training loss, and the perplexity exp(loss) derived from it,
# into `metrics`, keyed by base model id and method name.
for file_name, log_entries in data_dict.items():
    # Drop the trailing '_metrics.json' part: 'xxx_final_metrics.json' -> 'xxx_final'.
    method = '_'.join(file_name.split('_')[:-1])
    print(method)

    # Infer the base model id from the method name.
    model = None
    if 'gemma' in method:
        if '7b' in method:
            model = 'google/gemma-7b'
        elif '2b' in method:
            model = 'google/gemma-2b'
    elif 'llama2' in method:
        model = 'meta-llama/Llama-2-7b-hf'
    elif 'mistral' in method:
        model = 'mistralai/Mistral-7B-v0.1'
    if model is None:
        # Without this check an unrecognised name would silently be filed under
        # the model of the previous iteration (or raise NameError on the first).
        raise ValueError(f"Cannot infer base model from metrics file name: {file_name!r}")

    # setdefault keeps values already stored for this method (avg_score,
    # accuracy, throughput); assigning a fresh {} here would wipe them.
    entry = metrics[model].setdefault(method, {})
    # NOTE(review): log_entries[0] is the FIRST logged entry (step 50 in the
    # displayed logs), not the last one — confirm this is the intended loss.
    first_loss = log_entries[0]['loss']
    entry['loss'] = first_loss
    entry['perplexity'] = np.exp(first_loss)

gemma_7b_qlora_4bits_norm_nested_final
llama2_7b_qlora_4bits_norm_nested_final
mistral_7b_qlora_4bits_norm_nested_final
gemma_2b_adalora_4bits_norm_nested_linear_final
gemma_2b_ia3_4bits_norm_nested_linear_final
gemma_2b_prompt_tuning_4bits_norm_nested_linear_final
gemma_2b_qlora_4bits_norm_nested_linear_final
gemma_7b_prompt_tuning_4bits_norm_nested_linear_final
llama2_7b_adalora_4bits_norm_nested_linear_final
llama2_7b_ia3_4bits_norm_nested_linear_final
llama2_7b_prompt_tuning_4bits_norm_nested_linear_final
llama2_7b_qlora_4bits_norm_nested_linear_final
mistral_7b_adalora_4bits_norm_nested_linear_final
mistral_7b_ia3_4bits_norm_nested_linear_final
mistral_7b_prompt_tuning_4bits_norm_nested_linear_final
mistral_7b_qlora_4bits_norm_nested_linear_final


In [126]:
# Display the accumulated metrics for inspection.
metrics

{'google/gemma-2b': {'gemma_2b_adalora_4bits_norm_nested_linear_final': {'loss': 6.4589,
   'perplexity': 638.3584758975394},
  'gemma_2b_ia3_4bits_norm_nested_linear_final': {'loss': 3.3529,
   'perplexity': 28.585511541272083},
  'gemma_2b_prompt_tuning_4bits_norm_nested_linear_final': {'loss': 3.5746,
   'perplexity': 35.68034582251858},
  'gemma_2b_qlora_4bits_norm_nested_linear_final': {'loss': 3.2044,
   'perplexity': 24.640711153548914}},
 'google/gemma-7b': {'gemma_7b_qlora_4bits_norm_nested_final': {'loss': 2.7495,
   'perplexity': 15.634812523249213},
  'gemma_7b_prompt_tuning_4bits_norm_nested_linear_final': {'loss': 19.3197,
   'perplexity': 245719205.27891803}},
 'meta-llama/Llama-2-7b-hf': {'llama2_7b_qlora_4bits_norm_nested_final': {'loss': 2.1258,
   'perplexity': 8.379598486159592},
  'llama2_7b_adalora_4bits_norm_nested_linear_final': {'loss': 4.3354,
   'perplexity': 76.35549498013454},
  'llama2_7b_ia3_4bits_norm_nested_linear_final': {'loss': 2.2229,
   'perplexity

In [124]:
# Display the parsed training logs for inspection.
data_dict

{'gemma_7b_qlora_4bits_norm_nested_final_metrics.json': [{'loss': 2.7495,
   'grad_norm': 16.782007217407227,
   'learning_rate': 1.7647058823529414e-05,
   'epoch': 0.0010210749877471001,
   'step': 50},
  {'loss': 2.2949,
   'grad_norm': 12.598627090454102,
   'learning_rate': 1.1764705882352942e-05,
   'epoch': 0.0020421499754942002,
   'step': 100},
  {'loss': 2.2661,
   'grad_norm': 14.374794006347656,
   'learning_rate': 5.882352941176471e-06,
   'epoch': 0.0030632249632413003,
   'step': 150},
  {'loss': 2.2124,
   'grad_norm': 13.702934265136719,
   'learning_rate': 0.0,
   'epoch': 0.0040842999509884004,
   'step': 200},
  {'train_runtime': 1797.4708,
   'train_samples_per_second': 0.89,
   'train_steps_per_second': 0.111,
   'total_flos': 2.588837616554803e+16,
   'train_loss': 2.38071720123291,
   'epoch': 0.0040842999509884004,
   'step': 200}],
 'llama2_7b_qlora_4bits_norm_nested_final_metrics.json': [{'loss': 2.1258,
   'grad_norm': 2.347311496734619,
   'learning_rate': 