In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, GenerationConfig
from datasets import load_dataset
import accelerate
import torch
import csv
import time
import os
import json
import inspect

from transformers import logging

In [2]:
access_tk = os.getenv("HF_API_TOKEN")

In [3]:
model_list = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "google/gemma-2-2b-it",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "meta-llama/Llama-3.1-8B-Instruct",
]

In [18]:
def get_model_metadata(model_name : str, device_str: str = "auto"):
    print(f"###########\nPROCESSING MODEL {model_name}\n")
    padding_side='left'
    model = AutoModelForCausalLM.from_pretrained(model_name, token=access_tk, device_map=device_str, 
                                                 torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_tk, device_map=device_str, 
                                              torch_dtype=torch.float16, padding_side=padding_side)
    generation_config = GenerationConfig.from_pretrained(model_name, token=access_tk)

    ret_dict = {'generation_strategy': {'do_sample': generation_config.do_sample, 
                                        'num_beams': generation_config.num_beams},
                'outpit_logit_config': {'top_k': generation_config.top_k, 
                                        'top_p': generation_config.top_p, 
                                        'min-p': generation_config.min_p,
                                        'temperature': generation_config.temperature},
                'input_context_length': model.config.max_position_embeddings,
               }
    del model
    del tokenizer
    
    return ret_dict

In [19]:
models_raw_metainfo = {name: get_model_metadata(name) for name in model_list}

###########
PROCESSING MODEL meta-llama/Llama-3.2-1B-Instruct

###########
PROCESSING MODEL meta-llama/Llama-3.2-3B-Instruct



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

###########
PROCESSING MODEL google/gemma-2-2b-it



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

###########
PROCESSING MODEL mistralai/Mistral-7B-Instruct-v0.1



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

###########
PROCESSING MODEL meta-llama/Llama-3.1-8B-Instruct



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
models_raw_metainfo

{'meta-llama/Llama-3.2-1B-Instruct': {'generation_strategy': {'do_sample': True,
   'num_beams': 1},
  'outpit_logit_config': {'top_k': 50,
   'top_p': 0.9,
   'min-p': None,
   'temperature': 0.6},
  'input_context_length': 131072},
 'meta-llama/Llama-3.2-3B-Instruct': {'generation_strategy': {'do_sample': True,
   'num_beams': 1},
  'outpit_logit_config': {'top_k': 50,
   'top_p': 0.9,
   'min-p': None,
   'temperature': 0.6},
  'input_context_length': 131072},
 'google/gemma-2-2b-it': {'generation_strategy': {'do_sample': False,
   'num_beams': 1},
  'outpit_logit_config': {'top_k': 50,
   'top_p': 1.0,
   'min-p': None,
   'temperature': 1.0},
  'input_context_length': 8192},
 'mistralai/Mistral-7B-Instruct-v0.1': {'generation_strategy': {'do_sample': False,
   'num_beams': 1},
  'outpit_logit_config': {'top_k': 50,
   'top_p': 1.0,
   'min-p': None,
   'temperature': 1.0},
  'input_context_length': 32768},
 'meta-llama/Llama-3.1-8B-Instruct': {'generation_strategy': {'do_sample': 

In [7]:
dst_file = "../data/models_raw_metadata.json"

In [9]:
with open(dst_file, 'w') as f:
    json.dump(models_raw_metainfo, f)

In [21]:
torch.cuda.device_count()

2

In [22]:
torch.cuda.current_device()

0

In [23]:
torch.cuda.mem_get_info()

(15110963200, 25425608704)

In [17]:
torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]

10314645504

In [13]:
torch.cuda.memory_reserved()

7218397184

In [16]:
torch.cuda.memory_allocated()

0

In [None]:
sum([torch.cuda.memory_reserved(i) - torch])