In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM at a fixed mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install datasets
!pip install accelerate
!pip install bitsandbytes
!pip install peft
!pip install trl
!pip install transformers

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m276.5/542.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.

In [None]:
# File to test that all 3 models can be loaded with various PEFT Methods and be ready to be trained

import os
import gc
import json
import torch
import logging
import pandas as pd

from collections import defaultdict
from datasets import Dataset
import accelerate
import bitsandbytes

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, IA3Config, AdaLoraConfig, PromptEmbedding, PromptTuningConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from trl import SFTTrainer



In [None]:
# Quantization
# bitsandbytes presets that the cells below pass to load_model() as `bnb_config`.

# For QLORA: 4-bit weights with a bfloat16 compute dtype.
CONFIG_4BITS = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# For QLORA and GEMMA: 4-bit "nf4" quantization with a float16 compute dtype.
CONFIG_4BITS_NORM = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# For QLORA and GEMMA: same as CONFIG_4BITS_NORM, plus double (nested) quantization.
CONFIG_4BITS_NORM_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# For QLORA: 4-bit weights with double (nested) quantization, other options left at their defaults.
CONFIG_4BITS_NESTED = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

# 8-bit weights.
CONFIG_8BITS = BitsAndBytesConfig(load_in_8bit=True)


In [None]:
# Helper functions

def load_tokenized_dataset(file_path:str) -> Dataset:
    """Build a `Dataset` from a pre-tokenized JSON file.

    The JSON root is unpacked into exactly five values, in this order:
    ids, questions, answers, text and input ids. Each one becomes a column.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A `Dataset` with the columns 'id', 'questions', 'answers', 'text'
        and 'input_ids'.

    Raises:
        ValueError: If the JSON root does not unpack into exactly five values.
    """
    with open(file_path, 'r') as json_file:
        ids, questions, answers, text, input_ids = json.load(json_file)

    columns = {
        'id': ids,
        'questions': questions,
        'answers': answers,
        'text': text,
        'input_ids': input_ids,
    }
    return Dataset.from_dict(columns)


def load_model(base_model: str, bnb_config:BitsAndBytesConfig=None, on_gpu:bool=False, use_cache:bool=False, pretraining_tp:int=1) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a causal language model together with its tokenizer.

    Args:
        base_model: Model identifier or path handed to `from_pretrained`.
        bnb_config: Optional quantization config. It is only applied when
            `on_gpu` is True; otherwise it is ignored and a warning is logged.
        on_gpu: When True, the model is loaded with `quantization_config=bnb_config`
            and `device_map={"": 0}`. When False, it is loaded with library defaults.
        use_cache: Value written to `model.config.use_cache`.
        pretraining_tp: Value written to `model.config.pretraining_tp`.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer's pad token is set to its
        EOS token and its padding side to "right".
    """
    if on_gpu:
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=bnb_config, device_map={"": 0})
        print(base_model)
    else:
        if bnb_config is not None:
            # Make the dropped argument visible instead of silently loading unquantized weights.
            logging.warning("load_model: bnb_config is ignored because on_gpu=False; loading %s without quantization.", base_model)
        base_model_loaded = AutoModelForCausalLM.from_pretrained(base_model)

    base_model_loaded.config.use_cache = use_cache
    base_model_loaded.config.pretraining_tp = pretraining_tp

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # NOTE(review): this overwrites any pad token the tokenizer already defines — confirm that is intended for every model.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return base_model_loaded, tokenizer

# for lora and qlora: https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms
def prepare_lora_config(r:int=8, lora_alpha:int = 8, lora_dropout:float=.05, bias='none', targets:str='linear', task_type:str='CAUSAL_LM'): # can also take attn
    """Build a `LoraConfig` for one of two predefined target-module sets.

    Args:
        r, lora_alpha, lora_dropout, bias, task_type: Forwarded unchanged to `LoraConfig`.
        targets: 'linear' selects the eight module names listed below
            (including `lm_head`); 'attn' selects only `q_proj` and `v_proj`.

    Returns:
        The configured `LoraConfig`.

    Raises:
        AssertionError: If `targets` is neither 'linear' nor 'attn'.
    """
    assert targets in ['linear', 'attn'], "Targets must be 'linear' or 'attn'."

    # Per the literature review, LoRA and QLoRA perform best when applied to the linear layers.
    modules_by_target = {
        'linear': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head'],
        'attn': ["q_proj", "v_proj"],
    }

    return LoraConfig(
        r=r,
        target_modules=modules_by_target[targets],
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias=bias,
        task_type=task_type,
    )


# for IA3: https://huggingface.co/docs/peft/en/package_reference/ia3
def prepare_ia3_config(r:int=8, targets:str='linear', feedforward_modules=None, task_type:str='CAUSAL_LM'): # can also take attn
    """Build an `IA3Config` for one of two predefined target-module sets.

    Args:
        r: Accepted for signature parity with the other config builders; it is
            not used by this function.
        targets: 'linear' selects the eight module names listed below;
            'attn' selects only `q_proj` and `v_proj`.
        feedforward_modules: Forwarded unchanged to `IA3Config`.
        task_type: Forwarded unchanged to `IA3Config`.

    Returns:
        The configured `IA3Config`.

    Raises:
        AssertionError: If `targets` is neither 'linear' nor 'attn'.
    """
    assert targets in ['linear', 'attn'], "Targets must be 'linear' or 'attn'."

    wants_all_linear = targets == 'linear'
    target_modules = (
        ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head']
        if wants_all_linear
        else ["q_proj", "v_proj"]
    )

    ia3_config = IA3Config(
        peft_type="IA3",
        task_type=task_type,
        target_modules=target_modules,
        feedforward_modules=feedforward_modules,
    )
    return ia3_config


# for AdaLora: https://huggingface.co/docs/peft/en/package_reference/adalora
def prepare_adalora_config(r:int=8, lora_alpha:int = 8, lora_dropout:float=.05, bias='none', targets:str='linear', task_type:str='CAUSAL_LM'): # can also take attn
    """Build an `AdaLoraConfig` for one of two predefined target-module sets.

    Args:
        r, lora_alpha, lora_dropout, bias, task_type: Forwarded unchanged to `AdaLoraConfig`.
        targets: 'linear' selects the eight module names listed below;
            'attn' selects only `q_proj` and `v_proj`.

    Returns:
        The configured `AdaLoraConfig`.

    Raises:
        AssertionError: If `targets` is neither 'linear' nor 'attn'.
    """
    assert targets in ['linear', 'attn'], "Targets must be 'linear' or 'attn'."

    if targets == 'attn':
        target_modules = ["q_proj", "v_proj"]
    elif targets == 'linear':
        # Per the literature review, LoRA and QLoRA perform best when applied to the linear layers.
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj', 'lm_head']

    config_kwargs = dict(
        peft_type="ADALORA",
        task_type=task_type,
        r=r,
        target_modules=target_modules,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        bias=bias,
    )
    return AdaLoraConfig(**config_kwargs)


# https://huggingface.co/docs/peft/en/package_reference/prompt_tuning
# https://huggingface.co/docs/peft/main/en/task_guides/clm-prompt-tuning
def prepare_prompt_tuning_config(task_type:str='CAUSAL_LM', num_virtual_tokens:int = 8, prompt_tuning_init_task:str = None, tokenizer_model:str=None):
    """Build a `PromptTuningConfig`.

    Args:
        task_type: Forwarded to `PromptTuningConfig`.
        num_virtual_tokens: Number of virtual prompt tokens.
        prompt_tuning_init_task: Text used to initialise the virtual tokens.
            When it is None, the config uses "RANDOM" initialisation instead
            of "TEXT", because text initialisation has no text to work from.
        tokenizer_model: Tokenizer name or path (a string such as
            "google/gemma-7b"), forwarded as `tokenizer_name_or_path`.

    Returns:
        The configured `PromptTuningConfig`.
    """
    # "TEXT" initialisation is only meaningful when an init text was supplied.
    init_mode = "TEXT" if prompt_tuning_init_task is not None else "RANDOM"

    return PromptTuningConfig(
        task_type=task_type,
        prompt_tuning_init=init_mode,
        num_virtual_tokens=num_virtual_tokens,
        prompt_tuning_init_text=prompt_tuning_init_task,
        tokenizer_name_or_path=tokenizer_model,
    )


def prepare_peft_model(base_model:AutoModelForCausalLM, tokenizer:AutoTokenizer, use_cache:bool=False) -> AutoModelForCausalLM: # For LoRA and QLoRA. To run with QLoRA load model in 4bit quantization
    """Prepare a loaded (optionally quantized) model for k-bit training.

    Args:
        base_model: Model to prepare; it is passed to `prepare_model_for_kbit_training`.
        tokenizer: Tokenizer whose `pad_token_id` is copied into the model config.
        use_cache: Value written to `model.config.use_cache`.

    Returns:
        The object returned by `prepare_model_for_kbit_training`, with its config
        updated. No adapter is attached here; `setup_trainer` hands the PEFT
        config to the trainer separately.
    """
    peft_model = prepare_model_for_kbit_training(base_model)
    peft_model.config.pad_token_id = tokenizer.pad_token_id
    # Write the flag to the config, the same place load_model sets it; assigning it
    # on the model object itself only creates an attribute nothing reads.
    peft_model.config.use_cache = use_cache

    return peft_model


def del_model_off_gpu(model_on_cuda):
    '''
    Releases a model: drops this function's reference to it, runs the garbage
    collector and clears PyTorch's CUDA cache.

    Only the local reference is deleted here. Any name the caller still binds
    to the model keeps it alive until that name is rebound or deleted.
    '''
    del model_on_cuda
    gc.collect()
    torch.cuda.empty_cache()


# Callers in this notebook use the spelling `del_model_of_gpu`; keep it working
# so the cells do not raise NameError on a fresh kernel.
del_model_of_gpu = del_model_off_gpu


def setup_trainer(model, ds, tokenizer, peft_config, custom_args=None):
    """Create an `SFTTrainer` from default training arguments plus optional overrides.

    Args:
        model: Model handed to the trainer.
        ds: Mapping with a 'train' and a 'dev' entry (train and eval datasets).
        tokenizer: Tokenizer handed to the trainer.
        peft_config: PEFT configuration handed to the trainer.
        custom_args: Optional overrides merged on top of the default
            `TrainingArguments` keyword arguments; ignored when falsy.

    Returns:
        The constructed `SFTTrainer` (training is not started here).
    """
    training_kwargs = dict(
        output_dir="./results_qlora",
        evaluation_strategy="steps",
        do_eval=True,
        optim="paged_adamw_8bit",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=50,
        logging_steps=50,
        learning_rate=2e-5,
        eval_steps=50,
        max_steps=300,
        warmup_steps=30,
        lr_scheduler_type="linear",
    )
    if custom_args:
        # Caller-supplied values win over the defaults above.
        training_kwargs.update(custom_args)

    training_arguments = TrainingArguments(**training_kwargs)

    train_split = ds['train']
    eval_split = ds['dev']
    return SFTTrainer(
        model=model,
        train_dataset=train_split,
        eval_dataset=eval_split,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
    )


def speculative_decoding(model, assistant_model, inputs, tokenizer):
    """Generate with an assistant model and decode the result to text.

    Args:
        model: Object whose `generate` method is called.
        assistant_model: Passed to `generate` as `assistant_model`.
        inputs: Mapping expanded into `generate` as keyword arguments.
        tokenizer: Object whose `batch_decode` turns the outputs into text.

    Returns:
        The result of `tokenizer.batch_decode(..., skip_special_tokens=True)`.
    """
    generated = model.generate(**inputs, assistant_model=assistant_model)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded


def throughput(model, assistant_model, tokenizer, inputs, max_new_tokens=200, temperature=.5):
    """Time one `generate` call and print latency, throughput and the decoded text.

    Args:
        model: Object whose `generate` method is timed.
        assistant_model: Passed to `generate` as `assistant_model`.
        tokenizer: Object whose `decode` turns the first output sequence into text.
        inputs: Mapping expanded into `generate` as keyword arguments.
        max_new_tokens: Passed to `generate`.
        temperature: Passed to `generate`. NOTE(review): `generate` normally only
            applies a temperature when sampling is enabled — confirm whether
            sampling was intended here.

    Returns:
        None. Results are printed.
    """
    import time  # local import: the notebook's import cell does not import `time`

    start = time.perf_counter()
    response = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=max_new_tokens, temperature=temperature)
    latency = time.perf_counter() - start
    print(f"Latency: {latency} seconds")

    # For decoder-only models the returned sequence starts with the prompt, so the
    # prompt length is subtracted to count only newly generated tokens.
    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    prompt_tokens = 0
    if not is_encoder_decoder and "input_ids" in inputs:
        prompt_tokens = inputs["input_ids"].shape[-1]

    output_tokens = len(response[0]) - prompt_tokens
    # Guard against a zero reading from the timer.
    through_put = output_tokens / latency if latency > 0 else float("inf")
    print(f"Throughput: {through_put} tokens/second")

    text = tokenizer.decode(response[0])
    print(text)

In [None]:
from huggingface_hub import notebook_login
# Renders the Hugging Face login widget in the notebook; enter the access token
# in the widget rather than hard-coding it in a cell.
# NOTE(review): the cells below download google/gemma-7b and
# meta-llama/Llama-2-7b-hf, which presumably need an authenticated account with
# access granted — confirm.
notebook_login() # use your access token here!

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the pre-tokenized train/dev/test splits for each model family from Drive.
# Within each group the files are read in train, dev, test order.

# Gemma
gemma_train_dataset, gemma_dev_dataset, gemma_test_dataset = map(
    load_tokenized_dataset,
    (
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Gemma/train.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Gemma/dev.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Gemma/test.json",
    ),
)

# Llama
llama_train_dataset, llama_dev_dataset, llama_test_dataset = map(
    load_tokenized_dataset,
    (
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Llama/train.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Llama/dev.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Llama/test.json",
    ),
)

# Mistral
mistral_train_dataset, mistral_dev_dataset, mistral_test_dataset = map(
    load_tokenized_dataset,
    (
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Mistral/train.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Mistral/dev.json",
        "/content/drive/MyDrive/Efficient LLM Benchmarks/UnifiedQA Data Curation/tokenized/Mistral/test.json",
    ),
)

In [None]:
# Testing all Quantization for Gemma-7b
# Smoke test: load the model once per 4-bit preset, then release it before the next load.
for quant_config in (CONFIG_4BITS_NORM_NESTED, CONFIG_4BITS_NORM, CONFIG_4BITS_NESTED, CONFIG_4BITS):
    gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=quant_config, on_gpu=True, use_cache=False, pretraining_tp=1)
    # The cleanup helper is defined under the name `del_model_off_gpu`.
    del_model_off_gpu(gemma_model)

in here


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

google/gemma-7b


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Testing all Quantization for llama2
# Smoke test: load the model once per 4-bit preset, then release it before the next load.

for quant_config in (CONFIG_4BITS_NORM_NESTED, CONFIG_4BITS_NORM, CONFIG_4BITS_NESTED, CONFIG_4BITS):
    llama2, llama2_tokenizer= load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=quant_config, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
    # The cleanup helper is defined under the name `del_model_off_gpu`.
    del_model_off_gpu(llama2)

in here


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

meta-llama/Llama-2-7b-hf


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-2-7b-hf
in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-2-7b-hf
in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

meta-llama/Llama-2-7b-hf


In [None]:
# Testing all Quantization for Mistral
# Smoke test: load the model once per 4-bit preset, then release it before the next load.

for quant_config in (CONFIG_4BITS_NORM_NESTED, CONFIG_4BITS_NORM, CONFIG_4BITS_NESTED, CONFIG_4BITS):
    mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=quant_config, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
    # The cleanup helper is defined under the name `del_model_off_gpu`.
    del_model_off_gpu(mistral_model)

in here


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

mistralai/Mistral-7B-v0.1


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

mistralai/Mistral-7B-v0.1
in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

mistralai/Mistral-7B-v0.1
in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

mistralai/Mistral-7B-v0.1


In [None]:
# Testing QLoRA load for Gemma-7b, Llama-2 & Mistral (if this works, Lora works as Lora is unquantized variant)
# For each model: load it with the nested NF4 4-bit preset, prepare it for k-bit
# training, build an SFTTrainer with the LoRA config, then release the model.
# The cleanup helper is defined under the name `del_model_off_gpu`.

peftConfig = prepare_lora_config()

gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
ds = {'train':gemma_train_dataset, 'dev': gemma_dev_dataset}
trainer = setup_trainer(peft_model, ds, gemma_tokenizer, peftConfig)
del_model_off_gpu(gemma_model)
del_model_off_gpu(peft_model)

llama2, llama2_tokenizer= load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
ds = {'train':llama_train_dataset, 'dev': llama_dev_dataset}
trainer = setup_trainer(peft_model, ds, llama2_tokenizer, peftConfig)
del_model_off_gpu(llama2)
del_model_off_gpu(peft_model)


mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
ds = {'train':mistral_train_dataset, 'dev': mistral_dev_dataset}
trainer = setup_trainer(peft_model, ds, mistral_tokenizer, peftConfig)
del_model_off_gpu(mistral_model)
del_model_off_gpu(peft_model)


in here


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

meta-llama/Llama-2-7b-hf


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs


in here


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32000
}



model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Instantiating MistralForCausalLM model under default dtype torch.float16.
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



mistralai/Mistral-7B-v0.1


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer_config.json
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. Yo

In [None]:
# Testing IA3 loads for Gemma-7b, Llama-2, & Mistral
# For each model: load it with the nested NF4 4-bit preset, prepare it for k-bit
# training, build an SFTTrainer with the IA3 config, then release the model.
# The cleanup helper is defined under the name `del_model_off_gpu`.

peftConfig = prepare_ia3_config()

gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
ds = {'train':gemma_train_dataset, 'dev': gemma_dev_dataset}
trainer = setup_trainer(peft_model, ds, gemma_tokenizer, peftConfig)
del_model_off_gpu(gemma_model)
del_model_off_gpu(peft_model)

llama2, llama2_tokenizer= load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
ds = {'train':llama_train_dataset, 'dev': llama_dev_dataset}
trainer = setup_trainer(peft_model, ds, llama2_tokenizer, peftConfig)
del_model_off_gpu(llama2)
del_model_off_gpu(peft_model)


mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
ds = {'train':mistral_train_dataset, 'dev': mistral_dev_dataset}
trainer = setup_trainer(peft_model, ds, mistral_tokenizer, peftConfig)
del_model_off_gpu(mistral_model)
del_model_off_gpu(peft_model)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7

in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-

meta-llama/Llama-2-7b-hf


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,


in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.json
loading file added_toke

mistralai/Mistral-7B-v0.1


max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Testing Adalora loads for Gemma-7b, Llama-2, & Mistral
# For each model: load it with the nested NF4 4-bit preset, prepare it for k-bit
# training, build an SFTTrainer with the AdaLoRA config, then release the model.
# The cleanup helper is defined under the name `del_model_off_gpu`.

peftConfig = prepare_adalora_config()

gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
ds = {'train':gemma_train_dataset, 'dev': gemma_dev_dataset}
trainer = setup_trainer(peft_model, ds, gemma_tokenizer, peftConfig)
del_model_off_gpu(gemma_model)
del_model_off_gpu(peft_model)

llama2, llama2_tokenizer= load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
ds = {'train':llama_train_dataset, 'dev': llama_dev_dataset}
trainer = setup_trainer(peft_model, ds, llama2_tokenizer, peftConfig)
del_model_off_gpu(llama2)
del_model_off_gpu(peft_model)


mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
ds = {'train':mistral_train_dataset, 'dev': mistral_dev_dataset}
trainer = setup_trainer(peft_model, ds, mistral_tokenizer, peftConfig)
del_model_off_gpu(mistral_model)
del_model_off_gpu(peft_model)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7

in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-

meta-llama/Llama-2-7b-hf


The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/config.json
Model config MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,


in here


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at mistralai/Mistral-7B-v0.1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/tokenizer.json
loading file added_toke

mistralai/Mistral-7B-v0.1


The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Testing prompt-tuning loads for Gemma-7b, Llama-2, & Mistral
# For each model: load it with the nested NF4 4-bit preset, build a prompt-tuning
# config that names that model's tokenizer, prepare the model for k-bit training,
# build an SFTTrainer, then release the model.
# The cleanup helper is defined under the name `del_model_off_gpu`.

gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)

peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model="google/gemma-7b")
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
ds = {'train':gemma_train_dataset, 'dev': gemma_dev_dataset}
trainer = setup_trainer(peft_model, ds, gemma_tokenizer, peftConfig)
del_model_off_gpu(gemma_model)
del_model_off_gpu(peft_model)

llama2, llama2_tokenizer= load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model="meta-llama/Llama-2-7b-hf")
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
ds = {'train':llama_train_dataset, 'dev': llama_dev_dataset}
trainer = setup_trainer(peft_model, ds, llama2_tokenizer, peftConfig)
del_model_off_gpu(llama2)
del_model_off_gpu(peft_model)


mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1) # Andrey
peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model="mistralai/Mistral-7B-v0.1")
# Prepare and train the Mistral model with the Mistral tokenizer, matching the
# Mistral datasets and config used in this stanza.
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
ds = {'train':mistral_train_dataset, 'dev': mistral_dev_dataset}
trainer = setup_trainer(peft_model, ds, mistral_tokenizer, peftConfig)
del_model_off_gpu(mistral_model)
del_model_off_gpu(peft_model)

in here


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

google/gemma-7b
