In [2]:
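# Smoke-test notebook: checks that all three models (Gemma-7b, Llama-2-7b, Mistral-7B) can be loaded with the quantization configs and PEFT methods below and wired into a trainer, ready to be trained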

import os
import json
import torch
import logging 
import pandas as pd

from collections import defaultdict
from datasets import Dataset
import accelerate
import bitsandbytes

from peft import LoraConfig, LoraModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from trl import SFTTrainer
from quantization import CONFIG_4BITS, CONFIG_4BITS_NESTED, CONFIG_4BITS_NORM, CONFIG_8BITS, CONFIG_4BITS_NORM_NESTED
from run_utils import *


In [3]:
# Root folder of the tokenized splits, laid out as <root>/<Model>/<split>.json.
# The default is the original hard-coded location; export TOKENIZED_DATA_DIR to
# run this notebook on another machine without editing the cell.
TOKENIZED_DATA_DIR = os.environ.get(
    "TOKENIZED_DATA_DIR",
    "/home/andrusha/Desktop/DL Research/Efficient-LLM-Benchmark/UnifiedQA Data Curation/tokenized",
)


def _tokenized_split_path(model_dir, split):
    """Return the path `<TOKENIZED_DATA_DIR>/<model_dir>/<split>.json`.

    Args:
        model_dir: Sub-folder name of the model family ("Gemma", "Llama", "Mistral").
        split: Split name without extension ("train", "dev", "test").
    """
    return os.path.join(TOKENIZED_DATA_DIR, model_dir, f"{split}.json")


# One train/dev/test triple per model family; each file is read through
# load_tokenized_dataset (not defined in this notebook).
gemma_train_dataset = load_tokenized_dataset(_tokenized_split_path("Gemma", "train"))
gemma_dev_dataset = load_tokenized_dataset(_tokenized_split_path("Gemma", "dev"))
gemma_test_dataset = load_tokenized_dataset(_tokenized_split_path("Gemma", "test"))

llama_train_dataset = load_tokenized_dataset(_tokenized_split_path("Llama", "train"))
llama_dev_dataset = load_tokenized_dataset(_tokenized_split_path("Llama", "dev"))
llama_test_dataset = load_tokenized_dataset(_tokenized_split_path("Llama", "test"))

mistral_train_dataset = load_tokenized_dataset(_tokenized_split_path("Mistral", "train"))
mistral_dev_dataset = load_tokenized_dataset(_tokenized_split_path("Mistral", "dev"))
mistral_test_dataset = load_tokenized_dataset(_tokenized_split_path("Mistral", "test"))

In [4]:
# Testing all 4-bit quantization configs for Gemma-7b.
# Each config is loaded in turn and the resulting model is handed to
# del_model_of_gpu before the next load (presumably to free GPU memory -- confirm
# against the helper's definition).
# NOTE(review): CONFIG_8BITS is imported at the top of the notebook but is not exercised here.
gemma_bnb_configs = (
    CONFIG_4BITS_NORM_NESTED,
    CONFIG_4BITS_NORM,
    CONFIG_4BITS_NESTED,
    CONFIG_4BITS,
)
for gemma_bnb_config in gemma_bnb_configs:
    gemma_model, gemma_tokenizer = load_model(
        base_model="google/gemma-7b",
        bnb_config=gemma_bnb_config,
        on_gpu=True,
        use_cache=False,
        pretraining_tp=1,
    )
    del_model_of_gpu(gemma_model)

in here


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
# Testing all 4-bit quantization configs for Llama-2-7b.
# Each config is loaded in turn and the resulting model is handed to
# del_model_of_gpu before the next load (presumably to free GPU memory -- confirm
# against the helper's definition).
# NOTE(review): the original load lines carried a trailing "# Andrey" tag
# (presumably the author/owner of this cell -- confirm).
llama2_bnb_configs = (
    CONFIG_4BITS_NORM_NESTED,
    CONFIG_4BITS_NORM,
    CONFIG_4BITS_NESTED,
    CONFIG_4BITS,
)
for llama2_bnb_config in llama2_bnb_configs:
    llama2, llama2_tokenizer = load_model(
        base_model="meta-llama/Llama-2-7b-hf",
        bnb_config=llama2_bnb_config,
        on_gpu=True,
        use_cache=False,
        pretraining_tp=1,
    )
    del_model_of_gpu(llama2)

In [None]:
# Testing all 4-bit quantization configs for Mistral-7B.
# Each config is loaded in turn and the resulting model is handed to
# del_model_of_gpu before the next load (presumably to free GPU memory -- confirm
# against the helper's definition).
# NOTE(review): the original load lines carried a trailing "# Andrey" tag
# (presumably the author/owner of this cell -- confirm).
mistral_bnb_configs = (
    CONFIG_4BITS_NORM_NESTED,
    CONFIG_4BITS_NORM,
    CONFIG_4BITS_NESTED,
    CONFIG_4BITS,
)
for mistral_bnb_config in mistral_bnb_configs:
    mistral_model, mistral_tokenizer = load_model(
        base_model="mistralai/Mistral-7B-v0.1",
        bnb_config=mistral_bnb_config,
        on_gpu=True,
        use_cache=False,
        pretraining_tp=1,
    )
    del_model_of_gpu(mistral_model)

In [None]:
# Testing QLoRA load for Gemma-7b, Llama-2 & Mistral (if this works, LoRA works, as LoRA is the unquantized variant)
#
# Every section follows the same steps: load the quantized base model, wrap it
# with prepare_peft_model, build a trainer with the shared PEFT config, then hand
# both the base and the wrapped model to del_model_of_gpu before the next load.

peftConfig = prepare_lora_config()

# --- Gemma-7b ---
gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
trainer = setup_trainer(peft_model, gemma_tokenizer, peftConfig)
del_model_of_gpu(gemma_model)
del_model_of_gpu(peft_model)


# --- Llama-2-7b ---
llama2, llama2_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
trainer = setup_trainer(peft_model, llama2_tokenizer, peftConfig)
del_model_of_gpu(llama2)
del_model_of_gpu(peft_model)


# --- Mistral-7B ---
# Fix: this section used to pass llama2 / llama2_tokenizer (copy-paste slip), i.e.
# a model already handed to del_model_of_gpu above, so Mistral was never wrapped
# or wired into a trainer. It now uses the Mistral model and tokenizer.
mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
trainer = setup_trainer(peft_model, mistral_tokenizer, peftConfig)
del_model_of_gpu(mistral_model)
del_model_of_gpu(peft_model)

In [None]:
# Testing IA3 loads for Gemma-7b, Llama-2, & Mistral
#
# Every section follows the same steps: load the quantized base model, wrap it
# with prepare_peft_model, build a trainer with the shared PEFT config, then hand
# both the base and the wrapped model to del_model_of_gpu before the next load.

peftConfig = prepare_ia3_config()

# --- Gemma-7b ---
gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
trainer = setup_trainer(peft_model, gemma_tokenizer, peftConfig)
del_model_of_gpu(gemma_model)
del_model_of_gpu(peft_model)


# --- Llama-2-7b ---
llama2, llama2_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
trainer = setup_trainer(peft_model, llama2_tokenizer, peftConfig)
del_model_of_gpu(llama2)
del_model_of_gpu(peft_model)


# --- Mistral-7B ---
# Fix: this section used to pass llama2 / llama2_tokenizer (copy-paste slip), i.e.
# a model already handed to del_model_of_gpu above, so Mistral was never wrapped
# or wired into a trainer. It now uses the Mistral model and tokenizer.
mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
trainer = setup_trainer(peft_model, mistral_tokenizer, peftConfig)
del_model_of_gpu(mistral_model)
del_model_of_gpu(peft_model)

In [None]:
# Testing AdaLoRA loads for Gemma-7b, Llama-2, & Mistral
#
# Every section follows the same steps: load the quantized base model, wrap it
# with prepare_peft_model, build a trainer with the shared PEFT config, then hand
# both the base and the wrapped model to del_model_of_gpu before the next load.

peftConfig = prepare_adalora_config()

# --- Gemma-7b ---
gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
trainer = setup_trainer(peft_model, gemma_tokenizer, peftConfig)
del_model_of_gpu(gemma_model)
del_model_of_gpu(peft_model)


# --- Llama-2-7b ---
llama2, llama2_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
trainer = setup_trainer(peft_model, llama2_tokenizer, peftConfig)
del_model_of_gpu(llama2)
del_model_of_gpu(peft_model)


# --- Mistral-7B ---
# Fix: this section used to pass llama2 / llama2_tokenizer (copy-paste slip), i.e.
# a model already handed to del_model_of_gpu above, so Mistral was never wrapped
# or wired into a trainer. It now uses the Mistral model and tokenizer.
mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
trainer = setup_trainer(peft_model, mistral_tokenizer, peftConfig)
del_model_of_gpu(mistral_model)
del_model_of_gpu(peft_model)

In [None]:
# Testing prompt-tuning loads for Gemma-7b, Llama-2, & Mistral
#
# Unlike the other PEFT cells, the config is rebuilt per model because
# prepare_prompt_tuning_config receives that model's tokenizer. Every section
# then wraps the base model with prepare_peft_model, builds a trainer, and hands
# both the base and the wrapped model to del_model_of_gpu before the next load.

# --- Gemma-7b ---
gemma_model, gemma_tokenizer = load_model(base_model="google/gemma-7b", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model=gemma_tokenizer)
peft_model = prepare_peft_model(gemma_model, gemma_tokenizer)
trainer = setup_trainer(peft_model, gemma_tokenizer, peftConfig)
del_model_of_gpu(gemma_model)
del_model_of_gpu(peft_model)


# --- Llama-2-7b ---
llama2, llama2_tokenizer = load_model(base_model="meta-llama/Llama-2-7b-hf", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)
peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model=llama2_tokenizer)
peft_model = prepare_peft_model(llama2, llama2_tokenizer)
trainer = setup_trainer(peft_model, llama2_tokenizer, peftConfig)
del_model_of_gpu(llama2)
del_model_of_gpu(peft_model)


# --- Mistral-7B ---
# Fix: the config below was already built from mistral_tokenizer, but the next two
# calls used llama2 / llama2_tokenizer (copy-paste slip), pairing a Mistral
# prompt-tuning config with a Llama-2 model that had already been handed to
# del_model_of_gpu. They now use the Mistral model and tokenizer.
mistral_model, mistral_tokenizer = load_model(base_model="mistralai/Mistral-7B-v0.1", bnb_config=CONFIG_4BITS_NORM_NESTED, on_gpu=True, use_cache=False, pretraining_tp=1)  # Andrey
peftConfig = prepare_prompt_tuning_config(prompt_tuning_init_task="Answer this question truthfully", num_virtual_tokens=20, tokenizer_model=mistral_tokenizer)
peft_model = prepare_peft_model(mistral_model, mistral_tokenizer)
trainer = setup_trainer(peft_model, mistral_tokenizer, peftConfig)
del_model_of_gpu(mistral_model)
del_model_of_gpu(peft_model)