In [1]:
# Import necessary libraries
import mlflow
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer
import torch
import gc
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
#print(os.environ["PYTORCH_CUDA_ALLOC_CONF"])
project_id = os.environ['DOMINO_PROJECT_ID']
new_model = "llama-2-7b-chat-guanaco"

#Increment the version number each time you retrain it
prefix = f"{project_id}/version0"
model_folder_prefix = f"{project_id}/model/version0/"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Delete folder if it exists
! rm -rm prefix

rm: invalid option -- 'm'
Try 'rm --help' for more information.


In [2]:
per_device_train_batch_size=8 #8 for prod
max_steps=-1 # -1 for prod

In [3]:
#Force garbage collection
gc.collect()

#For PyTorch memory management add the following code

#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"



# Define model, dataset, and new model name
base_model = "NousResearch/Llama-2-7b-chat-hf"
guanaco_dataset = "mlabonne/guanaco-llama2-1k"


# Load dataset
dataset = load_dataset(guanaco_dataset, split="train")

Downloading readme: 100%|██████████| 1.02k/1.02k [00:00<00:00, 3.10MB/s]
Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/967k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 967k/967k [00:00<00:00, 4.61MB/s][A
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  4.67it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1222.83it/s]
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 99130.34 examples/s]


In [4]:
# 4-bit Quantization Configuration
#for dev
#compute_dtype = getattr(torch, "float32")
#for Prod
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(load_in_4bit=True,
                                  bnb_4bit_quant_type="nf4",
                                  bnb_4bit_compute_dtype=compute_dtype,
                                  bnb_4bit_use_double_quant=False)

# Load model with 4-bit precision
model = AutoModelForCausalLM.from_pretrained(base_model,
                                             cache_dir=f"/artifacts/mlflow/{prefix}/llama2-model-cache/",
                                             quantization_config=quant_config,
                                             device_map="auto")
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:44<00:00, 22.36s/it]


In [5]:


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, 
                                          cache_dir=f"/artifacts/mlflow/{prefix}/llama2-model-cache/",
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Set PEFT Parameters
peft_params = LoraConfig(lora_alpha=16,
                         lora_dropout=0.1,
                         r=64, bias="none",
                         task_type="CAUSAL_LM")


# Define training parameters
training_params = TrainingArguments(output_dir=f"/artifacts/mlflow/{prefix}/results",
                                    num_train_epochs=1,
                                    per_device_train_batch_size=per_device_train_batch_size,
                                    gradient_accumulation_steps=1,
                                    optim="paged_adamw_32bit",
                                    #optim="lion_8bit",
                                    save_steps=25,
                                    logging_steps=25,
                                    learning_rate=2e-4,
                                    weight_decay=0.001,
                                    fp16=False,
                                    bf16=False,
                                    max_grad_norm=0.3,
                                    max_steps=max_steps,
                                    warmup_ratio=0.03,
                                    group_by_length=True,
                                    lr_scheduler_type="constant",
                                    report_to=None)


# Initialize the trainer
trainer = SFTTrainer(model=model,
                     train_dataset=dataset,
                     peft_config=peft_params,
                     dataset_text_field="text",
                     max_seq_length=None,
                     tokenizer=tokenizer,
                     args=training_params,
                     packing=False)



Map: 100%|██████████| 1000/1000 [00:00<00:00, 3899.47 examples/s]


In [6]:
#https://discuss.pytorch.org/t/how-does-reserved-in-total-by-pytorch-work/70172/33
# Train the model
#os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING']='1'
#os.environ['PYTORCH_CUDA_ALLOC_CONF']='expandable_segments:True'
#Force clean the pytorch cache
torch.cuda.empty_cache()
gc.collect()

experiment_name = f'llama2-7b-4bit-lora-sft-{project_id}'
exp = mlflow.set_experiment(experiment_name)
print("Fine-tuning model:")
with mlflow.start_run() as run:
    trainer.train()
    # Save the model and tokenizer
    trainer.model.save_pretrained(f"/artifacts/mlflow/{prefix}/{new_model}")
    trainer.tokenizer.save_pretrained(f"/artifacts/mlflow/{prefix}/{new_model}")

# Test the model
logging.set_verbosity(logging.CRITICAL)
prompt = "Who is Leonardo Da Vinci?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Fine-tuning model:
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.4653
50,1.4288
75,1.3069
100,1.3422
125,1.3942




<s>[INST] Who is Leonardo Da Vinci? [/INST] Leonardo da Vinci (1452-1519) was an Italian polymath, artist, inventor, and scientist. He is widely considered one of the greatest painters of all time, and his inventions and designs were centuries ahead of his time. He is known for his famous works such as the Mona Lisa, The Last Supper, and Vitruvian Man. He also made significant contributions to engineering, anatomy, and mathematics. Da Vinci was a true Renaissance man, and his legacy continues to inspire and influence people around the world.


In [8]:
#Force garbage collection; kill the kernel and run the first cell and then this cell
torch.cuda.empty_cache()
gc.collect()

# Reload model in FP16 and merge it with LoRA weights
from peft import LoraConfig, PeftModel

model_name = "NousResearch/Llama-2-7b-chat-hf"

base_model = AutoModelForCausalLM.from_pretrained(model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    cache_dir=f"/artifacts/mlflow/{prefix}/llama2-model-cache/",
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, f"/artifacts/mlflow/{prefix}/{new_model}")
model = model.merge_and_unload()

output_merged_dir = f"/artifacts/mlflow/{prefix}/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir)

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.86s/it]


('/artifacts/mlflow/673b4f99011f5130a7ff0019/version0/final_merged_checkpoint/tokenizer_config.json',
 '/artifacts/mlflow/673b4f99011f5130a7ff0019/version0/final_merged_checkpoint/special_tokens_map.json',
 '/artifacts/mlflow/673b4f99011f5130a7ff0019/version0/final_merged_checkpoint/tokenizer.json')

In [None]:
#import shutil,os

#deployment_dir = f'/artifacts/mlflow/{model_folder_prefix}/final_merged_checkpoint'
#shutil.rmtree(deployment_dir)
#shutil.copytree(src=output_merged_dir,dst=deployment_dir)

In [7]:
f"/artifacts/mlflow/{prefix}/{new_model}"

'/artifacts/mlflow/673b4f99011f5130a7ff0019/version0/llama-2-7b-chat-guanaco'

In [10]:
torch.cuda.empty_cache()
gc.collect()

0