In [3]:
#!pip -qq install huggingface_hub
#!pip -qq install -U bitsandbytes
#!pip -qq install sentencepiece
#!pip -qq install -U transformers@git+https://github.com/huggingface/transformers.git
#!pip -qq install -U peft@git+https://github.com/huggingface/peft.git
#!pip -qq install -U accelerate@git+https://github.com/huggingface/accelerate.git

from huggingface_hub import login
login(token='YOUR_HUGGINGFACE_TOKEN')

!nvidia-smi

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Tue Dec  5 15:58:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:0F:00.0 Off |                    0 |
| N/A   36C    P0    69W / 400W |  43025MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import torch, gc
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from tqdm import tqdm
import numpy as np
import random
import os

# Run Python's garbage collector first, then ask PyTorch to release its cached
# CUDA memory -- presumably so a re-run in the same kernel starts with as much
# free GPU memory as possible (intent not stated in the notebook).
gc.collect()
torch.cuda.empty_cache()

2023-12-05 15:58:36.068150: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Target device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

# Hub identifier of the base checkpoint, and the short tag used as the prefix
# of every local output directory / file written by this notebook.
BASE_MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"
SHORT_MODEL_NAME = "llama2_13B_chat"

In [4]:
# Fixed seed so the 80/20 train/test partition is identical on every run;
# without it the evaluation set changes each time the cell is executed and
# eval metrics are not comparable across runs.
SPLIT_SEED = 42

dataset = load_dataset('hotal/emergency_classification_prompt')
# The hub dataset is read through its 'train' split here; carve a held-out
# test split (20%) out of it.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 20791
    })
    test: Dataset({
        features: ['text'],
        num_rows: 5198
    })
})

In [5]:
# Quantisation settings: load the weights in 4-bit NF4 with double
# quantisation. `bnb_4bit_compute_dtype` is the dtype used for computation,
# not the storage dtype of the quantised weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# LoRA adapter settings for causal-LM fine-tuning. Adapters are attached to
# every projection listed in `target_modules`; no bias terms are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        'o_proj',
        'up_proj',
        'gate_proj',
        'down_proj',
        'k_proj',
        'q_proj',
        'v_proj',
    ],
    #modules_to_save=["embed_tokens", "lm_head"], # To train special tokens as well
)

In [None]:
%%time

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME, 
    torch_dtype=torch.bfloat16, ##
    quantization_config=bnb_config,
    use_cache = False,
    device_map='auto',
    max_memory = {i: '76GB' for i in range(torch.cuda.device_count())}, ## For A100 GPUs
    pretraining_tp = 1 
)
base_model = prepare_model_for_kbit_training(base_model)

base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=False)
base_tokenizer.eos_token = '</s>' # Fix eos issue of llama2
base_tokenizer.pad_token = '<pad>'
base_tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

In [None]:
# TRAIN
# Hyperparameters for the supervised fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- outputs, logging, evaluation ---
    output_dir=f"{SHORT_MODEL_NAME}_sft",
    report_to="tensorboard",
    save_total_limit=3,
    evaluation_strategy="epoch",
    # --- batching ---
    auto_find_batch_size=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    group_by_length=True,
    # --- optimisation schedule ---
    num_train_epochs=5,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim="paged_adamw_32bit",
    max_grad_norm=0.3,
    # --- precision / memory ---
    bf16=True,
    gradient_checkpointing=True,
)

# Wire the quantised base model, the LoRA config and both dataset splits into
# the SFT trainer; examples are read from the 'text' column without packing.
trainer = SFTTrainer(
    model=base_model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=base_tokenizer,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=4096,
    packing=False,
)

trainer.train()

In [None]:
# Save losses to a file
# The trainer's full log history is dumped as its Python repr. The file name is
# built without reusing the f-string's own quote character inside the
# replacement field (a SyntaxError before Python 3.12), and the `with` block
# guarantees the handle is closed even if the write fails.
losses_path = f"losses_{SHORT_MODEL_NAME}_merged.txt"
with open(losses_path, "w") as losses_file:
    losses_file.write(str(trainer.state.log_history))

In [None]:
# SAVE SFT
# Persist the trainer's model into the run's output directory ...
trainer.save_model(f"{SHORT_MODEL_NAME}_sft")

# ... and write the trained model together with the tokenizer into a 'final'
# sub-directory, which is the checkpoint the merge step loads from.
trainer.model.save_pretrained(f"{SHORT_MODEL_NAME}_sft/final")
base_tokenizer.save_pretrained(f"{SHORT_MODEL_NAME}_sft/final")

In [None]:
# MERGE ADAPTER
torch.cuda.empty_cache()

# Reload the saved adapter checkpoint on the CPU in bfloat16, then merge the
# adapter into the underlying model and drop the PEFT wrapper.
model = AutoPeftModelForCausalLM.from_pretrained(
    f"{SHORT_MODEL_NAME}_sft/final",
    torch_dtype=torch.bfloat16,
    device_map="cpu",
).merge_and_unload()

# The tokenizer saved next to the adapter travels with the merged model.
tokenizer = AutoTokenizer.from_pretrained(f"{SHORT_MODEL_NAME}_sft/final")

# Write the merged weights (safetensors format) and the tokenizer side by side.
model.save_pretrained(f"{SHORT_MODEL_NAME}_merged", safe_serialization=True)
tokenizer.save_pretrained(f"{SHORT_MODEL_NAME}_merged")

In [None]:
# Completion marker: reaching this cell means every step above ran through.
print("DONE!")