In [1]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import PeftConfig, PeftModel
from datasets import load_dataset, DatasetDict
from peft import prepare_model_for_kbit_training
import os

In [2]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub using the token stored in the
# HUGGINGFACE_TOKEN environment variable (a missing variable raises KeyError).
# add_to_git_credential=True additionally hands the token to the configured
# git credential helper.
login(token=os.environ['HUGGINGFACE_TOKEN'], add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/biniyam_ajaw/.cache/huggingface/token
Login successful


In [3]:
# 4-bit loading with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# 4-bit loading with the 'nf4' quantization type.
nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4')

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base checkpoint that will be quantized and fine-tuned.
model_id = "NousResearch/Llama-2-7b-hf"
#model_id = "meta-llama/Llama-2-7b-chat-hf"

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model with the config defined in this cell. A different config
# object used to be passed here, which left bnb_config (and its double-quant /
# compute-dtype settings) silently unused.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
# Hub repo id of the custom Amharic tokenizer. It gets its own name so that
# `tokenizer` only ever refers to the loaded tokenizer object, never a string.
tokenizer_id = "BiniyamAjaw/amharic_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Smoke test: show how an unspaced Amharic sentence is split into tokens.
tokenizer.tokenize("ኢትዮጵያውብሀገርነች።አስደናቂተራሮችአሉት")

['ኢትዮጵያ', 'ውብ', 'ሀገር', 'ነች', '።', 'አስደናቂ', 'ተራሮች', 'አሉት']

In [None]:
# from os import path
# dataset_dict = load_dataset('text', data_files='/home/biniyam_ajaw/finetuning/merged/merged.txt')

# dataset = dataset_dict['train']

# train_test = dataset.train_test_split(test_size=0.2)
# dataset = train_test


# context_len = 1024
# def tokenize(element):
#     return tokenizer(
#         element['text'],
#         truncation=True,
#         padding = True,
#         max_length = context_len,
#         return_overflowing_tokens=True
#     )
    
# tokenized_datasets = dataset.map(
#     tokenize, batched=True, remove_columns=dataset['train'].column_names
# )
# tokenized_datasets

In [6]:
from datasets import load_dataset

# Upper bound on tokens per example. Without truncation a single long document
# produces an arbitrarily long sequence (memory blow-up, or exceeding the
# model's context window). 1024 matches the context length used in the earlier
# commented-out tokenization attempt.
MAX_SEQ_LEN = 1024


def tokenize_batch(samples):
    """Tokenize a batch of rows, truncating each text to MAX_SEQ_LEN tokens."""
    return tokenizer(samples["text"], truncation=True, max_length=MAX_SEQ_LEN)


raw_data = load_dataset("BiniyamAjaw/amharic_dataset_v2")
# `data` keeps the original columns and gains the tokenizer outputs.
data = raw_data.map(tokenize_batch, batched=True)

In [7]:
# Resize token embeddings

# Make the embedding matrix match the custom tokenizer's vocabulary size
# (len(tokenizer)), padded to a multiple of 64.
model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   # NOTE(review): the original note cited phi2's configuration_phi.py, but model_id is a Llama-2 checkpoint — confirm 64 is the intended multiple
# Keep the model config's end-of-sequence id in sync with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id

In [8]:
# Label value skipped by the loss; -100 is the default `ignore_index` of
# torch.nn.CrossEntropyLoss.
IGNORE_INDEX = -100


def collate(elements, pad_token_id=None):
    """Pad a list of tokenized samples to a common length and stack them.

    Args:
        elements: non-empty list of dict-like rows. Each row must have
            "input_ids"; "labels" defaults to a copy of "input_ids" and
            "attention_mask" defaults to all ones when absent.
        pad_token_id: id used to pad "input_ids". Defaults to the notebook's
            global ``tokenizer.pad_token_id``.

    Returns:
        dict with "input_ids", "labels" and "attention_mask" tensors, each of
        shape (len(elements), longest input_ids length). Padded label
        positions hold IGNORE_INDEX and padded mask positions hold 0.

    Raises:
        ValueError: if ``elements`` is empty, or padding is required but no
            pad token id is available.

    The input rows are never modified; padding is applied to copies so the
    function can be called repeatedly on the same rows.
    """
    if not elements:
        raise ValueError("collate() needs at least one element")
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Every sample is padded on the right up to the longest one in the batch.
    tokens_maxlen = max(len(e["input_ids"]) for e in elements)

    batch_input_ids, batch_labels, batch_attention = [], [], []
    for sample in elements:
        input_ids = list(sample["input_ids"])
        labels = list(sample["labels"]) if "labels" in sample else list(input_ids)
        if "attention_mask" in sample:
            attention_mask = list(sample["attention_mask"])
        else:
            attention_mask = [1] * len(input_ids)

        pad_len = tokens_maxlen - len(input_ids)
        if pad_len and pad_token_id is None:
            raise ValueError("padding is required but no pad token id is set")

        batch_input_ids.append(input_ids + pad_len * [pad_token_id])
        batch_labels.append(labels + pad_len * [IGNORE_INDEX])
        batch_attention.append(attention_mask + pad_len * [0])

    return {
        "input_ids": torch.tensor(batch_input_ids),
        "labels": torch.tensor(batch_labels),
        "attention_mask": torch.tensor(batch_attention),
    }

In [None]:
# Size of the training split. `data` is the tokenized dataset built above; the
# name previously used here is only defined in a commented-out cell, so it
# raised NameError on a fresh kernel.
# Note: len() of a split counts examples (rows), not individual tokens.
num_tokens_train = len(data['train'])
num_tokens_train

In [11]:
from peft import prepare_model_for_kbit_training
# Turn on gradient checkpointing, then pass the model through PEFT's k-bit
# preparation helper; `model` is rebound to the helper's return value.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Walks ``model.named_parameters()`` and prints the trainable element count,
    the total element count, and the trainable share as a percentage.
    """
    # (element count, requires_grad) for every parameter tensor.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

: 

In [21]:
from peft import LoraConfig, get_peft_model

# Configure LoRA settings for fine-tuning
# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down); biases are left untouched (bias="none").
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save = ["lm_head", "embed_tokens"],   # because we added new tokens
    task_type="CAUSAL_LM"
)

# Add adapters to model
# NOTE(review): prepare_model_for_kbit_training was already applied to `model`
# in an earlier cell; this second call (with gradient checkpointing setup
# disabled) looks redundant — confirm before removing.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing = False)
model = get_peft_model(model, lora_config)
# Turn the generation KV cache off for the training phase.
model.config.use_cache = False

In [22]:
# Resize token embeddings
# NOTE(review): this repeats the resize done in an earlier cell, now on the
# PEFT-wrapped model; presumably a no-op when the size is unchanged — confirm.

model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   # NOTE(review): the original note cited phi2's configuration_phi.py, but model_id is a Llama-2 checkpoint — confirm 64 is the intended multiple
# Keep the model config's end-of-sequence id in sync with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id

In [23]:
import transformers

# The Llama tokenizer needs a pad token for batching; reuse EOS (</s>).
tokenizer.pad_token = tokenizer.eos_token

# Short smoke-test run: 10 optimizer steps, each accumulating 4 micro-batches
# of one sequence per device.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=10,
    warmup_steps=2,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# mlm=False makes the collator build causal-LM batches rather than masked-LM ones.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=causal_lm_collator,
)

In [24]:
# The KV cache only helps generation and is incompatible with the gradient
# checkpointing enabled earlier, so it must stay off while training. It is
# switched back on in the inference cell further down.
model.config.use_cache = False
trainer.train()

  from IPython.utils import traitlets as _traitlets
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

: 

In [None]:
# steps_per_epoch=len(tokenized_datasets["train"])//(bs*ga_steps)

# args = TrainingArguments(
    # output_dir="out",
    # per_device_train_batch_size=bs,
    # per_device_eval_batch_size=16,
    # evaluation_strategy="steps",
    # logging_steps=1,
    # eval_steps=steps_per_epoch//2,      # eval twice per epoch
    # save_steps=steps_per_epoch,         # save once per epoch
    # gradient_accumulation_steps=ga_steps,
    # num_train_epochs=epochs,
    # lr_scheduler_type="constant",
    # optim="paged_adamw_32bit",      # val_loss will go NaN with paged_adamw_8bit
    # learning_rate=lr,
    # group_by_length=False,
    # bf16=True,
    # ddp_find_unused_parameters=False,
# )

In [None]:
# trainer = Trainer(
#     model=model,
#     tokenizer=tokenizer,
#     args=args,
#     data_collator=collate,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
# )



In [None]:
from transformers import DataCollatorForLanguageModeling
# NOTE(review): assumes '[PAD]' exists in the tokenizer's vocabulary — confirm,
# otherwise it has no dedicated id.
tokenizer.pad_token = '[PAD]'
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Sanity-check the collator on a few training rows. Uses `data` (the tokenized
# dataset built above) and keeps only the tokenizer outputs, because the raw
# "text" column cannot be padded into a tensor.
model_input_columns = ("input_ids", "attention_mask")
num_preview_rows = min(5, len(data["train"]))
preview_rows = [
    {column: data["train"][i][column] for column in model_input_columns}
    for i in range(num_preview_rows)
]
out = data_collator(preview_rows)
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
# Batch geometry, defined once so steps_per_epoch and TrainingArguments agree
# (both names were previously referenced without being defined).
bs = 1         # sequences per device per forward/backward pass
ga_steps = 4   # micro-batches accumulated per optimizer step

# Optimizer steps needed for one pass over the training split of `data`.
steps_per_epoch = len(data["train"]) // (bs * ga_steps)
args = TrainingArguments(
        per_device_train_batch_size=bs,
        gradient_accumulation_steps=ga_steps,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
)

In [None]:
# Build the trainer on `data`, the tokenized dataset created above (the name
# previously used here only exists in a commented-out cell).
# The eval split is optional: .get() yields None when the dataset has no
# "test" split instead of raising KeyError.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=data["train"],
    eval_dataset=data.get("test"),
)

In [None]:
# Keep the generation KV cache off while training; it is incompatible with
# gradient checkpointing and may have been left enabled by an earlier cell.
model.config.use_cache = False
trainer.train()

In [None]:

from transformers import TextStreamer
# Switch to inference: re-enable the generation cache and put the model in
# evaluation mode.
model.config.use_cache = True
model.eval()

In [None]:
# Define a stream *without* function calling capabilities
def stream(user_prompt, system_prompt=None, max_new_tokens=1024):
    """Generate a reply to ``user_prompt`` and stream it to stdout.

    The prompt is wrapped in the ``[INST] <<SYS>> ... <</SYS>> ... [/INST]``
    template and fed to the notebook's global ``model`` / ``tokenizer``.

    Args:
        user_prompt: the user's message; surrounding whitespace is stripped.
        system_prompt: system instruction placed inside the <<SYS>> block.
            Defaults to the Amharic-assistant instruction.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        None. The generated text is printed by the TextStreamer as it is
        produced.
    """
    if system_prompt is None:
        system_prompt = 'You are a helpful assistant that provides accurate and concise responses in the Amharic Language'

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    # Put the inputs wherever the model's weights live instead of hard-coding
    # "cuda:0", so this also works on CPU or on a different GPU index.
    device = next(model.parameters()).device
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [None]:
# The Hub repo names are derived from the final path component of the base
# checkpoint id (everything after the last "/").
base_model_name = model_id.rsplit("/", 1)[-1]

# Target repos: one for the LoRA adapters alone, one for the merged model.
adapter_model = f"BiniyamAjaw/{base_model_name}-fine-tuned-adapters"
new_model = f"BiniyamAjaw/{base_model_name}-fine-tuned"

In [None]:
# Save the adapter weights to a local directory named after the Hub repo.
# The upload is left to the explicit push below; passing push arguments here as
# well was redundant.
model.save_pretrained(adapter_model)

# Push the model to the hub (single upload)
model.push_to_hub(adapter_model, use_auth_token=True)

In [None]:

# Reload the un-quantized base model on CPU in float16 so the adapters can be
# merged into real weights. The default Hugging Face cache is used (the
# cache_dir name passed here before was never defined).
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16)

# The adapters were trained against embeddings resized to the custom tokenizer,
# and embed_tokens / lm_head are stored with them. Apply the same resize here
# so the saved weights fit the freshly loaded base model.
model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)

In [None]:
from peft import PeftModel

# Load the PEFT model: wrap the base model with the fine-tuned adapters
# identified by `adapter_model`.
model = PeftModel.from_pretrained(
    model,
    adapter_model,
)

In [None]:
# Fold the adapter weights into the base model; `model` is rebound to the
# merged result.
model = model.merge_and_unload() # merge adapters with the base model.

In [None]:
# Upload the merged model to the Hub repo named by `new_model`, splitting the
# checkpoint into shards of at most 5GB.
model.push_to_hub(new_model, use_auth_token=True, max_shard_size="5GB")