In [1]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import PeftConfig, PeftModel
from datasets import load_dataset, DatasetDict
from peft import prepare_model_for_kbit_training
import os

In [2]:
from huggingface_hub import login
login(token=os.environ['HUGGINGFACE_TOKEN'], add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/biniyam_ajaw/.cache/huggingface/token
Login successful


In [3]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4')

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "NousResearch/Llama-2-7b-hf"
#model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [5]:
tokenizer = "BiniyamAjaw/amharic_tokenizer"
tokenizer = AutoTokenizer.from_pretrained(tokenizer)

tokenizer.tokenize("ኢትዮጵያውብሀገርነች።አስደናቂተራሮችአሉት")

['ኢትዮጵያ', 'ውብ', 'ሀገር', 'ነች', '።', 'አስደናቂ', 'ተራሮች', 'አሉት']

In [6]:
from datasets import load_dataset

data = load_dataset("BiniyamAjaw/amharic_dataset_v2")
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)

In [7]:
#Resize token embeddings

model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)
model.config.eos_token_id = tokenizer.eos_token_id

In [8]:
def collate(elements):

    # Extract input_ids from each element and find the maximum length among them
    tokens = [e["input_ids"] for e in elements]
    tokens_maxlen = max([len(t) for t in tokens])

    for i, sample in enumerate(elements):
        input_ids = sample["input_ids"]
        labels = sample["labels"]
        attention_mask = sample["attention_mask"]

        # Calculate the padding length required to match the maximum token length
        pad_len = tokens_maxlen-len(input_ids)

        # Pad 'input_ids' with the pad token ID, 'labels' with IGNORE_INDEX, and 'attention_mask' with 0
        input_ids.extend( pad_len * [tokenizer.pad_token_id] )
        labels.extend( pad_len * [IGNORE_INDEX] )
        attention_mask.extend( pad_len * [0] )

    # create and return batch with all the data in elements
    batch={
        "input_ids": torch.tensor( [e["input_ids"] for e in elements] ),
        "labels": torch.tensor( [e["labels"] for e in elements] ),
        "attention_mask": torch.tensor( [e["attention_mask"] for e in elements] ),
    }
    return batch

In [9]:
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
embedding_size= model.get_input_embeddings().weight.shape[0]

In [11]:
print(f"Embedding size: {embedding_size}")

Embedding size: 100032


In [12]:
if len(tokenizer) != embedding_size:
    print('Embedding size mismatch. Resizing the token embeddings...')
    model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding size mismatch. Resizing the token embeddings...


In [13]:
print(f"Embedding size: {embedding_size}")

Embedding size: 100032


In [14]:
from peft import LoraConfig, get_peft_model

# Configure LoRA settings for fine-tuning
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    modules_to_save = ["lm_head", "embed_tokens"],   # because we added new tokens
    task_type="CAUSAL_LM"
)

enable_profiler = False

# Add adapters to model
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing = True)
model = get_peft_model(model, lora_config)
model.config.use_cache = False

In [15]:
from contextlib import nullcontext
from transformers import TrainerCallback
if enable_profiler:
        wait, warmup, active, repeat = 1, 1, 2, 1
        total_steps = (wait + warmup + active) * (1 + repeat)
        schedule =  torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat)
        profiler = torch.profiler.profile(
            schedule=schedule,
            on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{OUTPUT_DIR}/logs/tensorboard"),
            record_shapes=True,
            profile_memory=True,
            with_stack=True)

        class ProfilerCallback(TrainerCallback):
            def __init__(self, profiler):
                self.profiler = profiler

            def on_step_end(self, *args, **kwargs):
                self.profiler.step()

        profiler_callback = ProfilerCallback(profiler)
else:
    profiler = nullcontext()

In [16]:
config = {
        'lora_config': lora_config,
        'learning_rate': 1e-4,
        'num_train_epochs': 1,
        'gradient_accumulation_steps': 1,
        'per_device_train_batch_size': 2,
        'gradient_checkpointing': True,
}

In [19]:
    # Define training args
training_args = TrainingArguments(
        output_dir='outputs',
        bf16=True,  # Use BF16 if available
        # logging strategies
        logging_dir="outputs/logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=1,
        warmup_ratio=0.03,
        optim="paged_adamw_8bit",
        max_steps=total_steps if enable_profiler else -1,
        **{k:v for k,v in config.items() if k != 'lora_config'}
    )

In [None]:
#Resize token embeddings

model.resize_token_embeddings(
    new_num_tokens=len(tokenizer),
    pad_to_multiple_of=64)   
model.config.eos_token_id = tokenizer.eos_token_id

In [21]:
import transformers
# needed for Llama tokenizer
tokenizer.pad_token = tokenizer.eos_token # </s>
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=40,
        learning_rate=2e-4,
        eval_steps=25,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        report_to="tensorboard",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = True
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,16.7831
2,16.7771
3,18.1036
4,17.6547
5,17.1421
6,16.1425
7,15.6847
8,15.81
9,15.4374
10,14.6501


TrainOutput(global_step=40, training_loss=11.738054633140564, metrics={'train_runtime': 110.1525, 'train_samples_per_second': 1.453, 'train_steps_per_second': 0.363, 'total_flos': 352019939819520.0, 'train_loss': 11.738054633140564, 'epoch': 0.0})

In [22]:

from transformers import TextStreamer
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(100000, 4096)
          (modules_to_save): ModuleDict(
            (default): Embedding(100000, 4096)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): Pa

In [23]:

def stream(user_prompt):
    runtimeFlag = "cuda:0"
    system_prompt = 'You are a helpful assistant that provides accurate and concise responses in the Amharic Language'

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=1024)

In [24]:
# Extract the last portion of the base_model
base_model_name = model_id.split("/")[-1]

# Define the save and push paths
adapter_model = f"BiniyamAjaw/{base_model_name}-fine-tuned-adapters" 
new_model = f"BiniyamAjaw/{base_model_name}-fine-tuned" 

In [27]:
stream('Hello can you recommend me good places in ethiopia')



ValueError: The following `model_kwargs` are not used by the model: ['token_type_ids'] (note: typos in the generate arguments will also show up in this list)

In [28]:
# Save the model
login(token='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', add_to_git_credential=True)
model.save_pretrained(adapter_model, push_to_hub=True, use_auth_token=True)

# Push the model to the hub
model.push_to_hub(adapter_model, use_auth_token=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/biniyam_ajaw/.cache/huggingface/token
Login successful






HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-65bba6df-28dd21c3182740845c3c76bf;b3b5e470-8753-42da-a3d2-2c16a73f4a7e)

Invalid username or password.

In [None]:

model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16, cache_dir=cache_dir)

In [None]:
from peft import PeftModel

# load perf model with new adapters
model = PeftModel.from_pretrained(
    model,
    adapter_model,
)

In [None]:
model = model.merge_and_unload() # merge adapters with the base model.

In [None]:
model.push_to_hub(new_model, use_auth_token=True, max_shard_size="5GB")