In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from datasets import load_dataset
import os, sys
from huggingface_hub import notebook_login
import torch.nn as nn
import getpass
from trl import SFTTrainer
from peft import PeftConfig, LoraConfig

In [3]:
# Ask for the Hugging Face access token interactively so it is never written
# into the notebook source, then expose it through the environment.
os.environ.update({"HUGGING_FACE_HUB_TOKEN": getpass.getpass("Token:")})
# Fail fast when nothing was entered at the prompt.
assert os.environ.get("HUGGING_FACE_HUB_TOKEN")

### Quantization Config

In [5]:
# Three alternative 4-bit bitsandbytes configurations. Each one enables 4-bit
# loading and sets exactly one additional option.
# NOTE(review): only `nf4_config` is passed to `from_pretrained` in this
# notebook; the options are not combined into a single config — confirm
# that is intended.

# 4-bit weights with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 4-bit weights using the "nf4" quantization type.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)

# 4-bit weights with double quantization switched on.
double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

In [6]:
# Base checkpoint on the Hub, and the name under which the fine-tuned
# weights are saved later in the notebook.
model_id = "microsoft/phi-2"
new_model = "amharic-phi"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the base model quantized with the NF4 config and place it on the
# first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    device_map="cuda:0",
)

tokenizer_config.json: 100%|██████████| 7.34k/7.34k [00:00<00:00, 42.8MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 37.1MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 58.8MB/s]
tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 47.3MB/s]
added_tokens.json: 100%|██████████| 1.08k/1.08k [00:00<00:00, 9.93MB/s]
special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 919kB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
config.json: 100%|██████████| 866/866 [00:00<00:00, 8.45MB/s]
model.safetensors.index.json: 100%|██████████| 35.7k/35.7k [00:00<00:00, 112MB/s]
model-00001-of-00002.safetensors: 100%|██████████| 5.00G/5.00G [00:13<00:00, 379MB/s]
model-00002-of-00002.safetensors: 100%|██████████| 564M/564M [00:01<00:00, 383MB/s]
Downloading shards: 100%|██████████| 2/2 [00:14<00:00,  7.38s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
generation_

In [7]:
# Pad with the end-of-sequence token. No dedicated '[PAD]' token is added:
# doing so would grow the tokenizer's vocabulary without a matching
# `model.resize_token_embeddings(...)` call, and the pad token is set to EOS
# here anyway.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so real tokens stay at the start of each sequence.
tokenizer.padding_side = "right"

### Dataset Setup

I will try an arbitrary Amharic dataset from the Hugging Face (HF) Hub.

In [8]:
# `load_dataset` is already imported in the notebook's import cell, so it is
# not re-imported here.
# Hub identifier of the Amharic question-answering dataset; only the "train"
# split is loaded.
dataset_name = 'Henok/amharic-qa'
dataset = load_dataset(dataset_name, split="train")

In [9]:
import re
def get_num_layers(model):
    """Return the largest numeric index found in the model's parameter names.

    Only dot-separated components that consist purely of decimal digits are
    considered (e.g. the ``31`` in ``model.layers.31.mlp.fc1.weight``).
    Digits embedded in identifiers such as ``fc1`` or ``embed512`` are
    ignored so they cannot be mistaken for a layer index.

    Note that, despite the name, the result is the highest index seen, not a
    count: with zero-based indices it is one less than the number of layers.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        int: The largest numeric name component across all parameters.

    Raises:
        ValueError: If no parameter name contains a numeric component.
    """
    indices = [
        int(component)
        for name, _ in model.named_parameters()
        for component in name.split(".")
        # isdecimal() only accepts characters that int() can parse.
        if component.isdecimal()
    ]
    if not indices:
        raise ValueError(
            "no numeric layer index found in the model's parameter names"
        )
    return max(indices)

def get_last_layer_linears(model):
    """Collect the names of the ``torch.nn.Linear`` modules in the last layer.

    The last layer is identified by the index returned from
    ``get_num_layers(model)``. A module belongs to it when that index appears
    as a whole dot-separated component of the module name, so an index of
    ``1`` matches ``layers.1.mlp.fc2`` but not ``layers.0.mlp.fc1`` or
    ``layers.10.mlp.fc2``.

    Modules whose name contains ``"encoder"`` are skipped.

    Args:
        model: Object exposing ``named_modules()`` and ``named_parameters()``
            in the style of ``torch.nn.Module``.

    Returns:
        list[str]: Qualified names of the matching linear modules, in the
        order ``named_modules()`` yields them.
    """
    last_index = str(get_num_layers(model))
    return [
        name
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
        and "encoder" not in name
        # Compare whole components, not substrings of the name.
        and last_index in name.split(".")
    ]

In [10]:
# LoRA adapter settings for causal language modelling.
# `target_modules` is left unset here. To restrict the adapters to the linear
# layers of the last block, pass
# `target_modules=get_last_layer_linears(model)` (the loaded model object,
# not the `model_id` string).
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=32,
    lora_dropout=0.03,
    bias="none",
)

### Training Arguments

In [11]:
# Hyperparameters handed to the trainer, grouped by concern.
training_arguments = TrainingArguments(
    # --- output, checkpointing and logging ---
    output_dir="./results",
    save_steps=25,
    logging_steps=25,
    report_to="tensorboard",
    # --- run length ---
    # NOTE(review): the recorded run stopped at global_step=25 (epoch 0.05),
    # so `max_steps` appears to take precedence over `num_train_epochs`.
    num_train_epochs=2,
    max_steps=25,
    # --- batching ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimisation ---
    optim="paged_adamw_32bit",
    # NOTE(review): 2e-8 is several orders of magnitude below the learning
    # rates commonly used for LoRA fine-tuning — confirm it is intended
    # (e.g. not 2e-4).
    learning_rate=2e-8,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): presumably `warmup_ratio` has no effect with the
    # 'constant' schedule — verify against the scheduler documentation.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # --- precision and memory ---
    fp16=False,
    bf16=False,
    gradient_checkpointing=True,
)

### SFT Trainer

In [12]:
# Wire the quantized model, the dataset and the LoRA / training settings
# into a supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=config,
    train_dataset=dataset,
    # presumably 'inputs' is the dataset column holding the training text —
    # verify against the dataset schema
    dataset_text_field="inputs",
    max_seq_length=None,
    packing=False,
)

Map: 100%|██████████| 1831/1831 [00:01<00:00, 1656.97 examples/s]


In [13]:
# Run fine-tuning with the trainer built above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
25,1.5845


Checkpoint destination directory ./results/checkpoint-25 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=25, training_loss=1.5845008850097657, metrics={'train_runtime': 155.6188, 'train_samples_per_second': 0.643, 'train_steps_per_second': 0.161, 'total_flos': 1630125686784000.0, 'train_loss': 1.5845008850097657, 'epoch': 0.05})

In [15]:
# Save the trainer's model under the name stored in `new_model`.
# NOTE(review): since training used a PEFT config, this presumably writes
# only the LoRA adapter weights, not the full base model — confirm.
trainer.model.save_pretrained(new_model)

## Inference

In [None]:
# Only let CRITICAL messages from transformers through while generating.
logging.set_verbosity(logging.CRITICAL)

# Amharic sample question used as the generation prompt.
prompt = "የኢትዮጵያ ጂዲፒ ምን ያህል ነበር?"

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# NOTE(review): the "<s>[INST] ... [/INST]" wrapper looks like a Llama-2
# style instruction template — confirm it suits the phi-2 base model.
result = pipe("<s>[INST] {} [/INST]".format(prompt))
print(result[0]["generated_text"])

In [None]:
from transformers import PreTrainedTokenizerFast

# Build a fast tokenizer directly from a serialized `tokenizer.json`.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that file exists — consider a configurable location.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="/home/biniyam_ajaw/llama-2-amharic-3784m/tokenizer.json",
)

In [14]:
# Print how many token ids the phi-2 tokenizer produces for the Amharic
# sample question.
print(len(tokenizer.encode('የኢትዮጵያ ጂዲፒ ምን ያህል ነበር?')))

52
