In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from datasets import load_dataset
import os, sys
from huggingface_hub import notebook_login
import torch.nn as nn
import getpass
from trl import SFTTrainer
from peft import PeftConfig, LoraConfig

In [20]:
os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass.getpass("Token:")
assert os.environ["HUGGING_FACE_HUB_TOKEN"]

### Quantization Config

In [21]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
double_quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)

In [22]:
model_id = "microsoft/phi-2"
new_model = 'amharic-phi'
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cuda:0', quantization_config=nf4_config)

config.json: 100%|██████████| 863/863 [00:00<00:00, 4.40MB/s]
Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 28.17it/s]


Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.24s/it]
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 1.10MB/s]


In [17]:
tokenizer = AutoTokenizer.from_pretrained("dagim/amharic_tokenizer")
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

In [27]:
tokenizer.tokenize("ከአሜሪካ ወደ አዲስ አበባለመጓዝምንያህልጊዜይወስዳል??")

['ከአሜሪካ', 'ወደ', 'አዲስ', 'አበባ', 'ለመጓዝ', 'ምንያህል', 'ጊዜ', 'ይወስዳል', '??']

### Dataset Setup

I will try to use a random Amharic dataset from HF 

In [28]:
from datasets import load_dataset
dataset_name = 'Henok/amharic-qa'
dataset = load_dataset(dataset_name, split="train")

In [29]:
import re
def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [30]:
config = LoraConfig(
    r=4,
    lora_alpha=32,
    # target_modules=get_last_layer_linears(model_id),
    lora_dropout=0.03,
    bias='none',
    task_type="CAUSAL_LM"
)

### Training Arguments

In [31]:
#TrainingArguments
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim='paged_adamw_32bit',
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-8,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=25,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type='constant',
    report_to="tensorboard",
    gradient_checkpointing=True
)

### SFT Trainer

In [32]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=config,
    dataset_text_field='inputs',
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False
)

Map:   0%|          | 0/1831 [00:00<?, ? examples/s]

Map: 100%|██████████| 1831/1831 [00:00<00:00, 4509.68 examples/s]


In [33]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
/opt/conda/conda-bld/pytorch_1702400430266/work/aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [72,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1702400430266/work/aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [72,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1702400430266/work/aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [72,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1702400430266/work/aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [72,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1702400430266/work/aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block:

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [15]:
trainer.model.save_pretrained(new_model)

## Inference

In [None]:
logging.set_verbosity(logging.CRITICAL)

prompt = "የኢትዮጵያ ጂዲፒ ምን ያህል ነበር?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
from transformers import PreTrainedTokenizerFast
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/biniyam_ajaw/llama-2-amharic-3784m/tokenizer.json")

In [14]:
print(len(tokenizer.encode('የኢትዮጵያ ጂዲፒ ምን ያህል ነበር?')))

52
