# Fine tuning Llama-3.1-8B with uzinfocom/wikidata_articles_uz

## Installing the required libraries and packages

In [1]:
!pip install torch torchvision transformers datasets bitsandbytes peft



In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

## Configure torch with GPU

In [3]:
# Switch to GPU (cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Model Loading from Huggingface

In [4]:
model_name = "NousResearch/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Data Loading & Preprocessing

In [5]:
# Download the dataset and split it into training and test set

full_dataset = load_dataset("uzinfocom/wikidata_articles_uz")

# Select the first 15000 rows
dataset = full_dataset['train'].select(range(100))

train_test_split = dataset.train_test_split(test_size=0.2, shuffle=True)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [6]:
print(full_dataset, full_dataset.keys())

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 221845
    })
}) dict_keys(['train'])


In [7]:
# Let's have a look at our dataset and how a single data point looks like
print("Number of rows:", len(dataset))
print("Columns:", dataset.column_names)

Number of rows: 100
Columns: ['id', 'url', 'title', 'text']


In [8]:
# Tokenize the dataset
def prompt_builder(row):
    return {"prompt": f'Title: {row["title"]}\nContent: {row["text"]}'}

def tokenize_function(examples):
    return tokenizer(examples["prompt"], truncation=True, max_length=300)

# Apply the prompt builder function
train_dataset = train_dataset.map(prompt_builder)
test_dataset = test_dataset.map(prompt_builder)

print("Train dataset example:", train_dataset[0])
print("Test dataset example:", test_dataset[0])

# Apply the tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Convert datasets to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(['id', 'url', 'title', 'text'])
test_dataset = test_dataset.remove_columns(['id', 'url', 'title', 'text'])

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Train dataset example: {'id': 1577, 'url': 'https://uz.wikipedia.org/wiki/Xorazm%20viloyati', 'title': 'Xorazm viloyati', 'text': 'Xorazm viloyati\xa0— Oʻzbekiston Respublikasi tarkibidagi viloyat. 1925-yil fevraldan 1938-yil yanvargacha Xorazm okrugi, 1938-yil 15-yanvarda viloyat maqomiga oʻtkazilgan. Umumiy maydoni\xa0— 6,05 ming kvadrat kilometr.\n\nEtimologiyasi \nXorazm soʻzi birinchi marta zardushtiylarning muqaddas kitobi „Avesto“da tilga olingan. Qadimgi xorazm tilidan tarjima qilingan Xorazm „pastlikdagi yer“, „oziq-ovqatli yer“ degan maʼnoni bildirgan degan fikrlar ham bor.\n\nTarixi \nMiloddan avvalgi 8-asrda xorazmliklar tomonidan asos solingan Markaziy Osiyodagi eng qadimiy davlatlardan biri tashkil topdi. Bu yerda massagetlar, saklar, apasaklar, oslar, daxlar va boshqalar qabilalari ham yashagan.\n\n12-asrda Oʻrta Osiyoda qudratli Xorazmshohlar davlati tashkil topdi. U oʻzining eng gullab-yashnagan choʻqqisiga 13-asr boshlarida Alauddin Muhammad II davrida erishgan. Ammo 

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## Turn the model into LoRA model

In [9]:
from peft import prepare_model_for_kbit_training

peft_model = prepare_model_for_kbit_training(model)

peft_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [10]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    r=32, #the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters
    lora_alpha=32, #LoRA scaling factor
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    #target_modules='all-linear' # The modules (for example, attention blocks) to apply the LoRA update matrices.
)

lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=32, target_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [11]:
lora_model = get_peft_model(peft_model, lora_config)
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

In [12]:
lora_model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


In [13]:
import numpy as np
from transformers import DataCollatorWithPadding,DataCollatorForLanguageModeling, Trainer, TrainingArguments

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="Meta-Llama-3.1-8B-Instruct-finetuned",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=1,
        report_to="none"
    ),
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

  trainer = Trainer(


## Let's start the training!

In [14]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.5861,1.625495




TrainOutput(global_step=80, training_loss=1.755404146015644, metrics={'train_runtime': 101.2973, 'train_samples_per_second': 0.79, 'train_steps_per_second': 0.79, 'total_flos': 1012433735393280.0, 'train_loss': 1.755404146015644, 'epoch': 1.0})

## Generate the first response

In [None]:
from peft import PeftModel

# Load the fine-tuned model with the adapter attached
model_with_adapter = PeftModel.from_pretrained(model, "Meta-Llama-3.1-8B-Instruct-finetuned/checkpoint-80").to("cuda")
model_with_adapter.eval()
inputs = tokenizer("Biometriya ma'noni anglatadi", return_tensors="pt")

outputs = model_with_adapter.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=100)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
