<a href="https://colab.research.google.com/github/Ayesha9811/Streamlit-Dataset-Creator-Sinhala/blob/main/Copy_of_Fine_Tuning_DeepSeek_R1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up

In [34]:
%%capture
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [35]:
from unsloth import FastLanguageModel
import torch

# Define configurations for loading the model
max_seq_length = 2048
dtype = None  # Automatically choose the best data type (float16, bfloat16, etc.)
load_in_4bit = True  # Enable 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)


==((====))==  Unsloth 2025.3.1: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [36]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank (controls low-rank approximation quality)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Layers to apply LoRA
    lora_alpha=16, # Scaling factor for LoRA weights
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)

In [37]:
from datasets import load_dataset  # Load datasets from Hugging Face Hub

# Load a dataset
dataset = load_dataset("AyeshaKalpani98/Sinhala-QnA-Generate", split="train")

In [38]:
from unsloth.chat_templates import standardize_sharegpt

# Convert dataset format from ShareGPT format to Hugging Face's standardized ("role", "content") structure
dataset = [{"conversations": entry["messages"]} for entry in dataset if "messages" in entry]



In [39]:
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What's the capital of France?"}
{"from": "gpt", "value": "The capital of France is Paris."}

{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What's the capital of France?"}
{"role": "assistant", "content": "The capital of France is Paris."}

{'role': 'assistant', 'content': 'The capital of France is Paris.'}

In [40]:
!pip install datasets




In [41]:
# Extract the actual list from the nested structure
#dataset = Dataset.from_dict({"conversations": [entry["conversations"] for entry in dataset]})


In [42]:
print(dataset[0])  # Should print a list of conversation dictionaries


{'conversations': [{'role': 'system', 'content': 'මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   '}, {'role': 'user', 'content': 'නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?'}, {'role': 'assistant', 'content': 'වොෂින්ටන් ඩී.සී'}]}


In [43]:
#print(type(dataset))
#print(dataset.column_names)
#print(dataset[0])


In [44]:
def formatting_prompts_func(examples):
    texts = []
    for convo in examples:
        formatted_text = tokenizer.apply_chat_template(convo["conversations"], tokenize=False, add_generation_prompt=False)
        texts.append(formatted_text)
    return {"text": texts}



In [45]:
from datasets import Dataset
from unsloth.chat_templates import get_chat_template

# Ensure `dataset` is a list of dictionaries before converting
if isinstance(dataset, list):
    dataset = Dataset.from_list(dataset)  # ✅ Convert list of dicts to Hugging Face Dataset

# Apply the Llama-3.1 chat template to the tokenizer
tokenizer = get_chat_template(
    tokenizer,  # Ensure `tokenizer` is defined before this step
    chat_template="llama-3.1",  # The chat template format
)

# Function to format the conversation data into tokenized text
def formatting_prompts_func(examples):
    texts = []
    for convos in examples["conversations"]:  # ✅ Iterate over a batch
        text = tokenizer.apply_chat_template(convos, tokenize=False, add_generation_prompt=False)
        texts.append(text)
    return {"text": texts}

# ✅ Apply `.map()` on the Hugging Face Dataset
dataset = dataset.map(formatting_prompts_func, batched=True)



Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [46]:
# Print an item in its original conversation format
print(dataset[0]["conversations"])

# Print the same item in its formatted text format
print(dataset[0]["text"])

[{'content': 'මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   ', 'role': 'system'}, {'content': 'නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?', 'role': 'user'}, {'content': 'වොෂින්ටන් ඩී.සී', 'role': 'assistant'}]
<｜begin▁of▁sentence｜><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   <|eot_id|><|start_header_id|>user<|end_header_id|>

නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

වොෂින්ටන් ඩී.සී<|eot_id|>


In [47]:
print(type(tokenizer))

<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


In [48]:
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Initialize the tokenizer properly
tokenizer = AutoTokenizer.from_pretrained("unsloth/DeepSeek-R1-Distill-Llama-8B")

# Preprocess the dataset
def preprocess_function(examples):
    examples["text"] = [
        f"User: {msg[0]['content']}\nAssistant: {msg[1]['content']}"
        for msg in examples["conversations"]
    ]
    return examples

dataset = dataset.map(preprocess_function, batched=True)

# Define training configurations
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,

    args=TrainingArguments(
        per_device_train_batch_size=2,  # Number of examples per GPU batch
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches before updating model
        warmup_steps=5,  # Number of warmup steps for learning rate schedule
        max_steps=60,  # Limit training steps to 60 (for quick testing)
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,  # Log training metrics after every step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",  # Linear decay of learning rate
        seed=3407,
        output_dir="outputs",  # Directory to save model checkpoints
        report_to="none",  # Use this for WandB etc
    ),
)

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/103 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/103 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/103 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/103 [00:00<?, ? examples/s]

In [49]:
print(dataset[0])

{'conversations': [{'content': 'මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   ', 'role': 'system'}, {'content': 'නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?', 'role': 'user'}, {'content': 'වොෂින්ටන් ඩී.සී', 'role': 'assistant'}], 'text': 'User: මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   \nAssistant: නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?'}


In [50]:
def preprocess_function(examples):
    formatted_text = []
    for conversation_list in examples["conversations"]:  # Iterate over the batch
        conversation_text = []
        for conversation in conversation_list:  # Iterate over each conversation in the example
            if conversation["role"] == "user":
                conversation_text.append(f"<|start_header_id|>user<|end_header_id|>\n\n{conversation['content']}")
            elif conversation["role"] == "assistant":
                conversation_text.append(f"<|start_header_id|>assistant<|end_header_id|>\n\n{conversation['content']}")
            elif conversation["role"] == "system":
                conversation_text.append(f"<|start_header_id|>system<|end_header_id|>\n\n{conversation['content']}")
        formatted_text.append("\n".join(conversation_text))  # Join conversations for this example
    examples["text"] = formatted_text  # Add the formatted text to the batch
    return examples

In [51]:
dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [52]:
print(dataset[0]["text"])

<|start_header_id|>system<|end_header_id|>

මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   
<|start_header_id|>user<|end_header_id|>

නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?
<|start_header_id|>assistant<|end_header_id|>

වොෂින්ටන් ඩී.සී


In [53]:
print("Raw Example Data:", dataset[0]["text"])
tokenized_data = tokenizer(dataset[0]["text"], return_tensors="pt")
print("Token IDs:", tokenized_data["input_ids"])


Raw Example Data: <|start_header_id|>system<|end_header_id|>

මගේ නම නවෝදි. මම ඔබට කෙසේද සහය වෙන්නේ?   
<|start_header_id|>user<|end_header_id|>

නාසා මූලස්ථානය පිහිටා ඇත්තේ කිනම් නගරයකද?
<|start_header_id|>assistant<|end_header_id|>

වොෂින්ටන් ඩී.සී
Token IDs: tensor([[128000, 128006,   9125, 128007,    271,  55742,    116,  55742,    250,
          49849,    248,  29082,    114,    109,  55742,    116,  29082,    114,
            109,  49849,    222,  49849,    251,  55742,    107,  49849,    240,
             13,  29082,    114,    116,  55742,    116,  29082,    114,    242,
          55742,    114,  55742,    100,  29082,  90183,  49849,    247,  49849,
            225,  49849,    248,  55742,    107,  29082,    115,    225,  49849,
            226,  55742,    118,  29082,    115,    222,  49849,    247,  55742,
            109,  49849,    232,  55742,    109,  49849,    248,     30,   5996,
         128006,    882, 128007,    271,  55742,    109,  49849,    237,  49849,
         

In [54]:
def apply_labels(example):
    text = example["text"]
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"].squeeze(0)

    # Start with everything masked (-100)
    labels = [-100] * len(input_ids)

    # Find the assistant's response
    start = text.find("<|start_header_id|>assistant<|end_header_id|>\n\n")
    if start != -1:
        start += len("<|start_header_id|>assistant<|end_header_id|>\n\n")
        response_ids = tokenizer(text[start:], return_tensors="pt")["input_ids"].squeeze(0)

        # Apply labels only to the assistant's response
        labels[-len(response_ids):] = response_ids.tolist()

    return {"input_ids": input_ids.tolist(), "labels": labels}


In [55]:
dataset = dataset.map(apply_labels)


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [56]:
'''for example in dataset:
    print("Input IDs:", example["input_ids"][:10])  # Print first few token IDs
    print("Labels:", example["labels"][:10])  # Should NOT be all -100
    print("Has valid labels?", any(label != -100 for label in example["labels"]))
    print("="*50)'''


'for example in dataset:\n    print("Input IDs:", example["input_ids"][:10])  # Print first few token IDs\n    print("Labels:", example["labels"][:10])  # Should NOT be all -100\n    print("Has valid labels?", any(label != -100 for label in example["labels"]))\n    print("="*50)'

In [57]:
for i in range(3):  # Check first 3 samples
    print("Input IDs:", dataset[i]['input_ids'])
    print("Labels:", dataset[i]['labels'])
    print("==================================================")


Input IDs: [128000, 128006, 9125, 128007, 271, 55742, 116, 55742, 250, 49849, 248, 29082, 114, 109, 55742, 116, 29082, 114, 109, 49849, 222, 49849, 251, 55742, 107, 49849, 240, 13, 29082, 114, 116, 55742, 116, 29082, 114, 242, 55742, 114, 55742, 100, 29082, 90183, 49849, 247, 49849, 225, 49849, 248, 55742, 107, 29082, 115, 225, 49849, 226, 55742, 118, 29082, 115, 222, 49849, 247, 55742, 109, 49849, 232, 55742, 109, 49849, 248, 30, 5996, 128006, 882, 128007, 271, 55742, 109, 49849, 237, 49849, 225, 49849, 237, 29082, 114, 116, 49849, 244, 55742, 121, 49849, 225, 49849, 232, 55742, 106, 49849, 237, 55742, 109, 55742, 118, 29082, 114, 112, 49849, 240, 49849, 226, 49849, 240, 55742, 100, 49849, 237, 29082, 114, 229, 55742, 255, 49849, 232, 55742, 255, 49849, 248, 29082, 90183, 49849, 240, 55742, 109, 55742, 116, 49849, 232, 29082, 114, 109, 55742, 250, 55742, 119, 55742, 118, 55742, 248, 55742, 107, 5380, 128006, 78191, 128007, 271, 49849, 222, 49849, 250, 49849, 224, 49849, 240, 55742, 10

In [58]:
def fix_labels(example):
    input_ids = example["input_ids"]
    labels = [-100] * len(input_ids)  # Default all to ignore

    # Find the assistant's response start point
    start_idx = None
    for i in range(len(input_ids) - 1):
        if input_ids[i] == 128006 and input_ids[i + 1] == 78191:  # <|start_header_id|>assistant
            start_idx = i + 2  # Start labeling after this

    if start_idx:
        for i in range(start_idx, len(input_ids)):
            labels[i] = input_ids[i]  # Keep assistant response tokens

    return {"labels": labels}

dataset = dataset.map(fix_labels)


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [59]:
for i in range(3):
    print(f"Fixed Example {i+1}:")
    print("Input IDs:", dataset[i]['input_ids'])
    print("Labels:", dataset[i]['labels'])
    print("="*50)


Fixed Example 1:
Input IDs: [128000, 128006, 9125, 128007, 271, 55742, 116, 55742, 250, 49849, 248, 29082, 114, 109, 55742, 116, 29082, 114, 109, 49849, 222, 49849, 251, 55742, 107, 49849, 240, 13, 29082, 114, 116, 55742, 116, 29082, 114, 242, 55742, 114, 55742, 100, 29082, 90183, 49849, 247, 49849, 225, 49849, 248, 55742, 107, 29082, 115, 225, 49849, 226, 55742, 118, 29082, 115, 222, 49849, 247, 55742, 109, 49849, 232, 55742, 109, 49849, 248, 30, 5996, 128006, 882, 128007, 271, 55742, 109, 49849, 237, 49849, 225, 49849, 237, 29082, 114, 116, 49849, 244, 55742, 121, 49849, 225, 49849, 232, 55742, 106, 49849, 237, 55742, 109, 55742, 118, 29082, 114, 112, 49849, 240, 49849, 226, 49849, 240, 55742, 100, 49849, 237, 29082, 114, 229, 55742, 255, 49849, 232, 55742, 255, 49849, 248, 29082, 90183, 49849, 240, 55742, 109, 55742, 116, 49849, 232, 29082, 114, 109, 55742, 250, 55742, 119, 55742, 118, 55742, 248, 55742, 107, 5380, 128006, 78191, 128007, 271, 49849, 222, 49849, 250, 49849, 224, 4984

In [60]:
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 103 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
'NoneType' object is not subscriptable
Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask,labels.

In [None]:
tokenizer = get_chat_template(
   tokenizer,
   chat_template = "llama-3.1",
)
# Set the PAD token to be the same as the EOS token to avoid tokenization issues
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
   {"role": "user", "content": "I am sad because I failed my Maths test today"}]
# Tokenize the user input with the chat template
inputs = tokenizer.apply_chat_template(
   messages,
   tokenize=True,
   add_generation_prompt=True,
   return_tensors="pt",
   padding=True,  # Add padding to match sequence lengths
).to("cuda")

attention_mask = inputs != tokenizer.pad_token_id

outputs = model.generate(
   input_ids=inputs,
   attention_mask=attention_mask,
   max_new_tokens=64,
   use_cache=True,  # Use cache for faster token generation
   temperature=0.6,  # Controls randomness in responses
   min_p=0.1,  # Set minimum probability threshold for token selection
)

# Decode the generated tokens into human-readable text
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)