<a href="https://colab.research.google.com/github/Ayesha9811/Streamlit-Dataset-Creator-Sinhala/blob/main/Fine_Tuning_DeepSeek_R1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up

In [1]:
%%capture
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch

# Loading options, kept as named notebook-level variables
load_in_4bit = True    # 4-bit quantization to reduce memory usage
dtype = None           # None = let the loader choose the data type (float16, bfloat16, etc.)
max_seq_length = 2048

# Fetch the base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
)


NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

In [None]:
# Layers that receive LoRA adapters
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,           # LoRA rank
    lora_alpha=16,  # scaling factor for the LoRA weights
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [None]:
from datasets import load_dataset  # Load datasets from Hugging Face Hub

# Fetch the "train" split of the dataset from the Hugging Face Hub
dataset = load_dataset(
    "AyeshaKalpani98/Sinhala-QnA-Generate",
    split="train",
)

In [None]:
from unsloth.chat_templates import standardize_sharegpt

# Re-key every record from "messages" to "conversations", the key the later formatting
# cells read. No ShareGPT -> role/content conversion happens here: each message list is
# passed through unchanged.
# Records that are already keyed by "conversations" are kept as they are, so re-running
# this cell no longer empties the dataset (filtering on "messages" alone dropped every
# record on a second run). Records with neither key are skipped.
dataset = [
    {"conversations": entry["messages"] if "messages" in entry else entry["conversations"]}
    for entry in dataset
    if "messages" in entry or "conversations" in entry
]



In [None]:
# Illustration only: these are bare dict expressions, nothing is assigned, and only the
# last one is shown as the cell's output.

# ShareGPT-style messages use the keys "from" / "value"
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What's the capital of France?"}
{"from": "gpt", "value": "The capital of France is Paris."}

# The same three messages written with the keys "role" / "content"
{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What's the capital of France?"}
{"role": "assistant", "content": "The capital of France is Paris."}

In [None]:
!pip install datasets


In [None]:
# Extract the actual list from the nested structure
#dataset = Dataset.from_dict({"conversations": [entry["conversations"] for entry in dataset]})


In [None]:
# Peek at the first record: a dict that holds its message list under "conversations"
first_record = dataset[0]
print(first_record)


In [None]:
#print(type(dataset))
#print(dataset.column_names)
#print(dataset[0])


In [None]:
def formatting_prompts_func(examples):
    """Render conversations into chat-template strings.

    Accepts either shape of input:

    * a batch mapping with a "conversations" entry holding a list of
      conversations (what `Dataset.map(..., batched=True)` passes in), or
    * an iterable of row dicts, each with its own "conversations" entry
      (the only shape the previous implementation handled).

    Iterating a batch mapping directly yields its column names, which made the
    previous implementation fail with a TypeError when used with a batched map.

    Relies on the notebook-level `tokenizer`.

    Returns:
        dict with a single key "text": one rendered string per conversation,
        in input order. Nothing is tokenized (`tokenize=False`).
    """
    from collections.abc import Mapping

    if isinstance(examples, Mapping):
        conversations = examples["conversations"]
    else:
        conversations = [row["conversations"] for row in examples]

    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in conversations
    ]
    return {"text": texts}



In [None]:
from datasets import Dataset
from unsloth.chat_templates import get_chat_template

# The `.map()` call further down needs a Hugging Face Dataset, so wrap a plain list of
# records; any other object is left as it is
dataset = Dataset.from_list(dataset) if isinstance(dataset, list) else dataset

# Switch the tokenizer to the Llama-3.1 chat template (`tokenizer` must already exist)
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-template string.

    Args:
        examples: batch mapping whose "conversations" entry is a list of
            conversations, as passed by `Dataset.map(..., batched=True)`.

    Returns:
        dict with a single key "text": the rendered strings, in input order.
        The strings are plain text, not token ids (`tokenize=False`), and no
        generation prompt is appended.
    """
    rendered = [
        tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=False,
            tokenize=False,
        )
        for conversation in examples["conversations"]
    ]
    return {"text": rendered}

# Run the formatter over the dataset in batches; its "text" result is added to the dataset
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)



In [None]:
sample = dataset[0]

# The first record as its original list of messages ...
print(sample["conversations"])

# ... and the chat-template string produced from it
print(sample["text"])

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
from unsloth.chat_templates import get_chat_template

# Reuse the `model` and `tokenizer` prepared by the earlier cells instead of reloading them.
# Reloading here with AutoModelForCausalLM / AutoTokenizer replaced the 4-bit, LoRA-wrapped
# model with a plain copy of the base checkpoint, so the adapters configured through
# `FastLanguageModel.get_peft_model` were never the ones being trained.
# `tokenizer` already carries the Llama-3.1 chat template that produced the "text" column.

# Sequence length used for training samples; this overrides the 2048 set in the
# model-loading cell. Adjust it as per your needs.
max_seq_length = 512

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise
use_bf16 = is_bfloat16_supported()

# Define training configurations
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column written by formatting_prompts_func
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,  # Number of examples per GPU batch
        gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches before updating model
        warmup_steps=5,  # Number of warmup steps for learning rate schedule
        max_steps=60,  # Limit training steps to 60 (for quick testing)
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,  # Log training metrics after every step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",  # Linear decay of learning rate
        seed=3407,
        output_dir="outputs",  # Directory to save model checkpoints
        report_to="none",  # Use this for WandB etc
    ),
)

# You can now start training by calling trainer.train() or other necessary training steps


In [None]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open a user turn and an assistant turn in the formatted text
user_turn_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_turn_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Pass the trainer through Unsloth's response-only helper, giving it both markers
trainer = train_on_responses_only(
    trainer,
    response_part=assistant_turn_header,
    instruction_part=user_turn_header,
)

# Run training and keep what `train()` returns
trainer_stats = trainer.train()

In [None]:
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)
# Reuse the EOS token as the PAD token so a pad id is available
tokenizer.pad_token = tokenizer.eos_token
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "I am sad because I failed my Maths test today"},
]

# Tokenize the user input with the chat template.
# return_dict=True makes the tokenizer return its own attention mask. Deriving the mask
# from `inputs != pad_token_id` is unsafe here because PAD was just set to EOS: any real
# EOS token in the prompt would be treated as padding and hidden from the model.
encoded = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    padding=True,  # Add padding to match sequence lengths
).to("cuda")

inputs = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

outputs = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    max_new_tokens=64,
    use_cache=True,  # Use cache for faster token generation
    do_sample=True,  # Sampling must be on for temperature / min_p to be applied
    temperature=0.6,  # Controls randomness in responses
    min_p=0.1,  # Set minimum probability threshold for token selection
)

# Decode the generated tokens (prompt included) into human-readable text
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)