#### This notebook was run on Google Colab; the outputs were cleared so that GitHub can render it.

**Install libraries**

In [None]:
!pip install -q transformers accelerate bitsandbytes peft datasets

**Import libraries**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model


**Mount Google Drive as a folder in the notebook**

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used in the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

**Define the model path**

In [None]:
# Directory on the mounted Drive from which both the tokenizer and the model are loaded.
model_path = "/content/drive/MyDrive/Potential_Talent/Llama_3.2_3B_instruct"

**Load tokenizer**

In [None]:
# Load the tokenizer from the same path as the model, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): the DataCollatorForLanguageModeling used later presumably masks
# pad-token positions in the labels; with pad == eos that would also hide any
# genuine EOS tokens from the loss — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token

**Load the model**

In [None]:
from transformers import BitsAndBytesConfig

# Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated;
# the supported way to request 4-bit loading is an explicit
# BitsAndBytesConfig handed over via `quantization_config`.
# Only `load_in_4bit` is set so every other quantization option keeps its default.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the base causal-LM from Drive, letting `device_map="auto"` decide
# device placement and using float16 for the non-quantized weights.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)

**Specify the fine-tuning method (LoRA) and configure it**

In [None]:
# LoRA adapter settings; this config is applied to the base model with
# get_peft_model in the next code cell.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # names of the modules the adapters are applied to
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"  # matches the AutoModelForCausalLM base model loaded above
)

**Create the model to be fine-tuned by applying the LoRA config to the base model**

In [None]:
# Wrap the base model with the adapters described by `lora_config`.
# `get_peft_model` is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
# NOTE: this cell rebinds `model` to the wrapped model, so running it a second
# time would wrap the already-wrapped model; reload the base model first.
model = get_peft_model(model, lora_config)

# Print the trainable-parameter summary for the wrapped model.
model.print_trainable_parameters()

**Load the processed dataset (JSONL format)**

In [None]:
from datasets import load_dataset

# Read the JSONL file from Drive with the "json" loader and take its "train" split.
dataset = load_dataset("json", split="train", data_files="/content/drive/MyDrive/Potential_Talent/prompt_response_data.jsonl")

# Display the first record as a quick sanity check.
dataset[0]

**Format/combine the prompt and response fields into a single `text` field, as expected by a causal LM**

In [None]:
def format_example(example):
    """Merge a record's prompt and response into one training string.

    Args:
        example: Mapping with ``'prompt'`` and ``'response'`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the prompt under an
        ``### Instruction:`` header, a blank line, and then the response
        under a ``### Response:`` header.
    """
    instruction_part = f"### Instruction:\n{example['prompt']}"
    response_part = f"### Response:\n{example['response']}"
    # The two sections are separated by exactly one blank line.
    return {"text": instruction_part + "\n\n" + response_part}

**Apply the formatting function to the dataset**

In [None]:
# Run format_example over every record, which adds the combined "text" field.
dataset = dataset.map(function=format_example)

# Display the first record to confirm the new field is present.
dataset[0]

**Define and configure tokenization function**

In [None]:
def tokenize_function(example):
    """Tokenize one record's ``text`` and attach labels for causal-LM training.

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        The output of the notebook-level ``tokenizer`` (called with truncation
        and ``max_length`` padding at 512) with an added ``"labels"`` entry
        that is a copy of ``"input_ids"``.
    """
    text = example["text"]
    encoding = tokenizer(text, max_length=512, padding="max_length", truncation=True)
    # Labels start out as a separate copy of the input ids.
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

**Apply the function to the dataset, removing the previous columns and keeping only the tokenized columns**

In [None]:
# Tokenize every record; the columns that existed before this call are dropped,
# so only the fields returned by tokenize_function remain.
dataset = dataset.map(function=tokenize_function, remove_columns=dataset.column_names)

# Display the dataset summary.
dataset

**Convert the tokenization output to torch tensors using a data collator**

In [None]:
from transformers import DataCollatorForLanguageModeling

# Build the batch collator around the notebook's tokenizer; mlm=False turns off
# masked-language-model mode.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

**Configure training arguments**

In [None]:
from transformers import TrainingArguments

# Hyperparameters and output settings; passed to the Trainer as `args` below.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Potential_Talent/lora_output",  # training output directory on the mounted Drive
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # with batch size 1: 8 samples per optimizer update on each device
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # gradient_checkpointing removed (it is not set in these arguments)
)

**Define a trainer callback that saves the LoRA adapters during training**

In [None]:
from transformers import TrainerCallback

class SaveLoRACallback(TrainerCallback):
    """Trainer callback that periodically saves the model being trained.

    Each time ``state.global_step`` is a non-zero multiple of
    ``save_every_steps``, the model passed to the callback is written with
    ``save_pretrained`` to ``f"{save_path}_step{global_step}"``.
    """

    def __init__(self, save_path, save_every_steps=500):
        """Store the save location and interval.

        Args:
            save_path: Path prefix; ``_step<N>`` is appended for each save.
            save_every_steps: Number of steps between saves. Must be positive.

        Raises:
            ValueError: If ``save_every_steps`` is not positive.
        """
        # Fail fast here: a zero interval would otherwise only surface as a
        # ZeroDivisionError on the first training step.
        if save_every_steps <= 0:
            raise ValueError("save_every_steps must be a positive integer")
        self.save_path = save_path
        self.save_every_steps = save_every_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Save the model when the current step hits the configured interval.

        Args:
            args: Unused here.
            state: Object exposing ``global_step``.
            control: Unused here.
            **kwargs: Must contain ``'model'``, the object to save.
        """
        step = state.global_step
        # Skip step 0 and every step that is not a multiple of the interval.
        if step == 0 or step % self.save_every_steps != 0:
            return
        checkpoint_path = f"{self.save_path}_step{step}"
        print(f"Saving LoRA adapters to Google Drive at step {step}...")
        kwargs['model'].save_pretrained(checkpoint_path)

**Create the Trainer and train the model**

In [None]:
from transformers import Trainer

# Callback that writes the adapters to Drive every 500 steps during training.
callback = SaveLoRACallback(
    save_path="/content/drive/MyDrive/Potential_Talent/lora_checkpoints",
    save_every_steps=500
)

# NOTE(review): recent transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[callback],  # register the callback; without this it is never invoked
)

trainer.train()

**Save the trained model**

In [None]:
# Write the LoRA-wrapped model and the tokenizer into the same Drive folder.
adapter_dir = "/content/drive/MyDrive/Potential_Talent/lora_adapters"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)