**Install libraries**

In [None]:
# Install the libraries this notebook relies on (transformers, accelerate, bitsandbytes, peft, datasets); -q keeps pip quiet.
# NOTE(review): versions are unpinned, so a fresh run may pull different releases than the ones that produced the saved outputs — consider pinning.
!pip install -q transformers accelerate bitsandbytes peft datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h

**Import libraries**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model


**Mount Google Drive as a folder in the notebook**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used in later cells resolve.
drive.mount('/content/drive')

Mounted at /content/drive


**Define the model path**

In [None]:
# Directory on the mounted Drive from which both the tokenizer and the model are loaded.
model_path = "/content/drive/MyDrive/Potential_Talent/Llama_3.2_3B_instruct"

**Load tokenizer**

In [None]:
# Load the fast tokenizer stored in the model directory.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The tokenizer you are loading from '/content/drive/MyDrive/Potential_Talent/Llama_3.2_3B_instruct' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


**Load the model**

In [None]:
# Local import keeps this cell self-contained; BitsAndBytesConfig ships with transformers.
from transformers import BitsAndBytesConfig

# Load the base model with 4-bit bitsandbytes quantization.
# The quantization settings are passed through `quantization_config` and the
# dtype through `dtype`, replacing the deprecated `load_in_4bit=` and
# `torch_dtype=` keyword arguments. Only `load_in_4bit=True` is set so the
# remaining quantization options keep their library defaults, as before.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let the library place the weights on the available device(s)
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    dtype=torch.float16,
)

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**Specify the fine-tuning method (LoRA) and configure it**

In [None]:
# LoRA settings; applied to the base model by get_peft_model in a later cell.
lora_config = LoraConfig(
    r=8,  # rank of the LoRA update matrices
    lora_alpha=32,  # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # only modules with these names get adapters (presumably the attention query/value projections)
    lora_dropout=0.05,  # dropout applied inside the LoRA layers
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM"  # the wrapped model is used for causal language modeling
)

**Get the model to be finetuned by applying the lora config on the base model**

In [None]:
from peft import get_peft_model

# Wrap the base model with the LoRA configuration; `model` is rebound to the wrapped (PEFT) model.
model = get_peft_model(model, lora_config)
# Print the number of trainable parameters next to the total parameter count.
model.print_trainable_parameters()

trainable params: 2,293,760 || all params: 3,215,043,584 || trainable%: 0.0713


**Load the processed dataset in jsonl format**

In [None]:
from datasets import load_dataset

# Read the prompt/response JSONL file from Drive as a single "train" split.
dataset = load_dataset(
    "json",
    split="train",
    data_files="/content/drive/MyDrive/Potential_Talent/prompt_response_data.jsonl",
)

# Display the first record as a quick sanity check.
dataset[0]

Generating train split: 0 examples [00:00, ? examples/s]

{'prompt': 'Candidate profile:\nTitle: innovative and driven professional seeking a role in data analyticsdata science in the information technology industry.\nLocation: united states\n\nEvaluate the candidate and assign a screening score.',
 'response': 'Screening score: 100'}

**Format/combine the prompt and response fields into a single `text` field, as expected by a causal LM**

In [None]:
def format_example(example):
    """Combine a record's prompt and response into one training string.

    Args:
        example: Mapping with a ``'prompt'`` and a ``'response'`` entry.

    Returns:
        A dict with a single ``'text'`` key holding the prompt under an
        ``### Instruction:`` header, a blank line, and the response under
        a ``### Response:`` header.
    """
    instruction = example["prompt"]
    answer = example["response"]
    sections = [
        "### Instruction:",
        f"{instruction}",
        "",  # blank line separating the two sections
        "### Response:",
        f"{answer}",
    ]
    return {"text": "\n".join(sections)}

**Apply the formatting function to the dataset**

In [None]:
# Add the combined `text` column produced by format_example to every record.
dataset = dataset.map(format_example)
# Display the first record to confirm the new column is present.
dataset[0]

Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

{'prompt': 'Candidate profile:\nTitle: innovative and driven professional seeking a role in data analyticsdata science in the information technology industry.\nLocation: united states\n\nEvaluate the candidate and assign a screening score.',
 'response': 'Screening score: 100',
 'text': '### Instruction:\nCandidate profile:\nTitle: innovative and driven professional seeking a role in data analyticsdata science in the information technology industry.\nLocation: united states\n\nEvaluate the candidate and assign a screening score.\n\n### Response:\nScreening score: 100'}

**Define and configure tokenization function**

In [None]:
def tokenize_function(example):
    """Tokenize one formatted example for causal-LM fine-tuning.

    The text is truncated to at most 512 tokens but is NOT padded here and
    no ``labels`` are attached:

    * Padding every example to 512 tokens makes each training step process
      mostly padding when the texts are short. Leaving sequences at their
      natural length lets the data collator pad each batch only as far as
      its longest member.
    * ``DataCollatorForLanguageModeling(mlm=False)`` builds ``labels`` from
      ``input_ids`` itself (with padding positions ignored), so a copy made
      here would simply be overwritten.

    Args:
        example: Mapping with a ``'text'`` entry holding the training string.

    Returns:
        The tokenizer output for ``example['text']`` (e.g. ``input_ids`` and
        ``attention_mask``).
    """
    # `tokenizer` is the notebook-level tokenizer loaded in an earlier cell.
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )

**Apply the function to the dataset, removing the previous columns so that only the tokenized columns remain**

In [None]:
# Tokenize every record and drop the original columns so only the tokenizer's outputs remain.
dataset = dataset.map(tokenize_function, remove_columns=dataset.column_names)

# Display the dataset summary (features and row count).
dataset

Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1285
})

**Convert the tokenization output to torch tensors using a data collator**

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator that batches tokenized examples; mlm=False selects causal (not masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

**Configure training arguments**

In [None]:
from transformers import TrainingArguments

# Training hyperparameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Potential_Talent/lora_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per optimizer step
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    logging_steps=10,
    # 1,285 examples / 8 per step * 3 epochs is roughly 483 optimizer steps,
    # so the previous interval of 500 never produced a checkpoint. Saving
    # every 100 steps means an interrupted Colab session loses little work.
    save_steps=100,
    save_total_limit=2,  # keep only the two most recent checkpoints on Drive
    report_to="none",
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # gradient_checkpointing removed
)

**Define a trainer callback that saves the LoRA adapters during training**

In [None]:
from transformers import TrainerCallback

class SaveLoRACallback(TrainerCallback):
    """Trainer callback that periodically writes the model to a step-suffixed path.

    Attributes are set in ``__init__``:
        save_path: Path prefix; ``_step<N>`` is appended for each save.
        save_every_steps: Save whenever the global step is a non-zero
            multiple of this value.
    """

    def __init__(self, save_path, save_every_steps=500):
        """Store the path prefix and the save interval."""
        self.save_path = save_path
        self.save_every_steps = save_every_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Save ``kwargs['model']`` when the current global step hits the interval."""
        step = state.global_step
        # Skip steps that are not a multiple of the interval, and step 0.
        if step % self.save_every_steps != 0 or step == 0:
            return
        target_dir = f"{self.save_path}_step{step}"
        print(f"Saving LoRA adapters to Google Drive at step {step}...")
        kwargs['model'].save_pretrained(target_dir)

**Create the Trainer and train the model**

In [None]:
from transformers import Trainer
# Callback that writes the LoRA adapters to Drive during training.
# The run is only ~483 optimizer steps (1,285 examples, accumulation of 8,
# 3 epochs), so an interval of 500 would never fire; 100 is used instead.
callback = SaveLoRACallback(
    save_path="/content/drive/MyDrive/Potential_Talent/lora_checkpoints",
    save_every_steps=100
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # `processing_class` is the current name for the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Register the callback; it was created before but never handed to the
    # Trainer, so it could not run.
    callbacks=[callback],
)

# Run the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
10,4.158
20,2.765
30,1.5089
40,1.4855
50,1.4527
60,1.3323
70,1.2813
80,1.3967
90,1.3955
100,1.2634


TrainOutput(global_step=483, training_loss=1.2102690202108821, metrics={'train_runtime': 2715.4502, 'train_samples_per_second': 1.42, 'train_steps_per_second': 0.178, 'total_flos': 3.340834910502912e+16, 'train_loss': 1.2102690202108821, 'epoch': 3.0})

**Save the trained model**

In [None]:
# Save the fine-tuned PEFT model (the LoRA adapters) to Drive.
model.save_pretrained("/content/drive/MyDrive/Potential_Talent/lora_adapters")
# Save the tokenizer files into the same directory so both can be loaded from one place.
tokenizer.save_pretrained("/content/drive/MyDrive/Potential_Talent/lora_adapters")

('/content/drive/MyDrive/Potential_Talent/lora_adapters/tokenizer_config.json',
 '/content/drive/MyDrive/Potential_Talent/lora_adapters/special_tokens_map.json',
 '/content/drive/MyDrive/Potential_Talent/lora_adapters/chat_template.jinja',
 '/content/drive/MyDrive/Potential_Talent/lora_adapters/tokenizer.json')