**Install libraries**

In [None]:
!pip install -q transformers accelerate bitsandbytes peft datasets

**Import libraries**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

**Mount Google Drive**

In [None]:
# Colab-only step: expose Google Drive in the runtime's filesystem so the
# model directories under /content/drive can be read by the cells below.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Define the base model and LoRA adapter paths**

In [None]:
# Both artifacts live in the same project folder on the mounted Drive.
project_root = "/content/drive/MyDrive/Potential_Talent"

base_model_path = f"{project_root}/Llama_3.2_3B_instruct"  # base model directory
lora_path = f"{project_root}/lora_adapters"                # trained LoRA adapters

**Load tokenizer**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same directory as the base model. Reusing
# `base_model_path` (instead of repeating the literal path) keeps the tokenizer
# and the model in sync if the path is ever changed.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_path,
    use_fast=True,
    # NOTE(review): kept as in the original; confirm this flag is actually
    # needed for this (Llama) tokenizer and supported by the installed version.
    fix_mistral_regex=True,
)

# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

**Load the base model**

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Request 4-bit quantization through an explicit BitsAndBytesConfig. Passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated and will be
# removed in future versions of transformers.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load the base model from Drive; `device_map="auto"` lets the library decide
# where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",
    quantization_config=quantization_config,
    dtype=torch.float16,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**Load the trained lora model**

In [None]:
# Wrap the already-loaded base model with the LoRA adapter weights stored at
# `lora_path`; from here on `model` refers to the adapter-augmented model.
model = PeftModel.from_pretrained(model=model, model_id=lora_path)

# Inference only: switch to evaluation mode (turns off the LoRA dropout layers).
# Left as the cell's last expression, so the module structure is displayed.
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linea

**Define and configure inference function**

In [None]:
def generate(prompt, max_new_tokens=128, temperature=0.7, return_full_text=True):
    """Generate a completion for ``prompt`` with the notebook's model and tokenizer.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature. Values that are ``None`` or ``<= 0``
            switch to greedy decoding, because sampling requires a strictly
            positive temperature.
        return_full_text: If True (default, the original behaviour) the decoded
            string contains the prompt followed by the completion. If False,
            only the newly generated tokens are decoded.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Send the inputs to wherever the model lives instead of assuming "cuda";
    # the model was loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    if temperature is not None and temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        # A non-positive temperature is invalid for sampling; decode greedily.
        sampling_kwargs = {"do_sample": False}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            **sampling_kwargs,
        )

    sequence = output_ids[0]
    if not return_full_text:
        # The returned sequence starts with the prompt tokens; drop them.
        prompt_length = inputs["input_ids"].shape[-1]
        sequence = sequence[prompt_length:]
    return tokenizer.decode(sequence, skip_special_tokens=True)

**Test the prompt**

In [None]:
# Sample candidate profile followed by the screening instruction.
prompt = (
    "Candidate profile:\n"
    "Title: software engineer\n"
    "Location: United States\n"
    "\n"
    "Evaluate the candidate and assign a screening score."
)

# Run the model on the sample prompt and show the decoded text.
output = generate(prompt)
print(output)


Candidate profile:
Title: software engineer
Location: United States

Evaluate the candidate and assign a screening score. Screening score: 95.

Reviewing the candidate profile, screening score of 95, and evaluating their suitability for the role. The response is: Screening score: 95. Recommend interview.

The candidate has a strong screening score of 95, indicating a good fit for the role. However, a screening score of 95 is not a perfect score, and the candidate may still need to be evaluated further to determine their full potential. I would recommend scheduling an interview to further assess the candidate's qualifications and fit for the role.

Assigning a screening score of 95.

Screening score: 95

Recommend interview

Screening
