In [None]:
import os
from pathlib import Path

# Ensure the notebook runs with MlServer as its working directory, because the
# later cells use paths relative to it (e.g. "data", "models").
# Resolution order:
#   1. cwd is already MlServer        -> nothing to do
#   2. cwd contains an MlServer dir   -> descend into it
#   3. an ancestor of cwd is MlServer -> climb up to it (kernel started in a sub-folder)
# Raises FileNotFoundError when none of the above applies.
repo_root = Path.cwd()
mlserver_dir = repo_root / "MlServer"
if repo_root.name != "MlServer":
    if mlserver_dir.is_dir():
        # is_dir() rather than exists(): a regular file named MlServer cannot be chdir'd into
        os.chdir(mlserver_dir)
    else:
        # Closest ancestor named MlServer, or None when cwd is outside the project
        enclosing_mlserver = next(
            (ancestor for ancestor in repo_root.parents if ancestor.name == "MlServer"),
            None,
        )
        if enclosing_mlserver is None:
            raise FileNotFoundError("MlServer directory not found relative to current working directory.")
        os.chdir(enclosing_mlserver)

print(f"cwd set to: {Path.cwd()}")

In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before each execution.
%autoreload 2

# Import the two helper modules through %aimport.
# NOTE(review): under mode 2 every module is already reloaded, so these look
# redundant unless the mode is later switched to 1/explicit — confirm intent.
%aimport helpers.parse_pdf
%aimport helpers.create_prompt

In [None]:
# Imports
import time
import torch

from helpers.parse_pdf import parse_pdf
from helpers.create_prompt import CreatePrompts
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [None]:
# Project folders, expressed relative to the current working directory
data_dir, models_dir = (Path(folder) for folder in ("data", "models"))

In [None]:
# Gather every PDF under data/resumes in sorted order, then list the file names
resumes_dir = data_dir / "resumes"
resume_files = sorted(resumes_dir.glob("*.pdf"))
if resume_files:
    # One name per line; nothing is printed when no PDFs were found
    print("\n".join(pdf_path.name for pdf_path in resume_files))


In [None]:
# Parse each resume PDF and keep the result keyed by the file stem (name without ".pdf")
resume_texts = {}
banner = "=" * 50

for resume_file in resume_files:
    # Header: blank line, rule, file name, rule
    print(f"\n{banner}", resume_file.name, banner, sep="\n")

    resume_text = parse_pdf(resume_file)
    # Show only the first 100 items of the parsed result as a quick sanity check
    print(resume_text[:100])

    resume_texts[resume_file.stem] = resume_text

In [None]:
# Identifier of the pretrained model; passed to both AutoModelForCausalLM.from_pretrained
# and AutoTokenizer.from_pretrained in the loading cell.
model_name = "Qwen/Qwen2.5-3B-Instruct"

In [None]:
# bitsandbytes settings: 4-bit NF4 weights with double quantization, float16 compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer and model are loaded from the same identifier; device placement is left to "auto"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)


In [None]:
# Report whether the model's device string mentions CUDA
device_label = "GPU" if "cuda" in str(model.device) else "CPU"
print(f"{model_name} loaded on {device_label}")

In [None]:
# For every parsed resume: build the work-experience prompt, run the chat model
# on it, then print the decoded answer and how long generation took.
# NOTE(review): no sampling arguments are passed to generate(), so decoding
# follows the model's own generation config and output may vary between runs —
# confirm whether deterministic decoding is wanted.
for file_name, resume_text in resume_texts.items():
    print(f'\n{"=" * 50}')
    print(file_name)
    print("=" * 50)

    prompts = CreatePrompts(resume_text=resume_text)
    work_experience_prompt = prompts.work_experience_prompt()

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": work_experience_prompt},
    ]
    # Render the conversation to a single string (tokenize=False) that ends
    # with the generation prompt, then tokenize it as a batch of one.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate work experience summary.
    # perf_counter() is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512,
            use_cache=True,
        )

    end = time.perf_counter()

    # Drop the leading prompt tokens from each output sequence so that only the
    # newly generated tokens are decoded.
    new_token_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

    print(f"Response: {response}")
    print(f"Generation time: {end - start:.2f} seconds")

