In [None]:
import os
os.environ["BNB_CUDA_VERSION"] = "124"  # force using CUDA 12.4 binary

!pip install -U transformers accelerate evaluate
!pip install rouge_score sacrebleu
!pip uninstall -y bitsandbytes
!pip install bitsandbytes

# RAG para ranking de candidatos por cover letters
# Implementación 1 (E5 + Qwen2.5-3B) e Implementación 2 (BGE-large + Granite/Watson 1B)


# Carga del dataset y construcción del corpus de documentos (cover letters + metadatos)

In [None]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("dhruvvaidh/cover-letter-dataset-llama3")

# The train split feeds the retrieval index.
train_df = ds["train"].to_pandas()

# The validation split feeds the evaluation; when the dataset ships without one,
# fall back to the train split.
# NOTE(review): with the fallback, evaluation queries come from already-indexed rows.
valid_df = (ds["validation"] if "validation" in ds else ds["train"]).to_pandas()

def _clean_field(value):
    """Return ``value`` as a stripped string, mapping missing values to ``""``.

    ``None`` and float NaN (what pandas uses for missing cells) would otherwise be
    stringified to the literal text "None" / "nan" and end up inside the indexed documents.
    """
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def canonize_row(r):
    """Turn one dataset row into the canonical record stored in the corpus.

    Args:
        r: a mapping-like row (dict or pandas Series) read through ``.get`` for the
            keys "Instruction", "Prompt" and "Output". Missing keys or missing
            values become empty strings.

    Returns:
        dict with keys:
            "doc": cover letter followed by the prompt context, the text that gets embedded.
            "cover_letter": the stripped "Output" field.
            "context": the stripped "Prompt" field.
            "instruction": the stripped "Instruction" field.
    """
    instr = _clean_field(r.get("Instruction"))
    prompt = _clean_field(r.get("Prompt"))     # usually holds the job description and/or CV
    output = _clean_field(r.get("Output"))     # the final cover letter
    # Base document: the letter plus a short context taken from the prompt.
    doc = f"Cover Letter:\n{output}\n\nContext (job/CV):\n{prompt}"
    return {
        "doc": doc,
        "cover_letter": output,
        "context": prompt,
        "instruction": instr
    }

# One canonical record per training row; the former row index is kept as "doc_id".
corpus = (
    train_df
    .apply(canonize_row, axis=1, result_type="expand")
    .reset_index()
    .rename(columns={"index": "doc_id"})
)
len(corpus)


# Utilidades: normalización, tokenización ligera para fragmentación opcional


In [None]:
import re

def normalize_text(t):
    """Collapse every whitespace run in ``t`` into a single space and trim both ends.

    Args:
        t: the text to normalize (a ``str``).

    Returns:
        The normalized string; an empty or whitespace-only input yields ``""``.
    """
    # "\s+" already matches "\r" and "\n", so one regex pass is enough; the earlier
    # replace() chain was redundant (its "\n" -> "\n" step was a no-op).
    return re.sub(r"\s+", " ", t).strip()

# Whitespace-normalized copy of each document; this is the text that gets embedded.
corpus["doc_norm"] = corpus["doc"].map(normalize_text)


NameError: name 'corpus' is not defined

# Implementación 1: Embeddings E5 + FAISS y Generador Qwen2.5-3B


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

embed_model_A_name = "intfloat/e5-base-v2"  # Embeddings A
embed_model_A = SentenceTransformer(embed_model_A_name)

# E5 recommends the "query: " and "passage: " prefixes; corpus documents are passages.
docs_A = [f"passage: {text}" for text in corpus["doc_norm"].tolist()]
doc_emb_A = np.array(
    embed_model_A.encode(
        docs_A,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    ),
    dtype="float32",
)

# Flat inner-product index sized to the embedding dimension.
index_A = faiss.IndexFlatIP(doc_emb_A.shape[1])
index_A.add(doc_emb_A)

# Search utilities
def search_A(query, k=5):
    """Return up to ``k`` nearest corpus documents for ``query`` using the E5 index.

    Args:
        query: the query text; the E5 "query: " prefix is added here.
        k: number of neighbours requested from the index.

    Returns:
        A list of ``(position, score)`` tuples in the order returned by the index,
        where ``position`` is the row position in the indexed documents and
        ``score`` is the inner product with the (normalized) query embedding.
        The list is shorter than ``k`` when the index holds fewer than ``k`` vectors.
    """
    q = "query: " + query
    q_emb = embed_model_A.encode([q], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")
    scores, idxs = index_A.search(q_emb, k)
    # FAISS fills missing results with label -1; drop them so callers never use -1
    # as a positional index (which would silently select the last corpus row).
    return [(int(i), float(s)) for i, s in zip(idxs[0], scores[0]) if i >= 0]


# Generador A: Qwen2.5-3B (causal LM). Usaremos el modelo base y LoRA si ya está entrenado.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# BitsAndBytesConfig is needed for the 4-bit load below, but the import line of this cell
# only brings in AutoTokenizer and AutoModelForCausalLM; import it here to avoid a NameError.
from transformers import BitsAndBytesConfig

gen_A_name = "Qwen/Qwen2.5-3B"
tok_A = AutoTokenizer.from_pretrained(gen_A_name)
# Reuse the EOS token for padding when the tokenizer does not define a pad token.
if tok_A.pad_token is None:
    tok_A.pad_token = tok_A.eos_token


# Load the model weights in 4-bit.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

gen_A = AutoModelForCausalLM.from_pretrained(
    gen_A_name,
    device_map="auto",
    quantization_config=quantization_config
)

def format_rag_prompt(job_desc, retrieved_docs):
    """Build the (Spanish) RAG prompt for generator A.

    Args:
        job_desc: the job description text inserted under "### Job Description:".
        retrieved_docs: iterable of row positions into ``corpus``; each one is rendered
            as "[Doc i] <doc text>" with ``i`` counting from 0.

    Returns:
        The full prompt string, ending with the "### Respuesta:" marker.
    """
    context_str = "\n\n".join(
        f"[Doc {i}] {corpus.iloc[idx]['doc']}" for i, idx in enumerate(retrieved_docs)
    )
    # The accented characters below were previously mis-encoded (mojibake), which put
    # garbage bytes into the text the model reads.
    return (
        "### Tarea:\n"
        "Dado un job description, recupera cover letters similares y genera un ranking breve de candidatos con justificación.\n\n"
        "### Job Description:\n"
        f"{job_desc}\n\n"
        "### Evidencia recuperada (fragmentos de cover letters):\n"
        f"{context_str}\n\n"
        "### Instrucciones:\n"
        "- Resume los 3 candidatos más alineados.\n"
        "- Fundamenta cada elección con evidencias del contexto recuperado.\n"
        "- Tono profesional y conciso.\n\n"
        "### Respuesta:"
    )

@torch.no_grad()
def generate_with_A(job_desc, k=5, max_new_tokens=400, temperature=0.7, top_p=0.9):
    """Retrieve evidence with ``search_A`` and generate a candidate ranking with generator A.

    Args:
        job_desc: the job description used both as retrieval query and in the prompt.
        k: number of documents to retrieve.
        max_new_tokens: generation length limit.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        ``(answer, hits)`` where ``answer`` is the stripped generated text and ``hits``
        is the list of ``(position, score)`` tuples returned by ``search_A``.
    """
    hits = search_A(job_desc, k=k)
    idxs = [h[0] for h in hits]
    prompt = format_rag_prompt(job_desc, idxs)
    inputs = tok_A(prompt, return_tensors="pt").to(gen_A.device)
    out = gen_A.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=tok_A.eos_token_id,
        pad_token_id=tok_A.pad_token_id,
    )
    # The output sequence starts with the prompt tokens. Decode only what follows them,
    # rather than splitting the full text on "### Respuesta:", which would cut the answer
    # short whenever the model repeats that marker in its own output.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tok_A.decode(out[0][prompt_len:], skip_special_tokens=True)
    return answer.strip(), hits


# Implementación 2: Embeddings BGE-large + FAISS y Generador Granite/Watson 1B


In [None]:
embed_model_B_name = "BAAI/bge-large-en-v1.5"  # Embeddings B
embed_model_B = SentenceTransformer(embed_model_B_name)

# BGE v1.5 applies a retrieval instruction to queries only; passages are embedded as-is.
# An ad-hoc passage prefix is not part of the model's documented usage, so the
# normalized documents are encoded without one.
docs_B = corpus["doc_norm"].tolist()
doc_emb_B = embed_model_B.encode(docs_B, batch_size=32, show_progress_bar=True, normalize_embeddings=True)
doc_emb_B = np.array(doc_emb_B, dtype="float32")

# Flat inner-product index sized to the embedding dimension.
index_B = faiss.IndexFlatIP(doc_emb_B.shape[1])
index_B.add(doc_emb_B)

def search_B(query, k=5):
    """Return up to ``k`` nearest corpus documents for ``query`` using the BGE index.

    Args:
        query: the query text; the BGE query instruction is prepended here.
        k: number of neighbours requested from the index.

    Returns:
        A list of ``(position, score)`` tuples in the order returned by the index,
        where ``position`` is the row position in the indexed documents and
        ``score`` is the inner product with the (normalized) query embedding.
        The list is shorter than ``k`` when the index holds fewer than ``k`` vectors.
    """
    # Query instruction documented for the English BGE v1.5 models.
    q = "Represent this sentence for searching relevant passages: " + query
    q_emb = embed_model_B.encode([q], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")
    scores, idxs = index_B.search(q_emb, k)
    # FAISS fills missing results with label -1; drop them so callers never use -1
    # as a positional index (which would silently select the last corpus row).
    return [(int(i), float(s)) for i, s in zip(idxs[0], scores[0]) if i >= 0]


# Generador B: Granite/Watson 1B

In [None]:
gen_B_name = "ibm-granite/granite-3.1-1b-a400m-instruct"

tok_B = AutoTokenizer.from_pretrained(gen_B_name)
# Reuse EOS for padding, but only when the tokenizer lacks a pad token and has an EOS token.
if tok_B.pad_token is None and tok_B.eos_token:
    tok_B.pad_token = tok_B.eos_token

# Generator B is loaded in bfloat16 with automatic device placement.
gen_B = AutoModelForCausalLM.from_pretrained(
    gen_B_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

def format_rag_prompt_B(job_desc, retrieved_docs):
    """Assemble the English RAG prompt for generator B.

    Args:
        job_desc: job description text placed under "Job Description:".
        retrieved_docs: iterable of row positions into ``corpus``; each is rendered as
            "[Doc i] <doc text>", with ``i`` counting from 0.

    Returns:
        The prompt string, ending with the "Answer:" marker.
    """
    evidence_blocks = []
    for rank, row_pos in enumerate(retrieved_docs):
        evidence_blocks.append(f"[Doc {rank}] {corpus.iloc[row_pos]['doc']}")
    context_str = "\n\n".join(evidence_blocks)

    # Sections are separated by one blank line.
    sections = [
        "Task: Rank candidates based on fit to the job description using retrieved cover letters as evidence.",
        f"Job Description:\n{job_desc}",
        f"Retrieved Evidence:\n{context_str}",
        "Instructions:\n- Provide top-3 candidates with 1-2 evidence points each.\n- Keep it concise and professional.",
        "Answer:",
    ]
    return "\n\n".join(sections)

@torch.no_grad()
def generate_with_B(job_desc, k=5, max_new_tokens=400, temperature=0.7, top_p=0.9):
    """Retrieve evidence with ``search_B`` and generate a candidate ranking with generator B.

    Args:
        job_desc: the job description used both as retrieval query and in the prompt.
        k: number of documents to retrieve.
        max_new_tokens: generation length limit.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        ``(answer, hits)`` where ``answer`` is the stripped generated text and ``hits``
        is the list of ``(position, score)`` tuples returned by ``search_B``.
    """
    hits = search_B(job_desc, k=k)
    idxs = [h[0] for h in hits]
    prompt = format_rag_prompt_B(job_desc, idxs)
    inputs = tok_B(prompt, return_tensors="pt").to(gen_B.device)
    out = gen_B.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        eos_token_id=tok_B.eos_token_id,
        pad_token_id=tok_B.pad_token_id,
    )
    # The output sequence starts with the prompt tokens. Decode only what follows them,
    # rather than splitting the full text on "Answer:", which would cut the answer short
    # whenever the generated text itself contains that word followed by a colon.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tok_B.decode(out[0][prompt_len:], skip_special_tokens=True)
    return answer.strip(), hits


# Evaluación de recuperación: Precision@k y nDCG@k (queries tomadas de prompts de validación)


In [None]:
from sklearn.metrics import ndcg_score
import numpy as np

# Build the queries: each validation "Prompt" (job description) is a query, and the
# matching "Output" is assumed to be its relevant document.
eval_df = valid_df.assign(query=valid_df["Prompt"].astype(str))

def build_relevance_vector(hit_indices, gold_index, k):
    """Return a length-``k`` binary relevance vector for a ranked hit list.

    Position ``i`` is 1.0 when the i-th hit equals ``gold_index`` and 0.0 otherwise;
    only the first ``k`` hits are considered and unused trailing positions stay 0.0.
    """
    flags = [1.0 if hit == gold_index else 0.0 for hit in hit_indices[:k]]
    relevance = np.zeros(k)
    relevance[: len(flags)] = flags
    return relevance

def evaluate_retrieval(search_fn, name, n_cases=50, k=5):
    """Score a retriever with precision@k and nDCG@k over the first ``n_cases`` eval queries.

    Each query is an ``eval_df["query"]`` value; its single relevant document is the
    corpus row whose "cover_letter" equals the (stripped) ``eval_df["Output"]`` text.
    A query whose gold letter is not present in the corpus scores 0 on both metrics.

    Args:
        search_fn: callable invoked as ``search_fn(query, k=k)`` that returns
            ``(position, score)`` tuples ordered best-first.
        name: label stored under the "name" key of the result.
        n_cases: maximum number of evaluation rows to use.
        k: rank cutoff for both metrics.

    Returns:
        dict with keys "name", "precision@<k>" and "ndcg@<k>" (means over the queries).
    """
    qs = eval_df["query"].tolist()[:n_cases]
    # Corpus cover letters are stored stripped, so strip the gold text as well;
    # otherwise surrounding whitespace alone makes the exact match fail.
    gold_texts = [str(text).strip() for text in eval_df["Output"].tolist()[:n_cases]]
    # cover letter text -> row position (for duplicated letters the last position wins).
    gold_map = {text: pos for pos, text in enumerate(corpus["cover_letter"])}
    # Rank discount 1 / log2(rank + 1) for ranks 1..k.
    discounts = 1.0 / np.log2(np.arange(2, k + 2))
    precs, ndcgs = [], []
    for q, gold in zip(qs, gold_texts):
        hits = search_fn(q, k=k)
        idxs = [h[0] for h in hits]
        gold_idx = gold_map.get(gold)
        rel = build_relevance_vector(idxs, gold_idx, k)
        precs.append(rel.sum() / k)
        # DCG of the actual ranking. With one relevant document the ideal DCG is 1.0
        # (gold at rank 1), so nDCG equals DCG. Scoring rel against itself, as before,
        # ignores the rank and yields 1.0 for any hit inside the top k.
        ndcgs.append(float((rel * discounts).sum()))
    return {"name": name, "precision@{}".format(k): float(np.mean(precs)), "ndcg@{}".format(k): float(np.mean(ndcgs))}

# Pass the search functions directly: evaluate_retrieval calls search_fn(q, k=k), and a
# one-argument lambda rejects the k keyword with a TypeError.
ret_A = evaluate_retrieval(search_A, "E5 + FAISS", n_cases=50, k=5)
ret_B = evaluate_retrieval(search_B, "BGE-large + FAISS", n_cases=50, k=5)
ret_A, ret_B
# Note: each query has a single gold document, so precision@k is at most 1/k.

# Muestreo cualitativo: ver grounding y evidencia citada

In [None]:
# Two hand-written job descriptions used to eyeball grounding and cited evidence.
samples = [
    "Senior Data Scientist with NLP focus, experience in transformer-based models, Python, and cloud (AWS/GCP).",
    "Frontend Engineer (React/TypeScript) with UX emphasis, accessibility, and design systems."
]

# For every sample, run pipeline A and then pipeline B and keep the answer plus retrieved hits.
rows = []
for job_query in samples:
    answer_A, evidence_A = generate_with_A(job_query, k=5)
    answer_B, evidence_B = generate_with_B(job_query, k=5)
    rows.append(
        dict(
            query=job_query,
            A_pred=answer_A,
            A_hits=evidence_A,
            B_pred=answer_B,
            B_hits=evidence_B,
        )
    )
pd.DataFrame(rows)