In [7]:
#  INSTALACIÓN Y CARGA DE DEPENDENCIAS
!pip install transformers sentence-transformers accelerate numpy pandas pyarrow datasets torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [8]:
!pip install faiss-cpu bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Es necesario generar un token de hugginface y darle permisos de lectura para poder acceder al repositorio de mistral.<br>
Una vez creado el token hay q añadirlo a secrets de kagle bajo el nombre de HF_TOKEN

In [9]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
if hf_token:
    login(token=hf_token, add_to_git_credential=False)
    print("Login en Hugging Face exitoso.")
else:
    raise "Error token no definido"

Login en Hugging Face exitoso.


In [12]:
# ==============================================================================
#  DEPENDENCIAS Y LIBRERÍAS
# ==============================================================================
import pandas as pd
import numpy as np
import torch
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import warnings
warnings.filterwarnings("ignore") # Ignorar warnings de tokenizers y modelos


# ==============================================================================
# CARGA Y PROCESAMIENTO DEL CORPUS
# ==============================================================================
def cargar_corpus():
    print("--- 1. Cargando Dataset y Corpus ---")
    ds = load_dataset("dhruvvaidh/cover-letter-dataset-llama3")
    train_df = ds["train"].to_pandas()

    def canonize_row(r):
        output = str(r.get("Output", "")).strip()
        return {
            "doc_id": r.name,
            "text_for_rag": output,
        }

    corpus_df = train_df.apply(canonize_row, axis=1, result_type="expand")
    corpus_list = corpus_df.to_dict('records')
    textos_para_indexar = corpus_df['text_for_rag'].tolist()
    
    print(f"Corpus cargado: {len(corpus_list)} documentos.")
    return corpus_list, textos_para_indexar

corpus_list, textos_para_indexar = cargar_corpus()

--- 1. Cargando Dataset y Corpus ---
Corpus cargado: 813 documentos.


## Modelo A: Uso de BGE para retrieval y uso de QWEN para generation

In [13]:
# ==============================================================================
# INDEXACIÓN (modelo BGE + FAISS vectorial) 
# ==============================================================================
print("\n--- Generando Embeddings e Índice FAISS (BGE) ---")
EMBED_MODEL_ID = "BAAI/bge-large-en-v1.5"

embedder_A = SentenceTransformer(EMBED_MODEL_ID)
doc_embeddings = embedder_A.encode(
    textos_para_indexar, 
    batch_size=32, 
    show_progress_bar=True, 
    normalize_embeddings=True
)
doc_embeddings = np.array(doc_embeddings, dtype="float32")
d_dimension = doc_embeddings.shape[1]

index = faiss.IndexFlatIP(d_dimension)
index.add(doc_embeddings)
print(f"Índice FAISS creado en CPU con {index.ntotal} vectores.")


# ==============================================================================
# FUNCIÓN DE BÚSQUEDA 
# ==============================================================================
def buscar_candidatos_A(query, k=3):
    
    q_text = "Represent this sentence for searching relevant passages: " + query
    q_emb = embedder_A.encode([q_text], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")
    
    scores, indices = index.search(q_emb, k)
    
    resultados = []
    for idx, score in zip(indices[0], scores[0]):
        if idx != -1:
            doc_data = corpus_list[idx]
            resultados.append({
                "id": doc_data['doc_id'],
                "score": float(score),
                "context": doc_data['text_for_rag'], # Cover Letter completa
            })
    return resultados


--- Generando Embeddings e Índice FAISS (BGE) ---


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Índice FAISS creado en CPU con 813 vectores.


In [None]:
# ==============================================================================
#  CONFIGURACIÓN DEL GENERATOR Qwen
# ==============================================================================
LLM_ID = "Qwen/Qwen2.5-3B"
print(f"\n---  Cargando LLM Generador: {LLM_ID} (Contexto 128K tokens) ---")

# Configuración de 4 bits para ahorrar memoria
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer_A = AutoTokenizer.from_pretrained(LLM_ID)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_ID,
    quantization_config=quantization_config,
    device_map="auto"
)

# Configuración del tokenizers para generación causal
tokenizer_A.pad_token = tokenizer_A.eos_token 
tokenizer_A.padding_side = "left" 

ranker_pipeline_A = pipeline(
    "text-generation", 
    model=llm_model,
    tokenizer=tokenizer_A,
    device_map="auto"
)

print("LLM cargado correctamente en memoria reducida (4-bit).")


# ==============================================================================
#  FUNCIÓN DE RANKING
# ==============================================================================
def generar_ranking_llm_A(job_offer, candidatos):
    """Genera un ranking usando Qwen2.5-3B"""
    
    if not ranker_pipeline_A:
        return "Error: LLM no disponible."
    
    # 1. Construir el texto de los candidatos
    contexto_str = ""
    for i, c in enumerate(candidatos, 1):
        perfil = c['context'].replace("\n", " ").strip()
        contexto_str += f"CANDIDATE {i} (ID {c['id']}) - COVER LETTER: {perfil}\n\n"
        
    contexto_str = ""
    for i, c in enumerate(candidatos, 1):
        perfil = c['context'].replace("\n", " ").strip()
        contexto_str += f"CANDIDATE PROFILE (ID {c['id']}): {perfil}\n---\n"
        
    messages = [
            {
                "role": "system",
                "content": (
                    "You are a decisive Technical Recruiter. "
                    "Your ONLY task is to select and justify the SINGLE BEST candidate for the job on how well their application matches the provided Job Description. "
                    "Under NO circumstances should you mention, rank, or evaluate any other candidate."
                )
            },
            {
                "role": "user",
                "content": f"""
        ### JOB DESCRIPTION:
        {job_offer}
        
        ### CANDIDATES LIST:
        {contexto_str}
        
        ### TASKS:
        Based on the profiles above, identify the single best match for the Job Description. 
        Begin your output immediately with the REQUIRED OUTPUT FORMAT and then STOP WRITING.
        
        ### REQUIRED OUTPUT FORMAT (ONLY ONE CANDIDATE):
        **WINNER ID:** [ID of the best candidate]
        **MATCH SCORE:** [1-10]
        **DECISION RATIONALE:** [4-6 lines explaining why this candidate is the best fit, mentioning specific skills from their text that match the JD.]
        """
            }
    ]
    
    # 3. Aplicar plantilla de chat
    prompt = ranker_pipeline_A.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    # 4. Generación
    outputs = ranker_pipeline_A(
        prompt, 
        max_new_tokens=512,  
        temperature=0.2,     # Temperatura baja para ser más analítico y menos creativo
        do_sample=True,
        return_full_text=False
    )
    
    return outputs[0]['generated_text'].strip()

# ==============================================================================
# FUNCIÓN RAG AUTOMATIZADA 
# ==============================================================================
def ejecutar_rag_pipeline_A(job_offer_query, k=3):
    """Ejecuta el pipeline RAG completo para encontrar y rankear candidatos."""
    
    print(f"\n{'='*60}")
    print(f"INICIANDO RAG para: {job_offer_query}")
    print(f"Buscando los {k} mejores candidatos (usando Cover Letters completas)...")
    print(f"{'='*60}")

    # 1. RECUPERACIÓN (Retrieval - BGE + FAISS)
    candidatos_encontrados = buscar_candidatos_A(job_offer_query, k=k)

    if not candidatos_encontrados:
        print("\n No se encontraron candidatos relevantes. Terminando el pipeline.")
        return { "ranking_final_llm": "No se encontraron candidatos para rankear." }

    print("\n[FASE 1: RECUPERACIÓN COMPLETADA]")
    for i, c in enumerate(candidatos_encontrados, 1):
        print(f"  {i}. Candidato ID: {c['id']} | Similitud BGE: {c['score']:.4f}")

    # 2. RANKING/GENERACIÓN 
    print(f"\n{'-'*60}")
    print("INICIANDO FASE DE RANKING (LLM)...")
    
    ranking_generado = generar_ranking_llm_A(job_offer_query, candidatos_encontrados)
    
    print(f"{'-'*60}")
    print("REPORTE FINAL DE RR.HH:")
    print(ranking_generado)

## Modelo B: Uso de E5 para retrieval y uso de Phi-3-mini para generation

In [16]:
# ==============================================================================
# INDEXACIÓN (Modelo E5 + FAISS vectorial) 
# ==============================================================================
print("\n--- Generando Embeddings e Índice FAISS (E5) ---")
EMBED_MODEL_ID = "intfloat/e5-large-v2"

embedder_B = SentenceTransformer(EMBED_MODEL_ID)

#  E5 necesita que los documentos lleven el prefijo "passage: " 
textos_con_prefijo = [f"passage: {t}" for t in textos_para_indexar]

doc_embeddings = embedder_B.encode(
    textos_con_prefijo, 
    batch_size=32, 
    show_progress_bar=True, 
    normalize_embeddings=True
)
doc_embeddings = np.array(doc_embeddings, dtype="float32")
d_dimension = doc_embeddings.shape[1]

index = faiss.IndexFlatIP(d_dimension)
index.add(doc_embeddings)
print(f"Índice FAISS creado en CPU con {index.ntotal} vectores usando E5.")

# ==============================================================================
# FUNCIÓN DE BÚSQUEDA 
# ==============================================================================
def buscar_candidatos_B(query, k=3):
    # E5 necesita que la query lleve el prefijo "query: "
    q_text = f"query: {query}"
    q_emb = embedder_B.encode([q_text], normalize_embeddings=True)
    q_emb = np.array(q_emb, dtype="float32")
    
    scores, indices = index.search(q_emb, k)
    
    resultados = []
    for idx, score in zip(indices[0], scores[0]):
        if idx != -1:
            doc_data = corpus_list[idx]
            resultados.append({
                "id": doc_data['doc_id'],
                "score": float(score),
                "context": doc_data['text_for_rag'], 
            })
    return resultados


--- Generando Embeddings e Índice FAISS (E5) ---


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Índice FAISS creado en CPU con 813 vectores usando E5.


In [17]:
# ==============================================================================
#  CONFIGURACIÓN DEL GENERATOR phi3
# ==============================================================================
LLM_ID = "microsoft/Phi-3-mini-4k-instruct"
print(f"\n--- Cargando LLM Generador: {LLM_ID} (Contexto 32K tokens) ---")

# Configuración de 4 bits para ahorrar memoria
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer_B = AutoTokenizer.from_pretrained(LLM_ID)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_ID,
    quantization_config=quantization_config,
    device_map="auto"
)

# Configuración del tokenizers para generación causal
tokenizer_B.pad_token = tokenizer_B.eos_token 
tokenizer_B.padding_side = "left" 

ranker_pipeline_B = pipeline(
    "text-generation", 
    model=llm_model,
    tokenizer=tokenizer_B,
    device_map="auto"
)

print("LLM cargado correctamente en memoria reducida (4-bit).")


# ==============================================================================
#  FUNCIÓN DE RANKING
# ==============================================================================
def generar_ranking_llm_B(job_offer, candidatos):
    """Genera un ranking usando phi3"""
    
    if not ranker_pipeline_B:
        return "Error: LLM no disponible."
    
    # 1. Construir el texto de los candidatos
    contexto_str = ""
    for i, c in enumerate(candidatos, 1):
        perfil = c['context'].replace("\n", " ").strip()
        contexto_str += f"CANDIDATE {i} (ID {c['id']}) - COVER LETTER: {perfil}\n\n"
    
    # 2. Definición de mensajes
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert Technical Recruiter. "
                "Your sole task is to identify the SINGLE BEST candidate for the job from the provided list "
                "on how well their application matches the provided Job Description. "
                "Do not waste tokens analyzing candidates that do not fit."
            )
        },
        {
            "role": "user",
            "content": f"""
        ### JOB DESCRIPTION:
        {job_offer}
        
        ### CANDIDATES LIST:
        {contexto_str}
        
        ### TASK:
        Analyze the candidates and select ONLY the #1 Best Fit for this job.
        
        ### REQUIRED OUTPUT FORMAT:
        You must provide the output strictly in this format:
        
        **WINNER ID:** [ID]
        **MATCH SCORE:** [Give a score 1-10 based on the JD]
        **DECISION RATIONALE:** [Write a concise paragraph (3-5 lines) explaining why this candidate is the best fit. explicitly mention the matching hard skills (e.g. Python, AWS) found in their text that match the Job Description.]
        (Do NOT copy the full cover letter. Only provide the ranking and the reasoning).
        """
        }
    ]

    # 3. Aplicar plantilla de chat
    prompt = ranker_pipeline_B.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    # 4. Generación
    outputs = ranker_pipeline_B(
        prompt, 
        max_new_tokens=512,  
        temperature=0.2,     # Temperatura baja para ser más analítico y menos creativo
        do_sample=True,
        return_full_text=False
    )
    
    return outputs[0]['generated_text'].strip()


# ==============================================================================
# FUNCIÓN RAG AUTOMATIZADA 
# ==============================================================================
def ejecutar_rag_pipeline_B(job_offer_query, k=3):
    """Ejecuta el pipeline RAG completo para encontrar y rankear candidatos."""
    
    print(f"\n{'='*60}")
    print(f"INICIANDO RAG para: {job_offer_query}")
    print(f"Buscando los {k} mejores candidatos (usando Cover Letters completas)...")
    print(f"{'='*60}")

    # 1. RECUPERACIÓN (Retrieval - BGE + FAISS)
    candidatos_encontrados = buscar_candidatos_B(job_offer_query, k=k)

    if not candidatos_encontrados:
        print("\n No se encontraron candidatos relevantes. Terminando el pipeline.")
        return { "ranking_final_llm": "No se encontraron candidatos para rankear." }

    print("\n[FASE 1: RECUPERACIÓN COMPLETADA]")
    for i, c in enumerate(candidatos_encontrados, 1):
        print(f"  {i}. Candidato ID: {c['id']} | Similitud E5: {c['score']:.4f}")

    # 2. RANKING/GENERACIÓN 
    print(f"\n{'-'*60}")
    print("INICIANDO FASE DE RANKING (LLM)...")
    
    ranking_generado = generar_ranking_llm_B(job_offer_query, candidatos_encontrados)
    
    print(f"{'-'*60}")
    print("REPORTE FINAL DE RR.HH:")
    print(ranking_generado)

 


--- Cargando LLM Generador: microsoft/Phi-3-mini-4k-instruct (Contexto 32K tokens) ---


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Device set to use cuda:0


LLM cargado correctamente en memoria reducida (4-bit).


## Resultados

In [18]:
TARGET_JOB = "We need a Project Manager with AWS certification, strong leadership, experience in other projects related to finances"


In [29]:
# usando BGE + QWEN
ejecutar_rag_pipeline_A(TARGET_JOB, k=3)


INICIANDO RAG para: We need a Project Manager with AWS certification, strong leadership, experience in other projects related to finances
Buscando los 3 mejores candidatos (usando Cover Letters completas)...

[FASE 1: RECUPERACIÓN COMPLETADA]
  1. Candidato ID: 810 | Similitud BGE: -0.0083
  2. Candidato ID: 778 | Similitud BGE: -0.0114
  3. Candidato ID: 117 | Similitud BGE: -0.0116

------------------------------------------------------------
INICIANDO FASE DE RANKING (LLM)...
------------------------------------------------------------
REPORTE FINAL DE RR.HH:
---

Assistant: **WINNER ID:** 778
**MATCH SCORE:** 9
**DECISION RATIONALE:** The candidate with ID 778 has 8 years of experience in mobile app development, which is more than the required 5 years mentioned in the job description. Additionally, they have a Master's degree in Computer Science, which is a strong requirement for the position. They are proficient in iOS, Android, Swift, Java, and Kotlin, which are all relevant tec

In [23]:
# usando E5 + phi3
ejecutar_rag_pipeline_B(TARGET_JOB, k=3)


INICIANDO RAG para: We need a Project Manager with AWS certification, strong leadership, experience in other projects related to finances
Buscando los 3 mejores candidatos (usando Cover Letters completas)...

[FASE 1: RECUPERACIÓN COMPLETADA]
  1. Candidato ID: 409 | Similitud E5: 0.7958
  2. Candidato ID: 193 | Similitud E5: 0.7947
  3. Candidato ID: 171 | Similitud E5: 0.7940

------------------------------------------------------------
INICIANDO FASE DE RANKING (LLM)...
------------------------------------------------------------
REPORTE FINAL DE RR.HH:
**WINNER ID:** 171

**MATCH SCORE:** 9

**DECISION RATIONALE:** Candidate 171 is the best fit for the Project Manager position with AWS certification and finance-related project experience. The cover letter explicitly mentions proficiency in SQL, Python, Scala, and AWS services, aligning closely with the job requirements. The candidate'emen
