In [None]:
# ─── Imports & Setup ────────────────────────────────────────────────────
import os
import sys

import pandas as pd
from langchain.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from transformers import AutoTokenizer, pipeline

# for progress bar (optional)
from tqdm import tqdm

In [2]:
# ─── Configuration ─────────────────────────────────────────────────────
# Repository root. The default is the original author's checkout, so existing
# runs resolve to exactly the same files; set the SMARTCANDIDATE_ROOT
# environment variable to run the notebook from any other location.
PROJECT_ROOT    = os.environ.get(
    "SMARTCANDIDATE_ROOT",
    "/Users/basusmac/Desktop/Github Repositories/SmartCandidate-Analyzer-RAG-Based-Resume-Screening",
)
MAIN_DATA_DIR   = os.path.join(PROJECT_ROOT, "data", "main-data")

# Every input/output location is derived from PROJECT_ROOT.
DATA_PATH       = os.path.join(MAIN_DATA_DIR, "synthetic-resumes.csv")
FAISS_PATH      = os.path.join(PROJECT_ROOT, "vectorstore")
TESTSET_PATH    = os.path.join(MAIN_DATA_DIR, "test-sets", "testset-1.csv")
OUTPUT_PATH     = os.path.join(MAIN_DATA_DIR, "gpt4-ragfusion", "test-results", "testres-1.csv")

# Number of hits requested from the vector store per query.
RAG_K_THRESHOLD = 10
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# our new local generator
HF_MODEL_NAME   = "gpt2"            # ← swap for any HF‑hub model you prefer
GEN_MAX_LENGTH  = 512
GEN_TEMPERATURE = 0.7

In [3]:
# ─── Load Data & Build ID → Resume Map ─────────────────────────────────
# Resume corpus. IDs are cast to strings because the retrieval helpers key
# their results by str(metadata["ID"]) and look resumes up by that same key.
documents = pd.read_csv(DATA_PATH).astype({"ID": str})
id_resume_dict = {
    resume_id: resume_text
    for resume_id, resume_text in zip(documents["ID"], documents["Resume"])
}

# Evaluation set: one job description per row with its expected answer.
test_df = pd.read_csv(TESTSET_PATH)
question_list = test_df["Job Description"].tolist()
ground_truth = test_df["Ground Truth"].tolist()

In [7]:
# ─── Initialize Embeddings & FAISS ─────────────────────────────────────
# Embedding model used to encode queries; explicitly placed on the CPU.
embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={"device": "cpu"})

# Vector store loaded from the index previously saved under FAISS_PATH.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored docstore (per the LangChain FAISS docs) — only point FAISS_PATH at an
# index you created yourself or otherwise trust.
vectorstore_db = FAISS.load_local(
    folder_path=FAISS_PATH,
    embeddings=embedder,
    allow_dangerous_deserialization=True,
    distance_strategy=DistanceStrategy.COSINE,
)

In [11]:
# ─── Initialize Local HF LLM ────────────────────────────────────────────
# Load the tokenizer on its own first: its EOS id is needed as the pad token
# below, and getting it this way avoids building a throw-away pipeline (and
# loading the model a second time) just to read `generator.tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)

generator = pipeline(
    "text-generation",
    model=HF_MODEL_NAME,
    tokenizer=tokenizer,
    # generate up to 256 new tokens (you can bump this if you want longer answers).
    # GEN_MAX_LENGTH is deliberately not passed: mixing max_length with
    # max_new_tokens is ambiguous, and max_new_tokens is the limit that was
    # actually in effect for the pipeline handed to LangChain.
    max_new_tokens=256,
    # if your prompt is longer than the model's context window, truncate it
    truncation=True,
    # ensure we have a pad token (GPT‑2 uses the EOS token for padding)
    pad_token_id=tokenizer.eos_token_id,
    temperature=GEN_TEMPERATURE,
    # device=0               # uncomment if you have a GPU
)

# LangChain wrapper so the helpers below can treat the pipeline as an LLM.
llm = HuggingFacePipeline(pipeline=generator)

Device set to use mps:0
Device set to use mps:0


In [18]:
# ─── RAG Helper Functions ────────────────────────────────────────────────
def generate_subquestions(llm, question: str, max_subqs=4) -> list[str]:
    """Ask the LLM to split a job description into targeted sub-queries.

    Args:
        llm: Text generator. Either an object exposing ``invoke(prompt) -> str``
            (preferred) or a plain callable ``llm(prompt) -> str``.
        question: The job description to decompose.
        max_subqs: Number of sub-queries requested in the prompt and the
            maximum number returned.

    Returns:
        The non-blank lines of the model output, each stripped of surrounding
        whitespace, truncated to at most ``max_subqs`` entries. Empty when the
        model returns only whitespace.
    """
    prompt = f"""
You are an expert in talent acquisition. Split this job description into {max_subqs} targeted sub-queries,
each on its own line. Only use info from the original; don’t make up new requirements.

Job Description:
{question}

Sub-queries:
"""
    # Prefer the `invoke` entry point; fall back to calling the object directly
    # so simple callables (and older wrappers) keep working.
    run = getattr(llm, "invoke", None)
    raw = run(prompt) if callable(run) else llm(prompt)

    stripped = (line.strip() for line in raw.splitlines())
    subqueries = [line for line in stripped if line]

    # The model is free to ignore the requested count; cap the list so each
    # question triggers a bounded number of vector-store searches.
    return subqueries[: max(max_subqs, 0)]

def reciprocal_rank_fusion(ranks: list[dict[str, float]], k=50) -> dict[str, float]:
    """Fuse several ranked result lists into one ranking.

    Each input dict is read as a ranking: its insertion order is the rank
    order, and its values are not used. A document at zero-based position
    ``p`` of a list contributes ``1 / (p + k)``; contributions are summed
    per document across all lists.

    Args:
        ranks: One dict per query, keyed by document ID in rank order.
        k: Offset added to the position in the denominator.

    Returns:
        Document ID → fused score, ordered from highest to lowest score.
    """
    totals = {}
    for ranking in ranks:
        for position, doc_id in enumerate(ranking):
            totals[doc_id] = totals.get(doc_id, 0.0) + 1.0 / (position + k)

    ordered = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ordered)

def retrieve_id_and_rerank(queries: list[str], top_k=RAG_K_THRESHOLD) -> dict[str, float]:
    """Search the vector store once per query and fuse the per-query rankings.

    Args:
        queries: Query strings to search for.
        top_k: Number of hits requested from the vector store for each query.

    Returns:
        The mapping produced by ``reciprocal_rank_fusion`` over the per-query
        results (document ID → fused score).
    """
    def _ranked_ids(query: str) -> dict[str, float]:
        # Key each hit by its stringified metadata "ID", in the order the
        # vector store returned the hits.
        hits = vectorstore_db.similarity_search_with_score(query, k=top_k)
        return {str(doc.metadata["ID"]): score for doc, score in hits}

    return reciprocal_rank_fusion([_ranked_ids(query) for query in queries])

def retrieve_documents_with_id(id_scores: dict[str, float], threshold=5) -> list[str]:
    """Render the highest-scoring resumes as text blocks for the prompt.

    Args:
        id_scores: Applicant ID → score; higher scores rank first.
        threshold: Maximum number of resumes to return.

    Returns:
        One string per selected applicant, formatted as an
        ``Applicant ID <id>`` line followed by the resume text taken from
        ``id_resume_dict``.

    Raises:
        KeyError: If a selected ID is missing from ``id_resume_dict``.
    """
    by_score = sorted(id_scores.items(), key=lambda item: item[1], reverse=True)

    rendered = []
    for applicant_id, _score in by_score[:threshold]:
        rendered.append(f"Applicant ID {applicant_id}\n{id_resume_dict[applicant_id]}")
    return rendered

def generate_response(llm, question: str, docs: list[str]) -> str:
    """Ask the LLM to choose the best candidate from the retrieved resumes.

    Args:
        llm: Text generator. Either an object exposing ``invoke(prompt) -> str``
            (preferred) or a plain callable ``llm(prompt) -> str``.
        question: The job description being screened for.
        docs: Resume text blocks; they are joined with a blank line between
            them and placed in the prompt's context section.

    Returns:
        The model output with surrounding whitespace removed.
    """
    context = "\n\n".join(docs)
    prompt = f"""
You are an expert in talent acquisition helping pick the best candidate. Use only Applicant IDs to refer to resumes.

Context:
{context}

Question:
{question}

Answer with your selection and reasoning:
"""
    # Prefer the `invoke` entry point; fall back to calling the object directly
    # so simple callables (and older wrappers) keep working.
    run = getattr(llm, "invoke", None)
    answer = run(prompt) if callable(run) else llm(prompt)
    return answer.strip()

In [19]:
# ─── Run the Full RAG Pipeline ──────────────────────────────────────────
results = []
# Pair every question with its ground truth by position. Looking the position
# up with `question_list.index(q)` returns the FIRST match, so a job
# description that occurs more than once would be given the wrong label, and
# the list would be rescanned on every iteration.
for q, truth in tqdm(
    zip(question_list, ground_truth),
    total=len(question_list),
    desc="Running RAG",
):
    # 1) build sub‑queries
    subs = generate_subquestions(llm, q)
    # 2) retrieve & fusion (the original question is searched alongside its sub-queries)
    id_scores = retrieve_id_and_rerank([q] + subs)
    docs      = retrieve_documents_with_id(id_scores)
    # 3) final answer
    ans       = generate_response(llm, q, docs)

    results.append({
        "question":     q,
        "ground_truth": truth,
        "answer":       ans,
        "contexts":     "===\n".join(docs)
    })

Running RAG:   0%|          | 0/100 [00:00<?, ?it/s]This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (1024). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Running RAG: 100%|██████████| 100/100 [13:44<00:00,  8.24s/it]


In [20]:
# ─── Save to CSV ─────────────────────────────────────────────────────────
out_df = pd.DataFrame(results)

# Create the results folder first so that a missing directory cannot throw
# away the whole generation run at the very last step.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

out_df.to_csv(OUTPUT_PATH, index=False)
print("✅ Finished! Results written to", OUTPUT_PATH)

✅ Finished! Results written to /Users/basusmac/Desktop/Github Repositories/SmartCandidate-Analyzer-RAG-Based-Resume-Screening/data/main-data/gpt4-ragfusion/test-results/testres-1.csv
