### **Objective**:
*Build a RAG-powered system that retrieves and ranks the most relevant resumes for a job role using semantic search and reranking, enabling faster and smarter candidate screening.*



In [1]:
# Install required packages (run this cell if not already installed)
!pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-cpu openpyxl datasets pypdf langchain-community langchain-huggingface ragatouille
!pip install flashrank # rerank 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.0/68.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [44]:
import numpy as np
import os
import shutil
import pandas as pd

In [3]:
from tqdm.notebook import tqdm
from typing import Optional, List, Tuple
from datasets import Dataset
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

In [26]:


def load_resumes(data_dir: str, csv_path: str = None) -> list:
    """Load resumes from directory and optional CSV metadata."""
    # Load CSV if provided (single line with error handling)
    df = pd.read_csv(csv_path) if csv_path and os.path.exists(csv_path) else None
    
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Directory not found: {data_dir}")
    
    documents = []
    for category in os.listdir(data_dir):
        category_path = os.path.join(data_dir, category)
        if not os.path.isdir(category_path):
            continue
            
        # Load PDFs (removed progress bar for speed)
        loader = DirectoryLoader(category_path, glob="*.pdf", loader_cls=PyPDFLoader)
        docs = loader.load()
        
        for doc in docs:
            # Simplified metadata extraction
            doc.metadata.update({
                "category": category,
                "file_name": os.path.basename(doc.metadata["source"]),
                "id": os.path.splitext(doc.metadata["source"].split('/')[-1])[0]
            })
            
            # Optional CSV merge (one-liner)
            if df is not None:
                if match := df[df["ID"] == doc.metadata["id"]].to_dict('records'):
                    doc.metadata.update(match[0])
        
        documents.extend(docs)
    print(documents[0])
    return documents

# Usage (2 lines)
DATA_DIR = "/kaggle/input/resume-dataset/data/data"
docs = load_resumes(DATA_DIR, "/kaggle/input/resume-dataset/Resume/Resume.csv")

page_content='PRE-PRESS GRAPHIC DESIGNER
Summary
Creative, hardworking designer seeking a full-time desktop job, educated as a graphic artist, past experience in business world as a desktop
publisher laying out designs for printed mail and advertisements, in local government designing new websites with graphics for different agencies
within the system, and later for the same government printing and reproduction center creating documents to be printed off a press or copiers.
Skills
Adobe InDesign, Photoshop, Illustrator, and Acrobat Professional
Strongly familiar with Microsoft Word, Excel, PowerPoint, and Publisher / also QuarkXPress
Basic knowledge of web development with Adobe Dreamweaver, HTML, WordPress
Able to perform graphic design and administrative functions
Able to work as a team player and independently
Experienced using phone, fax, email, copiers and printers
Provides excellent customer service (in-person, by phone, email, or interoffice mail)
Prioritizes and calmly handles 

In [27]:
# to create a raw k_b to feed into text spillter
from langchain.docstore.document import Document as LangchainDocument

RAW_KNOWLEDGE_BASE = [
    LangchainDocument(
        page_content=doc.page_content,
        metadata={
            "Category": doc.metadata["category"],
            "filename": doc.metadata["file_name"],
            "id" :doc.metadata["id"]# If available
        }
    ) for doc in docs
]

In [28]:
# splitting raw d_b to feed into embeddoing model so we can create vector db
from langchain.text_splitter import RecursiveCharacterTextSplitter

# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # The maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=100,  # The number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
)

docs_processed = [] # this splitts the charectters into chuks and store
for doc in RAW_KNOWLEDGE_BASE:
    docs_processed += text_splitter.split_documents([doc])

In [29]:
#choosng andloadng emeddng model
from sentence_transformers import SentenceTransformer

# To get the value of the max sequence_length, we will query the underlying `SentenceTransformer` object used in the RecursiveCharacterTextSplitter
print(f"Model's maximum sequence length: {SentenceTransformer('thenlper/gte-small').max_seq_length}")

Model's maximum sequence length: 512


In [30]:
from transformers import AutoTokenizer# model toknze the splited data

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small") # here chuks get tokenized for vectors storage
lengths = [len(tokenizer.encode(doc.page_content)) for doc in tqdm(docs_processed)]

  0%|          | 0/18499 [00:00<?, ?it/s]

In [31]:
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [32]:
from langchain.vectorstores import FAISS # stores vector databse
from langchain_community.embeddings import HuggingFaceEmbeddings # fuction chain to perform embeddngs
from langchain_community.vectorstores.utils import DistanceStrategy # for retriver to identify smlar docs

In [33]:
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

In [34]:
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)
# here we hve xomplete vector knoelgde base to perform query to retrive docs or perform llm to
#generte ansrs

KNOWLEDGE_VECTOR_DATABASE # high dimension matrx 

2025-07-10 10:08:14.878352: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752142094.900444     213 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752142094.907019     213 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Chunks:   0%|          | 0/10 [00:00<?, ?it/s]

<langchain_community.vectorstores.faiss.FAISS at 0x7bbee4e921d0>

In [35]:
# its a doc retriver for vector db
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever(search_kwargs={"k": 30})

docs = retriever.invoke("""
  "Machine Learning Engineer" OR "Data Scientist" AND (
    ("Python" AND ("Scikit-learn" OR "TensorFlow" OR "PyTorch"))  
    ("SQL" AND ("Spark" OR "Hadoop" OR "ETL"))  
    ("AWS" OR "GCP" OR "Azure" OR "MLOps")  
    ("Tableau" OR "Power BI" OR "data visualization")  
    ("statistical analysis" OR "A/B testing")  
    ("NLP" OR "LLM" OR "GenAI" OR "recommendation systems")  
  )  
  NOT ("intern" OR "student")  
  Years: "3+ years"  
  Location: "Remote" OR "Mumbai" OR "Bangalore" OR "Pune"  
""")

# import pprint

# pprint.pp(docs) # retrved 30 docs to lter perform rerank

2025-07-10 10:09:09.680744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752142149.702751     233 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752142149.709488     233 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
# all corrected rearank code
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank
from flashrank import Ranker, RerankRequest  # Explicitly import RerankRequest

In [37]:
# Monkey patch the missing reference
import langchain_community.document_compressors.flashrank_rerank as flashrank_rerank

In [38]:
flashrank_rerank.RerankRequest = RerankRequest

# Initialize retriever
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever(search_kwargs={"k": 30})

# Initialize compressor
compressor = FlashrankRerank(top_n=5)

# Create compression retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever
)

# Run query
query =  """
  "Machine Learning Engineer" OR "Data Scientist" AND (
    ("Python" AND ("Scikit-learn" OR "TensorFlow" OR "PyTorch"))  
    ("SQL" AND ("Spark" OR "Hadoop" OR "ETL"))  
    ("AWS" OR "GCP" OR "Azure" OR "MLOps")  
    ("Tableau" OR "Power BI" OR "data visualization")  
    ("statistical analysis" OR "A/B testing")  
    ("NLP" OR "LLM" OR "GenAI" OR "recommendation systems")  
  )  
  NOT ("intern" OR "student")  
  Years: "3+ years"  
  Location: "Remote" OR "Mumbai" OR "Bangalore" OR "Pune"  
"""
compressed_docs = compression_retriever.invoke(query)

2025-07-10 10:09:35.059557: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752142175.081754     254 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752142175.088370     254 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [39]:
# Print results
print([doc.metadata.get("id") for doc in compressed_docs])



['50328713', '22946204', '17823436', '83816738', '18448085']


In [42]:
# Print results with more details
for doc in compressed_docs:
    print(doc)

page_content='ENGINEERING INTERN
Skills
C++, Python, MATLAB, Git, Bash, R, SQL (basic). Experienced in Linux/Unix and using high performance computing clusters.
Machine Learning Tools and Libraries: Scikit-learn, Pandas, Seaborn, matplotlib, TensorFlow (basic). (I built a XGBoost
model that has 77.5% accuracy in the Kaggle Titanic challenge.)
Computational Fluid Dynamics and Discrete Element Method Codes
CFD-DEM, OpenFOAM, CFD-ACE+Â®, FluentÂ®, COMSOLÂ®, LAMMPS, and LIGGGHTS.
Reservoir and Fracture Modeling Tools
CMGÂ® for reservoir simulation; FracProÂ® for fracture simulation and analysis; Saphir for pressure transient analysis.
Experimental and Statistical Methods
SEM, AFM, Confocal Microscopy, Regression analysis, Statistical process control, Design of experiments.
Experience
ENGINEERING INTERN
 
08/2016
 
ï¼​ 
12/2016
 
Company Name
 
State
Project: Develop a cavings transport model for optimizing hole-cleaning operations.' metadata={'id': '50328713', 'relevance_score': 0.9871056,

In [43]:


# Define the source directory containing the PDFs
DATA_DIR = "/kaggle/input/resume-dataset/data/data"

# List of file IDs from your output (replace with actual IDs from compressed_docs)
file_ids = [doc.metadata.get("id") for doc in compressed_docs]  # Example: ['id1', 'id2', 'id3', 'id4', 'id5']

# Define the output directory to save the downloaded PDFs
output_dir = "./downloaded_pdfs"
os.makedirs(output_dir, exist_ok=True)

# Function to find and copy PDF files
def download_pdfs(file_ids, data_dir, output_dir):
    for file_id in file_ids:
        # Search for the PDF file in the data directory
        for category in os.listdir(data_dir):
            category_path = os.path.join(data_dir, category)
            if not os.path.isdir(category_path):
                continue
            pdf_path = os.path.join(category_path, f"{file_id}.pdf")
            if os.path.exists(pdf_path):
                # Copy the PDF to the output directory
                output_path = os.path.join(output_dir, f"{file_id}.pdf")
                shutil.copy(pdf_path, output_path)
                print(f"Downloaded: {output_path}")
            else:
                print(f"File not found: {file_id}.pdf")

# Run the download function
download_pdfs(file_ids, DATA_DIR, output_dir)

File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
Downloaded: ./downloaded_pdfs/50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 50328713.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
Downloaded: ./downloaded_pdfs/22946204.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
File not found: 22946204.pdf
File not found: