In [2]:
# ===== Cell 1: Setup & Load All Resumes =====

import os
import fitz  # PyMuPDF
import re
from pathlib import Path
from dotenv import load_dotenv

# Load API key from .env if exists
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    GOOGLE_API_KEY = input("Enter your GOOGLE_API_KEY: ").strip()
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# Path to resumes folder
resumes_folder = Path("../resumes")  # Adjust relative path from scripts/
if not resumes_folder.exists():
    raise FileNotFoundError(f"Resumes folder not found: {resumes_folder}")

# Supported file types
supported_ext = [".pdf", ".docx", ".txt"]

# Gather all supported files
resume_files = [f for f in resumes_folder.glob("*.*") if f.suffix.lower() in supported_ext]
if not resume_files:
    raise FileNotFoundError("No supported resume files found in the folder.")

print(f" Found {len(resume_files)} resume(s):")
for f in resume_files:
    print(" -", f.name, "(", f.suffix.lower(), ")")

# Function to extract text from a single file 
def extract_text(file_path):
    text = ""
    ext = file_path.suffix.lower()
    
    if ext == ".pdf":
        try:
            with fitz.open(file_path) as doc:
                for page in doc:
                    text += page.get_text("text")
        except Exception as e:
            print(f"Could not read PDF {file_path.name}: {e}")
    elif ext == ".docx":
        try:
            from docx import Document
            doc = Document(file_path)
            text = "\n".join(p.text for p in doc.paragraphs)
        except Exception as e:
            print(f"Could not read DOCX {file_path.name}: {e}")
    elif ext == ".txt":
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except Exception as e:
            print(f"Could not read TXT {file_path.name}: {e}")
    else:
        print(f"Unsupported file type: {file_path.suffix}")
    
    # Clean text
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Extract text from all resumes 
resumes_texts = {f.name: extract_text(f) for f in resume_files}
print("\nResume text extraction complete.\n")
for name, text in resumes_texts.items():
    print(f"--- {name} ---")
    preview = text[:500] + ("..." if len(text) > 500 else "")
    print(preview)
    print("\n")



 Found 4 resume(s):
 - resume1.pdf ( .pdf )
 - resume2.pdf ( .pdf )
 - resume3.pdf ( .pdf )
 - resume4.pdf ( .pdf )

Resume text extraction complete.

--- resume1.pdf ---
Aneeba Waseem Backend Developer | Software Engineering Student aneeba.waseem1403@gmail.com +923184328087 aneeba-waseem-63962a25b Aneeba Waseem Lahore, Pakistan PROFILE Enthusiastic Software Engineering student with hands-on experience in .NET and SQL, eager to contribute to real- world backend development. Skilled in building efficient APIs and database solutions, with a strong commitment to writing clean, maintainable code and continuously evolving through practical learning. EDUCATION BS Softw...


--- resume2.pdf ---
Noor Fatima Software Engineering Student h.noorfatima7@gmail.com +92 323 4753925 Noor Fatima noor_fatima7 Noor Fatima 7_noorfatima Profile I am a passionate Software Engineering student with a strong interest in problem-solving and full-stack development. I have hands-on experience building projects an

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
import json

# Embedding model
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Split resume into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_text(text)

# Vector DB (FAISS)
vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": len(chunks)})

# LLM model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# Prompt template 
prompt_template = """
You are a precise and structured resume parser designed to extract technical information accurately.
From the given resume context, extract the following details:
- Full name
- Contact information (phone number, email)
- GitHub profile link
- LinkedIn profile link
- Qualification and university
- Technical experience
- Projects
- Coursework
- Technical skills and tools
- Extracurricular or leadership experience
Context:
{context}
Question:
{question}
Return ONLY a valid JSON object with these keys:
name, contact_info, github_link, linkedin, qualification, university, experience, projects, coursework_keywords, skills_summary, extracurricular.

If a field doesnâ€™t exist, set it to null.
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# RAG chain
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# Query the chain
query = "Extract all details from this resume including name, qualification, university, contact info, GitHub, LinkedIn, projects, experience, and technical keywords."
result = chain.invoke({"query": query})

import json
print("Extraction result:\n")
print(json.dumps(result["result"], indent=4))

  from .autonotebook import tqdm as notebook_tqdm


Extraction result:

"```json\n{\n  \"name\": \"CHARLES MCTURLAND\",\n  \"contact_info\": {\n    \"phone\": \"(123) 456-7890\",\n    \"email\": \"cmcturland@email.com\"\n  },\n  \"github_link\": null,\n  \"linkedin\": null,\n  \"qualification\": \"B.S. Computer Science\",\n  \"university\": \"University of Pittsburgh\",\n  \"experience\": [\n    {\n      \"title\": \"Software Engineer\",\n      \"company\": \"Embark\",\n      \"dates\": \"January 2015 - current\",\n      \"location\": \"New York, NY\",\n      \"description\": [\n        \"Worked with product managers to re-architect a multi-page web app into a single page web-app, boosting yearly revenue by $1.4M\",\n        \"Constructed the logic for a streamlined ad-serving platform that scaled to our 35M users, which improved the page speed by 15% after implementation\",\n        \"Tested software for bugs and operating speed, fixing bugs and documenting processes to increase efficiency by 18%\",\n        \"Iterated platform for col