In [2]:
import os
import fitz  # PyMuPDF
import re
from pathlib import Path
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

# --- Credentials -----------------------------------------------------------
# Load the API key from a .env file if present; otherwise ask for it
# interactively. The key is exported to the environment because the Google
# client libraries read it from there.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # getpass keeps the secret from being echoed into the terminal or the
    # notebook output, unlike input().
    from getpass import getpass

    GOOGLE_API_KEY = getpass("Enter your GOOGLE_API_KEY: ").strip()
if not GOOGLE_API_KEY:
    # Fail early instead of exporting an empty key and failing later
    # with a less obvious authentication error.
    raise ValueError("GOOGLE_API_KEY is required but was not provided.")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# --- Resume file selection -------------------------------------------------
# Paths pasted from a file manager or shell are often wrapped in quotes and
# may start with "~", so normalise both before touching the filesystem.
resume_path = input("Enter path to your resume file (PDF, DOCX, TXT): ").strip()
resume_path = resume_path.strip("\"'")
resume_file = Path(resume_path).expanduser()
if not resume_file.is_file():
    # is_file() also rejects directories, which exists() would let through.
    raise FileNotFoundError(f"File not found: {resume_path}")

# --- Text extraction -------------------------------------------------------
# Dispatch on the (case-insensitive) file extension.
suffix = resume_file.suffix.lower()
if suffix == ".pdf":
    with fitz.open(resume_file) as doc:
        # Join pages with a newline so the last word of one page cannot fuse
        # with the first word of the next once whitespace is collapsed.
        text = "\n".join(page.get_text("text") for page in doc)
elif suffix == ".docx":
    # Imported lazily so python-docx is only required for .docx input.
    from docx import Document

    doc = Document(str(resume_file))
    text = "\n".join(p.text for p in doc.paragraphs)
elif suffix == ".txt":
    # Undecodable bytes are dropped rather than aborting the whole run.
    text = resume_file.read_text(encoding="utf-8", errors="ignore")
else:
    raise ValueError("Unsupported file type. Supported: PDF, DOCX, TXT")

# --- Cleanup ---------------------------------------------------------------
# Collapse every run of whitespace (including newlines) into a single space.
text = re.sub(r"\s+", " ", text).strip()
if not text:
    # Typical for scanned / image-only PDFs: there is nothing to parse, so
    # do not report success.
    raise ValueError(
        f"No extractable text found in {resume_file.name}. "
        "If this is a scanned document, run OCR on it first."
    )
print("✅ Resume text extracted successfully.")


KeyboardInterrupt: 

In [None]:
import json


def _parse_llm_json(raw):
    """Parse the JSON object contained in an LLM reply.

    Chat models frequently wrap JSON in markdown code fences or add a short
    preamble, so everything outside the outermost ``{ ... }`` pair is
    discarded before decoding.

    Args:
        raw: The text returned by the model.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    cleaned = raw.strip()
    start = cleaned.find("{")
    end = cleaned.rfind("}")
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    return json.loads(cleaned)


# Embedding model
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Split resume into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_text(text)
if not chunks:
    # An index cannot be built from zero texts; fail with a clear message
    # instead of an error from deep inside the vector store.
    raise ValueError("Resume text is empty; nothing to index.")

# Vector DB (FAISS)
vectorstore = FAISS.from_texts(chunks, embedding=embeddings)
# k equals the number of chunks, so every chunk is retrieved and the whole
# resume is passed to the prompt.
retriever = vectorstore.as_retriever(search_kwargs={"k": len(chunks)})

# LLM model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# Prompt template: {context} receives the retrieved chunks, {question} the query.
prompt_template = """
You are a precise and structured resume parser designed to extract technical information accurately.
From the given resume context, extract the following details:
- Full name
- Contact information (phone number, email)
- GitHub profile link
- LinkedIn profile link
- Qualification and university
- Technical experience
- Projects
- Coursework
- Technical skills and tools
- Extracurricular or leadership experience
Context:
{context}
Question:
{question}
Return ONLY a valid JSON object with these keys:
name, contact_info, github_link, linkedin, qualification, university, experience, projects, coursework_keywords, skills_summary, extracurricular.

If a field doesn’t exist, set it to null.
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# RAG chain: "stuff" places all retrieved chunks into a single prompt.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# Query the chain
query = "Extract all details from this resume including name, qualification, university, contact info, GitHub, LinkedIn, projects, experience, and technical keywords."
result = chain.invoke({"query": query})

print("Extraction result:\n")
raw_answer = result["result"]
try:
    # The answer is text, not a dict: decode it first so that json.dumps
    # pretty-prints the object instead of emitting one escaped string literal.
    parsed = _parse_llm_json(raw_answer)
except json.JSONDecodeError:
    # Best effort: show the model's reply unchanged if it is not valid JSON.
    print(raw_answer)
else:
    # ensure_ascii=False keeps non-ASCII characters (e.g. names) readable.
    print(json.dumps(parsed, indent=4, ensure_ascii=False))
