In [None]:
import os
import zipfile
from typing import List, Optional

import pdfplumber
import docx
from pydantic import BaseModel, Field
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate


Pydantic Schema

In [None]:

# Contact details nested inside ResumeSchema.
# Both fields are optional strings and default to None when absent.
class ContactInfo(BaseModel):
    phone: Optional[str] = None
    email: Optional[str] = None

# One entry of ResumeSchema.experience.
# Every field is an optional string defaulting to None, so a partially
# filled entry still validates.
class ExperienceItem(BaseModel):
    title: Optional[str] = None
    organization: Optional[str] = None
    # Plain string: the extraction prompt asks for durations "as they appear".
    duration: Optional[str] = None
    description: Optional[str] = None

# One entry of ResumeSchema.projects.
# All fields are optional strings defaulting to None.
class ProjectItem(BaseModel):
    name: Optional[str] = None
    description: Optional[str] = None
    # Note: a single string, not a list (unlike the SkillsSummary fields).
    tech_stack: Optional[str] = None

# Skills grouped into three lists of strings, nested inside ResumeSchema.
# Each list defaults to empty, matching the prompt's instruction to
# "aggregate skills/frameworks/tools as lists".
class SkillsSummary(BaseModel):
    languages: List[str] = []
    frameworks_libraries: List[str] = []
    tools_platforms: List[str] = []

# One entry of ResumeSchema.extracurricular.
# Same shape as ExperienceItem except the first field is `role` instead of
# `title`; all fields are optional strings defaulting to None.
class ExtraCurricularItem(BaseModel):
    role: Optional[str] = None
    organization: Optional[str] = None
    duration: Optional[str] = None
    description: Optional[str] = None

# Top-level structure the LLM output is validated against.
# It is handed to PydanticOutputParser in build_parser_and_prompt(), and its
# JSON schema (ResumeSchema.schema_json()) is embedded in the extraction
# prompt by parse_resume_to_json().
# Every field has a default, so an empty JSON object validates: scalars
# default to None, lists to [], and nested models to empty instances.
class ResumeSchema(BaseModel):
    name: Optional[str] = None
    contact_info: ContactInfo = ContactInfo()
    github_link: Optional[str] = None
    linkedin: Optional[str] = None
    qualification: Optional[str] = None
    university: Optional[str] = None
    experience: List[ExperienceItem] = []
    projects: List[ProjectItem] = []
    coursework_keywords: List[str] = []
    skills_summary: SkillsSummary = SkillsSummary()
    extracurricular: List[ExtraCurricularItem] = []

Helpers to extract text from PDF / DOCX

In [None]:
def extract_text_from_pdf(path: str) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The page texts joined by ``"\\n"``. Pages for which ``extract_text()``
        returns a falsy value (``None`` or ``""``) are skipped, so the result
        is ``""`` when no page yields text.
    """
    with pdfplumber.open(path) as pdf:
        # Consume the generator inside the `with` so pages are read while the
        # document is still open.
        per_page = (page.extract_text() for page in pdf.pages)
        return "\n".join(chunk for chunk in per_page if chunk)

def extract_text_from_docx(path: str) -> str:
    """Extract the non-blank paragraphs of a DOCX file, joined with newlines.

    Only ``doc.paragraphs`` is read; nothing else in the document is visited.

    Args:
        path: Location of the document, passed to ``docx.Document``.

    Returns:
        The text of each paragraph whose text is non-empty after stripping
        whitespace, joined by ``"\\n"``. Paragraph text itself is not stripped.
    """
    document = docx.Document(path)
    kept = []
    for para in document.paragraphs:
        content = para.text
        # Drop empty and whitespace-only paragraphs but keep the rest verbatim.
        if content and content.strip():
            kept.append(content)
    return "\n".join(kept)

def load_document_text(path: str) -> str:
    """Return the plain text of a resume file, choosing an extractor by extension.

    Dispatch (case-insensitive on the extension):
      * ``.pdf``  -> ``extract_text_from_pdf``
      * ``.docx`` -> ``extract_text_from_docx``
      * ``.doc``  -> ``extract_text_from_docx`` only if the file is actually a
        zip-based package (i.e. a mis-named .docx); otherwise rejected
      * anything else -> read as UTF-8 plain text

    Args:
        path: Path to the document.

    Returns:
        The extracted text.

    Raises:
        ValueError: If an existing ``.doc`` file is not a zip package, i.e. it
            is a legacy binary Word document the DOCX extractor cannot read.
        Exception: Whatever the selected extractor or ``open`` raises (missing
            file, undecodable bytes, ...).
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(path)
    if ext == ".docx":
        return extract_text_from_docx(path)
    if ext == ".doc":
        # The DOCX extractor only understands zip-based OOXML packages. A
        # legacy binary .doc would fail inside the library with an opaque
        # error, so reject it here with an actionable message. A missing file
        # is left to the extractor so its own error is reported unchanged.
        if os.path.isfile(path) and not zipfile.is_zipfile(path):
            raise ValueError(
                f"Unsupported file format: {path!r} is a legacy binary .doc "
                "document. Convert it to .docx or .pdf and try again."
            )
        return extract_text_from_docx(path)
    # Fallback: treat every other extension as a UTF-8 plain-text file.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

Build RAG components, index and retriever

In [None]:

def build_retriever_from_text(
    text: str,
    embedding_model=None,
    persist_path=None,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
):
    """Chunk ``text``, embed the chunks into a FAISS index and return a retriever.

    Args:
        text: Raw document text to index.
        embedding_model: Embeddings object handed to ``FAISS.from_documents``.
            When falsy, a default ``OpenAIEmbeddings()`` is created.
        persist_path: Optional folder. When given, the built index is written
            there with ``vectorstore.save_local`` before the retriever is
            returned.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to the splitter.

    Returns:
        The result of ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If splitting ``text`` yields no chunks (e.g. empty text),
            since an index cannot be built from zero documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = splitter.create_documents([text])
    if not docs:
        # Fail early with a clear message instead of letting the vector store
        # fail on an empty document list (and before paying for embeddings).
        raise ValueError("Cannot build a retriever: the text produced no chunks.")

    embeddings = embedding_model or OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)

    if persist_path:
        # Honour the persist_path argument, which was previously ignored.
        vectorstore.save_local(persist_path)

    return vectorstore.as_retriever()

Prompt and output parser for the schema

In [None]:
def build_parser_and_prompt():
    """Create the output parser and the extraction prompt for ``ResumeSchema``.

    Returns:
        A ``(parser, prompt)`` tuple where ``parser`` is a
        ``PydanticOutputParser`` bound to ``ResumeSchema`` and ``prompt`` is a
        ``PromptTemplate`` expecting two variables: ``schema`` and
        ``resume_text``.
    """
    template_text = """You are given text extracted from a resume/CV. Extract the information and return a JSON that EXACTLY conforms to the provided Pydantic schema.
- Only return JSON and nothing else.
- If a field is not present, set it to null or an empty list/object as appropriate.
- Parse dates/durations as they appear (do not normalize unless obvious).
- Aggregate skills/frameworks/tools as lists.

Schema: {schema}

Resume text:
{resume_text}

Return the JSON now.
"""
    resume_prompt = PromptTemplate(
        template=template_text,
        input_variables=["schema", "resume_text"],
    )
    schema_parser = PydanticOutputParser(pydantic_object=ResumeSchema)
    return schema_parser, resume_prompt


Parse resume -> structured JSON with the LLM

In [None]:

def parse_resume_to_json(
    path: str,
    openai_api_key: Optional[str] = None,
    model: str = "gpt-4o-mini",
    max_chars: Optional[int] = 6000,
):
    """Extract structured resume data from a file as a plain ``dict``.

    The document text is loaded, trimmed to ``max_chars`` characters, placed in
    the extraction prompt together with the JSON schema of ``ResumeSchema``,
    and sent to the chat model in a single request. The reply is validated
    against ``ResumeSchema`` by the output parser.

    Args:
        path: Resume file to parse (see ``load_document_text`` for formats).
        openai_api_key: If truthy, exported as ``OPENAI_API_KEY`` in
            ``os.environ`` before the model is created.
        model: Chat model name passed to ``ChatOpenAI``.
        max_chars: Maximum number of characters of resume text included in the
            prompt; ``None`` disables trimming. Text beyond this limit is not
            seen by the model.

    Returns:
        ``ResumeSchema`` content as a dictionary.

    Raises:
        ValueError: If no text (or only whitespace) could be extracted.
        Exception: Errors from the model call or from the parser when the
            reply is not valid for ``ResumeSchema`` are propagated.
    """
    if openai_api_key:
        os.environ["OPENAI_API_KEY"] = openai_api_key

    # 1) Load text
    raw_text = load_document_text(path)
    if not raw_text.strip():
        raise ValueError("No text extracted from file.")

    # 2) LLM (chat model), parser and prompt.
    # The trimmed resume text goes straight into the prompt, so no vector index
    # is built here: nothing would query it and building one costs an extra
    # embeddings request. build_retriever_from_text() remains available for
    # retrieval-based flows.
    llm = ChatOpenAI(temperature=0, model=model)
    parser, prompt_template = build_parser_and_prompt()

    # 3) Render the prompt.
    resume_text = raw_text if max_chars is None else raw_text[:max_chars]
    prompt_str = prompt_template.format(
        schema=ResumeSchema.schema_json(),
        resume_text=resume_text,
    )

    # 4) Ask the LLM for structured JSON. invoke() accepts a plain string and
    # sends it as a single user message; the reply text is in `.content`.
    response = llm.invoke(prompt_str)
    content = response.content

    # 5) Validate the reply against ResumeSchema via the output parser.
    parsed = parser.parse(content)
    return parsed.dict()


Usage

In [None]:

if __name__ == "__main__":
    import json

    # Demo run: parse the sample resume and pretty-print the structured result.
    FILE = "sample_resume.pdf"
    result = parse_resume_to_json(
        FILE,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    print(json.dumps(result, ensure_ascii=False, indent=2))