In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv
import os, getpass

# Read settings from a local .env file first; only fall back to an interactive
# prompt when GOOGLE_API_KEY is still unset or empty afterwards.
load_dotenv()
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Step 1: Load PDF
# Paths pasted into the prompt often carry leading/trailing whitespace, which
# would make the loader reject an otherwise valid file, so trim it first.
pdf_path = input("Enter PDF path: ").strip()
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Also store raw text for LLM access: every page's text joined with newlines.
raw_text = "\n".join(doc.page_content for doc in docs)

# Step 2: Split and Embed
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=30)
docs_split = splitter.split_documents(docs)

embedding_model = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Persistent Chroma store
db_path = "./chroma_resume"

# The store outlives a single run, so it may be empty or may hold chunks from a
# different PDF than the one just loaded. Identify this PDF's chunks by their
# "source" metadata (PyPDFLoader normally sets it to the file path; setdefault
# covers the case where it is missing) and use it to decide whether to index.
for chunk in docs_split:
    chunk.metadata.setdefault("source", pdf_path)
pdf_source = docs_split[0].metadata["source"] if docs_split else pdf_path

# Opening the store creates the directory/collection when it does not exist yet.
db = Chroma(persist_directory=db_path, embedding_function=embedding_model)

# Embed only when this PDF has no chunks in the store yet; re-runs reuse them.
# NOTE: the check is keyed on the path, not the content. A PDF edited in place
# under the same path is not re-indexed; delete db_path to force a rebuild.
# No explicit db.persist(): with Chroma >= 0.4 writes are persisted
# automatically and the method is deprecated (confirm if pinned to older chromadb).
if docs_split and not db.get(where={"source": pdf_source}, limit=1)["ids"]:
    db.add_documents(docs_split)

# Restrict retrieval to the current PDF so chunks indexed from other resumes
# never leak into the prompt context.
retriever = db.as_retriever(search_kwargs={"k": 5, "filter": {"source": pdf_source}})

# Step 3: LLM
llm = ChatGoogleGenerativeAI(temperature=0.3, model="gemini-2.5-flash")

# Step 4: Prompt Template
# Two messages: a system message carrying the instructions plus both context
# sources, and a human message carrying the question. Note that {input} is
# interpolated into the system message as well as the human one.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a resume parsing assistant. Use both the vector DB context and full raw PDF text to answer. "
            "Match the resume data to produce a clean, structured output with sections:\n"
            "Keywords: Name, Contact, Skills, Education, Experience, Achievements, Certifications.\n"
            "Provide only mentioned keyword content in {input}, not all keywords content.\n\n"
            "VectorDB Context:\n{context}\n\nFull PDF Text:\n{pdf_text}",
        ),
        ("human", "{input}"),
    ]
)

# Step 5: Chain
def format_docs(docs):
    """Return the ``page_content`` of every item in ``docs`` as one string.

    Consecutive entries are separated by a blank line; an empty iterable
    yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# The leading dict fans the incoming question out into the three variables the
# prompt expects: retrieved chunks, the full PDF text, and the question itself.
# The rendered prompt is then sent to the model.
resume_chain = {
    "context": retriever | format_docs,
    "pdf_text": lambda _: raw_text,
    "input": RunnablePassthrough(),
} | prompt | llm

# When prompted for the PDF path above, supply the location of the resume file.
# Step 6: Query
query = "hi give me name of resume holder"
response = resume_chain.invoke(query)
print(response.content)


  db = Chroma(persist_directory=db_path, embedding_function=embedding_model)


ANTRIKSH SHARMA


In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv
import os, getpass, json

CONFIG_FILE = "resume_config.json"
DB_PATH = "./chroma_resume"

# --- Load config if exists ---
def load_config():
    """Read the saved settings from CONFIG_FILE.

    Returns:
        dict: the parsed JSON object, or an empty dict when the file is
        missing, is not valid JSON/UTF-8, or does not contain a JSON object.
        An empty dict makes the caller prompt for the settings again instead
        of crashing on a damaged config file.

    Raises:
        OSError: for I/O problems other than a missing file (e.g. permissions).
    """
    try:
        # Open directly instead of checking existence first: avoids the
        # check-then-open race and handles the missing-file case in one place.
        with open(CONFIG_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
        return {}
    # Callers use dict methods on the result, so reject lists/strings/numbers.
    return data if isinstance(data, dict) else {}

def save_config(data):
    """Write ``data`` to CONFIG_FILE as JSON, replacing any previous contents."""
    with open(CONFIG_FILE, "w") as config_file:
        config_file.write(json.dumps(data))

# --- Step 0: API Key ---
# Values from a .env file are loaded first; the hidden prompt is only used
# when GOOGLE_API_KEY is still missing or blank after that.
load_dotenv()
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# --- Step 1: Load or Ask for PDF ---
config = load_config()
pdf_path = config.get("pdf_path")

# Require a regular file, not merely an existing path. With a plain existence
# check a directory would be accepted and saved, and every later run would
# skip this prompt and fail inside the loader. The isinstance guard also
# treats a non-string value in a hand-edited config as "not configured".
if not isinstance(pdf_path, str) or not os.path.isfile(pdf_path):
    pdf_path = input("Enter PDF path (upload your resume file): ").strip()
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")
    # Remember the validated path so later runs do not prompt again.
    save_config({"pdf_path": pdf_path})
    print(f"PDF path saved to {CONFIG_FILE}")

# Load PDF
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Store raw text for full PDF reference: all pages joined with newlines.
raw_text = "\n".join(doc.page_content for doc in docs)

# --- Step 2: Vector DB ---
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=30)
docs_split = splitter.split_documents(docs)

embedding_model = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# A non-empty DB_PATH only proves that *something* was indexed before, not that
# it was this PDF. Identify this PDF's chunks by their "source" metadata
# (PyPDFLoader normally sets it to the file path; setdefault covers the case
# where it is missing) and use it to decide whether to index.
for chunk in docs_split:
    chunk.metadata.setdefault("source", pdf_path)
pdf_source = docs_split[0].metadata["source"] if docs_split else pdf_path

# Opening the store creates the directory/collection when it does not exist yet.
db = Chroma(persist_directory=DB_PATH, embedding_function=embedding_model)

# NOTE: the check is keyed on the path, not the content. A PDF edited in place
# under the same path is not re-indexed; delete DB_PATH to force a rebuild.
# No explicit db.persist(): with Chroma >= 0.4 writes are persisted
# automatically and the method is deprecated (confirm if pinned to older chromadb).
if db.get(where={"source": pdf_source}, limit=1)["ids"]:
    print("Loading existing Chroma DB...")
elif docs_split:
    print("Creating new Chroma DB...")
    db.add_documents(docs_split)

# Restrict retrieval to the current PDF so chunks indexed from other resumes
# never leak into the prompt context.
retriever = db.as_retriever(search_kwargs={"k": 5, "filter": {"source": pdf_source}})

# --- Step 3: LLM ---
llm = ChatGoogleGenerativeAI(temperature=0.3, model="gemini-2.5-flash")

# --- Step 4: Prompt Template ---
# The system message holds the instructions and both context sources; the human
# message holds the question. {input} is substituted into both messages.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a resume parsing assistant. Use both the vector DB context and full raw PDF text to answer.\n"
            "Match the resume data to produce a clean, structured output with sections:\n"
            "Keywords: Name, Contact, Skills, Education, Experience, Achievements, Certifications.\n"
            "Provide only mentioned keyword content in {input}, not all keywords content.\n\n"
            "VectorDB Context:\n{context}\n\nFull PDF Text:\n{pdf_text}",
        ),
        ("human", "{input}"),
    ]
)

# --- Step 5: Chain ---
def format_docs(docs):
    """Join the ``page_content`` of each document in ``docs`` with blank lines.

    Returns an empty string when ``docs`` yields nothing.
    """
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return "\n\n".join(pieces)

# The leading dict maps the incoming question onto the prompt's variables:
# retrieved chunks ("context"), the whole PDF text ("pdf_text") and the
# question itself ("input"). The rendered prompt is then passed to the model.
resume_chain = {
    "context": retriever | format_docs,
    "pdf_text": lambda _: raw_text,
    "input": RunnablePassthrough(),
} | prompt | llm

# --- Step 6: Example Query ---
query = "hi give me summary of resume holder"
response = resume_chain.invoke(query)
print("\n💡 Response:\n", response.content)


Loading existing Chroma DB...

💡 Response:
 The resume holder, Antriksh Sharma, is an Agentic AI Developer specializing in building intelligent automation systems and advanced machine learning solutions. They are proficient in developing end-to-end AI applications using Python, deep learning frameworks, and modern MLOps practices, with experience in implementing RAG systems, Langchain, and LangGraph workflows.
