**Loading the PDF and Creating a Vector Store from Its Content**

In [23]:
# Pipeline: PDF -> PDF loader -> LLM (structured output) -> vector database

# Converting the PDF loader into a Runnable
import os
from langchain_community.document_loaders import PDFMinerLoader
from langchain_core.runnables import RunnableLambda
from langchain_core.documents import Document
from typing import List
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI



def load_pdf_documents(file_path: str) -> str:
    """
    Load a PDF with PDFMinerLoader and return its text as one string.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        The ``page_content`` of every Document produced by the loader,
        joined with a single space. (Despite the function's name, the
        Document objects themselves are not returned.)

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``file_path``.
    """
    # Fail early with a clear message instead of a loader-internal error.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' was not found.")

    loader = PDFMinerLoader(
        file_path,
        mode="single",  # Loads all pages into one Document object
    )
    # loader.load() returns a list of Documents; join their text so the
    # result is a single string even if more than one Document comes back.
    return " ".join(doc.page_content for doc in loader.load())

# Wrap the loading function in a RunnableLambda so it can be called via .invoke().
pdf_loader_runnable = RunnableLambda(load_pdf_documents)
# Run the loader on the resume PDF; `pdf` holds whatever load_pdf_documents returns
# and is consumed further down by the text splitter.
pdf=pdf_loader_runnable.invoke("ALOK-RAI--22BEC010.pdf")





# The resume text is loaded; the next step is to embed it for the vector store.
# (Earlier idea, left disabled: splitter_text = pdf | texts)
embedding_model = GoogleGenerativeAIEmbeddings(
    google_api_key=os.getenv("GOOGLE_API_KEY"),  # read from the environment, never hard-coded
    model="models/embedding-001",
)
# Disabled alternative: a SemanticChunker driven by the same embedding model.
# splitter = SemanticChunker(
#     embeddings=embedding_model,
#     breakpoint_threshold_type="percentile"
# )

## NOTE(review): RunnableSequence is imported here but not used in the visible part of this cell.
from langchain_core.runnables import RunnableSequence


## Split the resume text with a RecursiveCharacterTextSplitter (chunk_size=600, chunk_overlap=100).
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
texts = text_splitter.split_text(pdf)
print(len(texts))


## --- End of the resume part: index the resume chunks ---
# FAISS was referenced without ever being imported, which raises NameError on a
# fresh kernel; import it next to its only use.
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the text chunks using the embedding model.
# (This creates the store only; a retriever is derived from it later.)
vector_store = FAISS.from_texts(texts, embedding_model)

7


**Job Description Chain**

In [24]:
# Raw job-description text used as the input to the extraction chain.
# It is runtime data: the text is passed to the LLM verbatim, so it is left untouched.
jd = """ About the job
Internship Summary:

We're looking for enthusiastic and talented AI Interns passionate about coding to join our team. This internship offers a unique opportunity to gain hands-on experience by working on real-world projects, contributing to innovative solutions. You will play a crucial role in developing advanced AI solutions from data preparation to model deployment.


What You will Do:

As an AI Engineer Intern, you'll work closely with our team, contributing to various stages of the AI development lifecycle. Your responsibilities may include:
Data Preparation & Annotation: Assist in collecting, cleaning, preprocessing, and annotating large datasets This will involve supporting robust OCR and HTR capabilities.
Model Experimentation & Evaluation: Conduct experiments to test, evaluate, and fine-tune AI models for accuracy, performance, and scalability.
Information Extraction & Analysis: Contribute to the development and testing of features for highly accurate multilingual information extraction, retrieval, summarization, and enhanced pattern recognition from diverse, unstructured government documents.
Large Language Model (LLM) Support: Support the team in evaluating, fine-tuning, and testing open-source LLMs for performance and suitability, ensuring they deeply understand government-specific terminology, policies, and contextual nuances in both English and Indian languages. 
Research & Exploration: Research the latest advancements, architectures, and techniques
Prototyping & Implementation: Help in building prototypes and integrating AI components, including those for conceptual predictive insights from forecasting models.
Experiment with prompt engineering techniques to optimize outputs from generative models.
Explore and implement techniques for controlling generated content and ensuring model safety/alignment.
Documentation: Maintain clear and concise documentation of experiments, model development processes, and results.
Collaboration: Actively participate in team discussions, brainstorm new ideas, and collaborate with cross-functional teams to align AI solutions with business objectives.
 
Who Can Apply:

Currently pursuing or recently completed a Bachelor's or Master's degree in Computer Science, Data Science, Artificial Intelligence, Machine Learning from a Tier 1 or Tier-2 Colleges / Autonomous Institutions.
Strong foundational understanding of core AI and Machine Learning concepts and algorithms.
Specific interest in Natural Language Processing (NLP) and Large Language Models (LLMs) is highly preferred.
Proficiency in at least one programming language, preferably Python is mandatory
Familiarity with deep learning frameworks (e.g., TensorFlow) is a plus.
Excellent analytical and problem-solving skills, with a keen eye for detail in visual data and model outputs.
Strong communication and teamwork skills, with the ability to articulate technical concepts clearly.
A strong desire to learn, adapt, and contribute in a fast-paced environment focused on innovation.
 """

**Working with Job Description**

In [25]:
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers.json import SimpleJsonOutputParser


# ---------- 1. Prompt ----------
# Instruction text for the extraction step. `{description}` is the only
# placeholder; the \"\"\" escapes put literal triple quotes around it without
# terminating this triple-quoted Python string.
# NOTE(review): the key `Else` is capitalised unlike the other keys, and the
# instruction sentence about synonyms is repeated twice — confirm both are intended.
job_jd_extraction_template = """
You are a resume-assistant that extracts structured data from job descriptions.

Return ONLY valid JSON with these keys:
- title                       (string or null)
- company                     (string or null)
- location                    (string or null)
- summary                     (string)
- responsibilities            (string[])
- required_skills             (string[])
- preferred_skills            (string[])
- education_requirements      (string or null)
- experience_level            (string or null)
- Else                       (string or null)

If a field is missing, use null or an empty list. In description, there can be multiple paragraphs, so you should extract the relevant information from the entire text and sometimes keys for JSON file will have synonyms or different names, so you should extract the relevant information from the entire text and sometimes keys for JSON file will have synonyms or different names.

Job description:
\"\"\"{description}\"\"\"
"""

# Prompt object: the stripped template with its single `description` variable.
prompt = PromptTemplate(
    template=job_jd_extraction_template.strip(),
    input_variables=["description"],
)

# ---------- 2. LLM ----------
# Gemini 1.5 Flash chat model; temperature 0.0 keeps the extraction as
# repeatable as the model allows.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.0)

# ---------- 3. Chain ----------
# prompt -> model -> JSON parser: the structured form of the job description.
chain = prompt | llm | SimpleJsonOutputParser()

# ---------- 4. Run ----------
# `result` is the parsed output for the job description defined in `jd`.
result = chain.invoke({"description": jd})
print(result)


{'title': 'AI Intern', 'company': None, 'location': None, 'summary': "We're looking for enthusiastic and talented AI Interns passionate about coding to join our team. This internship offers a unique opportunity to gain hands-on experience by working on real-world projects, contributing to innovative solutions. You will play a crucial role in developing advanced AI solutions from data preparation to model deployment.", 'responsibilities': ['Data Preparation & Annotation: Assist in collecting, cleaning, preprocessing, and annotating large datasets. This will involve supporting robust OCR and HTR capabilities.', 'Model Experimentation & Evaluation: Conduct experiments to test, evaluate, and fine-tune AI models for accuracy, performance, and scalability.', 'Information Extraction & Analysis: Contribute to the development and testing of features for highly accurate multilingual information extraction, retrieval, summarization, and enhanced pattern recognition from diverse, unstructured gove

**Combining Both Chains**

In [26]:
def build_search_query(job: dict) -> str:
    """
    Turn a structured job-post into a single keyword string.

    Parameters
    ----------
    job : dict
        May contain lists of strings under some or all of these keys:
        - "responsibilities"
        - "required_skills"
        - "preferred_skills"
        A key that is missing or set to ``None`` is treated as an empty
        list, and a bare string is treated as a one-item list. Any other
        non-list value, and any non-string item inside a list, is skipped.
        Everything else in the dict is ignored.

    Returns
    -------
    str
        Comma-separated list with duplicates removed and first-seen order
        preserved, e.g.
        "Python, TensorFlow, Data preparation, NLP, LLM fine-tuning"
    """
    seen, parts = set(), []
    for key in ("responsibilities", "required_skills", "preferred_skills"):
        value = job.get(key)
        # The extraction prompt allows `null` for missing fields, so the key
        # can be present with a None value; `.get(key, [])` would not cover that.
        if value is None:
            continue
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, (list, tuple)):
            continue
        for item in value:
            if isinstance(item, str) and item not in seen:
                seen.add(item)
                parts.append(item)

    return ", ".join(parts)


**MultiQueryRetriever to Extract Info from the Vector Database**

In [27]:
# --- 0. prerequisites ---------------------------------------------------------
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

# --- 1. base retriever --------------------------------------------------------
# Similarity search over the vector store, returning the 7 closest chunks
# for each query (k=7).
base_retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 7},
)

from langchain_google_genai import ChatGoogleGenerativeAI

# --- 2. chat model handed to the Multi-Query retriever ------------------------
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.2)

# --- 3. wrap in a Multi-Query retriever --------------------------------------
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
)

# --- 4. build the search query from the structured job post -------------------
search_query = build_search_query(result)

# --- 5. retrieve the matching resume chunks -----------------------------------
# Retrievers implement the Runnable interface, so .invoke() is used here in
# place of the deprecated .get_relevant_documents().
docs = multi_query_retriever.invoke(search_query)

# Preview the first hit; the guard avoids an IndexError when nothing was retrieved.
print(len(docs), docs[0].page_content if docs else "<no matching resume chunks>")
print(docs)

# or plug it straight into a QA / conversation chain
# from langchain.chains import ConversationalRetrievalChain
# qa_chain = ConversationalRetrievalChain.from_llm(llm, multi_query_retriever)


7 Skills and Competencies

3.1 Skills Summary

Programming Languages:Python(3+yrs), C++(3+ yrs), SQL, Matlab
ML/NLP Frameworks: TensorFlow, PyTorch, Pandas, Scikit-learn, RAGs
VS Code, Google Colab, Hugging Face, LangChain
Tools:
Power BI, Streamlit(2+yrs), Cadence(2+yrs), Arduino
Other:

Projects

• Transformer Language Model:

◦ Implemented a GPT-style transformer-2.2 million parameters from scratch in PyTorch for autoregressive text

generation.

◦ Designed with Multi-head self-attention (6 heads), residual connections, and achieving a validation perplexity approx

10 over 5000 iterations.
[Document(id='f0cc6b45-9081-4566-afb2-ed20fef78dfa', metadata={}, page_content='Skills and Competencies\n\n3.1 Skills Summary\n\nProgramming Languages:Python(3+yrs), C++(3+ yrs), SQL, Matlab\nML/NLP Frameworks: TensorFlow, PyTorch, Pandas, Scikit-learn, RAGs\nVS Code, Google Colab, Hugging Face, LangChain\nTools:\nPower BI, Streamlit(2+yrs), Cadence(2+yrs), Arduino\nOther:\n\nProjects\n\n• Transfo

**Comparing the Information Extracted from the Resume with the Job Description**

In [28]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

# 1-- chat model for the fit assessment (temperature 0)
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
)

# 2-- chat prompt: one system message, then three user messages carrying the
#     job description, the resume snippets, and the required output format
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a career-matching assistant. Use ONLY the resume snippets provided to judge the fit.",
        ),
        ("user", "Job description (JSON):\n{job_json}"),
        ("user", "Resume snippets:\n{snippets}"),
        (
            "user",
            "Return a JSON with keys:\n"
            "  - matches  (bullet points where the resume meets the job)\n"
            "  - gaps     (bullet points where the resume is missing something)\n"
            "  - overall_fit (one-sentence conclusion)-- be very strict in your assessment!",
        ),
    ]
)

# 3-- pipe the prompt into the model to get a runnable chain
chain = prompt | llm

# 4-- variables collected earlier, rendered as plain text for the prompt
import json

variables = {
    # The prompt labels this block as JSON, so serialise the dict properly
    # instead of letting the template insert a Python repr (single quotes, None).
    "job_json": result if isinstance(result, str)
                else json.dumps(result, indent=2, ensure_ascii=False, default=str),
    # Send only the chunk text; formatting the Document list directly would
    # also inject ids and metadata reprs into the prompt.
    "snippets": "\n---\n".join(d.page_content for d in docs),
}

# 5-- run the chain
answer = chain.invoke(variables)        # returns an LLM message object
print(answer.content)                   # the model's text reply


```json
{
  "matches": [
    "The resume shows proficiency in Python and experience with TensorFlow, both required and preferred skills.",
    "The candidate has experience with NLP, including building a transformer language model and working on a food classification project, aligning with the preferred interest in NLP and LLMs.",
    "The resume demonstrates experience in model development, evaluation, and optimization, including hyperparameter tuning and techniques to reduce computation time.",
    "The candidate's projects showcase experience in data preprocessing, model building, and performance analysis, which are relevant to the internship responsibilities."
  ],
  "gaps": [
    "The resume does not explicitly mention experience with large language models (LLMs) beyond building a transformer model.  The job description emphasizes LLM support and understanding of government-specific terminology, which is not evident in the resume.",
    "There is no mention of experience with mult