#### Metadata Filtering - Self-Query Retriever

In [1]:
from langchain.retrievers import SelfQueryRetriever
# from langchain.chains.query_constructor import 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.document_loaders import PyPDFLoader, TextLoader, DirectoryLoader
from typing import List
import os
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel
from pydantic.fields import Field
from langchain.output_parsers import StructuredOutputParser
from dotenv import load_dotenv
import re
from collections import defaultdict
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone.vectorstores import PineconeVectorStore
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
# The credentials are read from the process environment (load_dotenv() has
# already run by this point). The former `os.environ[k] = os.getenv(k)`
# round-trip changed nothing when the variable existed and raised an opaque
# "TypeError: str expected, not NoneType" when it did not, so validate
# explicitly and name whatever is missing instead.
REQUIRED_ENV_VARS = ("GROQ_API_KEY", "HUGGINGFACEHUB_API_TOKEN", "PINECONE_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

In [3]:
# When the kernel is started inside the "Langchain basics" sub-folder, step up
# to the parent project folder (presumably so relative paths such as
# "HR_data/" resolve — confirm against the local layout).
notebook_dir = "e:\\Programming\\Projects\\LLM\\Langchain for everbody\\Langchain basics"
project_root = "e:\\Programming\\Projects\\LLM\\Langchain for everbody"
if os.getcwd() == notebook_dir:
    os.chdir(project_root)

In [4]:
# loader = PyPDFLoader(file_path="HR_data/hrtech_HR_code_of_coduct_policy_v1.pdf")

In [5]:
# Match every PDF under HR_data/ (the glob is recursive) and parse each one
# with PyPDFLoader.
loader = DirectoryLoader("HR_data/", glob="**/*.pdf", loader_cls=PyPDFLoader)

In [6]:
# Run the loader. Per the output below, each loaded Document carries
# 'source' and 'page' entries in its metadata.
documents = loader.load()
documents

[Document(metadata={'source': 'HR_data\\hrtech_HR_code_of_coduct_policy_v1.pdf', 'page': 0}, page_content='ABC Corp Code of Conduct Policy  \n**Company Name:** ABC Corp  \n**Policy Name:** Code of Conduct  \n**Effective Date:** [DD/MM/YYYY]  \n**Version:** 1.0  \n1. Purpose  \nThe purpose of this Code of Conduct policy is to outline the standards of behavior expected \nfrom all employees at ABC Corp. We are committed to maintaining a professional, ethical, \nand respectful work environment. Failure to adhere to these standards may result in \ndisciplinary action, including termination.  \n2. Workplace Behavior  \n2.1 Professional Conduct  \nEmployees are expected to act professionally, responsibly, and ethically at all times. They \nshould treat colleagues, clients, and stakeholders with respect and integrity.  \n2.2 Harassment and Discrimination  \nABC Corp maintains a zero -tolerance policy towards a ny form of harassment, \ndiscrimination, or bullying. This includes but is not limit

In [7]:
# File Name Format : CompanyName_DepartmentName_PolicyName_Version.pdf

In [8]:
# Define the LLM: a Groq-hosted chat model, configured with temperature=0.
llm = ChatGroq(model="llama-3.3-70b-versatile",temperature=0)

In [9]:
# Structured output schema: the two pieces of metadata requested from the LLM
# for every chunk.
class MetaDataModel(BaseModel):
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose — a docstring may be picked up as the schema description sent to
    # the model; confirm before adding one.

    # Keywords extracted from the chunk text.
    keywords: List[str] = Field(
        ...,
        description="Keywords present in the chunk document",
    )
    # Short summary of the chunk text.
    summary: str = Field(
        ...,
        description="Summarized content of the chunk document",
    )

In [10]:
# Structured LLM: bind the MetaDataModel schema to the model. The code below
# reads `.keywords` and `.summary` off the responses.
structured_llm = llm.with_structured_output(MetaDataModel)

In [11]:
# Write the prompt.
# System message: instructs the model to return a 1-2 sentence summary and the
# 5 most relevant keywords for the supplied content.
system = """You are an expert in text summarization and keyword extraction. Your task is to analyze the given content and provide two outputs:

1. **Summary**: Generate a concise and clear summary (1-2 sentences) that captures the essence of the content.
The summary should be informative, retain key details, and maintain the core message of the text.

2. **Keywords**: Extract the **5 most relevant keywords** that best represent the content. 
The keywords should be meaningful, precise, and directly related to the subject matter.
Prioritize important concepts and avoid generic or overly common words.
"""

# Two-message template; the human message has a single `{content}` placeholder.
summarizer_prompt = ChatPromptTemplate.from_messages(messages=[
    ("system",system),
    ("human","content : \n {content}")
])

# Pipe the filled-in prompt into the structured LLM.
summarizer_chain = summarizer_prompt | structured_llm 

In [12]:
def get_summary_keywords(chunk):
    """Ask the summarizer chain for keywords and a summary of one chunk.

    Args:
        chunk: Object whose ``page_content`` attribute holds the text to analyse.

    Returns:
        Tuple ``(keywords, summary)`` taken from the chain's response.
    """
    result = summarizer_chain.invoke({"content": chunk.page_content})
    return result.keywords, result.summary

In [13]:
# Scratch check of the policy-name regex against a sample file stem.
string = "hrtech_HR_leaves_2025_policy_v1"
match = re.search(r'HR_(.*?)_policy', string)

# Keep the captured policy name, or None when the pattern is absent.
policy_name = None if match is None else match.group(1)
policy_name

'leaves_2025'

In [14]:
def get_metadata(document):
    """Derive file-level metadata from a loaded document's source path.

    File names are expected to follow
    ``CompanyName_DepartmentName_PolicyName_Version.pdf``.

    Args:
        document: Object exposing ``metadata['source']`` (path of the PDF).

    Returns:
        Tuple ``(file_name, version, department)``: the base name without
        directory or extension, its last underscore-separated token, and its
        second underscore-separated token.

    Raises:
        ValueError: If the file name has fewer than two underscore-separated
            tokens, so no department can be read from it.
    """
    source = document.metadata['source']
    # Accept both "\\" and "/" as separators and keep only the last path
    # component. The previous `.split('\\')[1]` raised IndexError on
    # forward-slash paths and returned a directory name for PDFs nested more
    # than one level deep.
    base_name = source.replace('\\', '/').rsplit('/', 1)[-1]
    file_name = os.path.splitext(base_name)[0]
    tokens = file_name.split("_")
    if len(tokens) < 2:
        raise ValueError(
            f"Cannot parse department/version from file name {file_name!r}"
        )
    return file_name, tokens[-1], tokens[1]

In [15]:
# Stamp every loaded document with the metadata parsed from its file name.
for doc in documents:
    filename, version, department = get_metadata(doc)
    doc.metadata.update(
        {'filename': filename, 'department': department, 'version': version}
    )

In [16]:
documents

[Document(metadata={'source': 'HR_data\\hrtech_HR_code_of_coduct_policy_v1.pdf', 'page': 0, 'filename': 'hrtech_HR_code_of_coduct_policy_v1', 'department': 'HR', 'version': 'v1'}, page_content='ABC Corp Code of Conduct Policy  \n**Company Name:** ABC Corp  \n**Policy Name:** Code of Conduct  \n**Effective Date:** [DD/MM/YYYY]  \n**Version:** 1.0  \n1. Purpose  \nThe purpose of this Code of Conduct policy is to outline the standards of behavior expected \nfrom all employees at ABC Corp. We are committed to maintaining a professional, ethical, \nand respectful work environment. Failure to adhere to these standards may result in \ndisciplinary action, including termination.  \n2. Workplace Behavior  \n2.1 Professional Conduct  \nEmployees are expected to act professionally, responsibly, and ethically at all times. They \nshould treat colleagues, clients, and stakeholders with respect and integrity.  \n2.2 Harassment and Discrimination  \nABC Corp maintains a zero -tolerance policy towards

In [17]:
# Split the documents with chunk_size=500 and chunk_overlap=150. Per the
# output below, the resulting chunks keep their parent document's metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)
chunks = text_splitter.split_documents(documents)

In [18]:
chunks

[Document(metadata={'source': 'HR_data\\hrtech_HR_code_of_coduct_policy_v1.pdf', 'page': 0, 'filename': 'hrtech_HR_code_of_coduct_policy_v1', 'department': 'HR', 'version': 'v1'}, page_content='ABC Corp Code of Conduct Policy  \n**Company Name:** ABC Corp  \n**Policy Name:** Code of Conduct  \n**Effective Date:** [DD/MM/YYYY]  \n**Version:** 1.0  \n1. Purpose  \nThe purpose of this Code of Conduct policy is to outline the standards of behavior expected \nfrom all employees at ABC Corp. We are committed to maintaining a professional, ethical, \nand respectful work environment. Failure to adhere to these standards may result in \ndisciplinary action, including termination.'),
 Document(metadata={'source': 'HR_data\\hrtech_HR_code_of_coduct_policy_v1.pdf', 'page': 0, 'filename': 'hrtech_HR_code_of_coduct_policy_v1', 'department': 'HR', 'version': 'v1'}, page_content='and respectful work environment. Failure to adhere to these standards may result in \ndisciplinary action, including termin

In [19]:
def insert_metadata(chunks):
    """Enrich each chunk in place with keywords, a summary and a chunk id.

    For every chunk this calls ``get_summary_keywords`` and writes three
    entries into ``chunk.metadata``: ``'keywords'``, ``'summary'`` and
    ``'chunk_id'``. The id has the form ``"<prefix>_chunk_<n>"``, where ``n``
    counts chunks per file name starting at 1 and ``<prefix>`` is the text
    between ``HR_`` and ``_policy`` in the file name. If the file name does
    not match that pattern, the whole file name is used as the prefix.

    Args:
        chunks: Iterable of objects with a ``metadata`` dict that already
            holds a ``'filename'`` entry.

    Returns:
        None. The chunks are modified in place.
    """
    chunk_counter = defaultdict(int)
    for chunk in chunks:
        filename = chunk.metadata['filename']

        # Resolve the id prefix before the LLM call. Previously a
        # non-matching file name crashed on `match.group(1)` (AttributeError
        # on None) only after the summary had already been requested.
        match = re.search(r'HR_(.*?)_policy', filename)
        policy_name = match.group(1) if match else filename

        chunk_counter[filename] += 1

        keywords, summary = get_summary_keywords(chunk)
        chunk.metadata['keywords'] = keywords
        chunk.metadata['summary'] = summary
        # NOTE(review): the version is not part of the id, so two versions of
        # the same policy yield the same ids — confirm this is intended.
        chunk.metadata['chunk_id'] = f"{policy_name}_chunk_{chunk_counter[filename]}"

In [20]:
insert_metadata(chunks)

In [21]:
chunks

[Document(metadata={'source': 'HR_data\\hrtech_HR_code_of_coduct_policy_v1.pdf', 'page': 0, 'filename': 'hrtech_HR_code_of_coduct_policy_v1', 'department': 'HR', 'version': 'v1', 'keywords': ['Code of Conduct', 'ABC Corp', 'Employee Behavior', 'Work Environment', 'Disciplinary Action'], 'summary': "ABC Corp's Code of Conduct policy outlines the expected standards of behavior for all employees to maintain a professional and respectful work environment, with non-compliance potentially leading to disciplinary action.", 'chunk_id': 'code_of_coduct_chunk_1'}, page_content='ABC Corp Code of Conduct Policy  \n**Company Name:** ABC Corp  \n**Policy Name:** Code of Conduct  \n**Effective Date:** [DD/MM/YYYY]  \n**Version:** 1.0  \n1. Purpose  \nThe purpose of this Code of Conduct policy is to outline the standards of behavior expected \nfrom all employees at ABC Corp. We are committed to maintaining a professional, ethical, \nand respectful work environment. Failure to adhere to these standards

In [22]:
# Embedding model handed to the vector store below.
# NOTE(review): the Pinecone index is created with dimension=768, which has to
# equal this model's embedding size — re-check if the model is changed.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [24]:
# Connect to Pinecone, create the index on first use, and wrap it in a
# LangChain vector store.
import time

pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "metadatafiltering"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # A freshly created index may not be usable immediately; poll until
    # Pinecone reports it ready before handing it to the vector store.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

Old chunks removal --> this should be done first, before adding the new chunks

In [25]:
# Collect each chunk's 'chunk_id' metadata value; the list is passed as the
# `ids` argument when the chunks are added to the vector store.
ids = [chunk.metadata['chunk_id'] for chunk in chunks]
ids

['code_of_coduct_chunk_1',
 'code_of_coduct_chunk_2',
 'code_of_coduct_chunk_3',
 'code_of_coduct_chunk_4',
 'code_of_coduct_chunk_5',
 'code_of_coduct_chunk_6',
 'code_of_coduct_chunk_7',
 'code_of_coduct_chunk_8',
 'leaves_2025_chunk_1',
 'leaves_2025_chunk_2',
 'leaves_2025_chunk_3',
 'leaves_2025_chunk_4',
 'leaves_2025_chunk_5',
 'leaves_2025_chunk_6',
 'leaves_2025_chunk_7',
 'remote_work_chunk_1',
 'remote_work_chunk_2',
 'remote_work_chunk_3',
 'remote_work_chunk_4',
 'remote_work_chunk_5',
 'remote_work_chunk_6',
 'remote_work_chunk_7',
 'remote_work_chunk_8',
 'remote_work_chunk_9',
 'remote_work_chunk_10',
 'remote_work_chunk_11']

In [26]:
# Add the chunks to the vector store under the explicit ids; per the output
# below, the call returns that same list of ids.
vector_store.add_documents(documents=chunks, ids=ids)

['code_of_coduct_chunk_1',
 'code_of_coduct_chunk_2',
 'code_of_coduct_chunk_3',
 'code_of_coduct_chunk_4',
 'code_of_coduct_chunk_5',
 'code_of_coduct_chunk_6',
 'code_of_coduct_chunk_7',
 'code_of_coduct_chunk_8',
 'leaves_2025_chunk_1',
 'leaves_2025_chunk_2',
 'leaves_2025_chunk_3',
 'leaves_2025_chunk_4',
 'leaves_2025_chunk_5',
 'leaves_2025_chunk_6',
 'leaves_2025_chunk_7',
 'remote_work_chunk_1',
 'remote_work_chunk_2',
 'remote_work_chunk_3',
 'remote_work_chunk_4',
 'remote_work_chunk_5',
 'remote_work_chunk_6',
 'remote_work_chunk_7',
 'remote_work_chunk_8',
 'remote_work_chunk_9',
 'remote_work_chunk_10',
 'remote_work_chunk_11']

In [40]:
# Rebuild the id list for the originally loaded chunks (same expression as the
# earlier `ids` cell).
ids = [chunk.metadata['chunk_id'] for chunk in chunks]

In [43]:
# Metadata filter selecting the chunks that belong to the superseded file.
old_policy_filter = dict(
    filename="hrtech_HR_remote_work_policy_v1",  # Ensure this is the previous version
    department="HR",
)

# Query Pinecone for the old policy chunks: the query text is empty, so the
# metadata filter is what narrows the result set; at most 100 hits come back.
old_chunks = vector_store.similarity_search("", k=100, filter=old_policy_filter)

# Pull the stored chunk id out of every hit.
old_chunks_ids = [hit.metadata['chunk_id'] for hit in old_chunks]

In [44]:
old_chunks_ids

['remote_work_chunk_1',
 'remote_work_chunk_11',
 'remote_work_chunk_10',
 'remote_work_chunk_6',
 'remote_work_chunk_9',
 'remote_work_chunk_7',
 'remote_work_chunk_3',
 'remote_work_chunk_5',
 'remote_work_chunk_2',
 'remote_work_chunk_4',
 'remote_work_chunk_8']

In [45]:
# Delete the superseded policy's vectors by their ids.
vector_store.delete(ids=old_chunks_ids)

In [None]:
# NOTE(review): `chunks` / `ids` still refer to the original load, which
# includes the remote_work v1 chunks deleted in the previous cell — running
# this would re-insert them. The cell has no execution count; confirm whether
# it is needed at all.
vector_store.add_documents(documents=chunks, ids=ids)

New chunks addition into the vectorstore

In [46]:
# Load the updated (v2) remote-work policy PDF on its own.
loader = PyPDFLoader(file_path="HR_data/hrtech_HR_remote_work_policy_v2.pdf")
new_documents = loader.load()
new_documents

[Document(metadata={'source': 'HR_data/hrtech_HR_remote_work_policy_v2.pdf', 'page': 0}, page_content='ABC Corp Remote Work Policy  \nCompany Name:  ABC Corp  \nPolicy Name:  Remote Work Policy  \nEffective Date:  [DD/MM/YYYY]  \nVersion:  2.0 \n \n1. Purpose  \nABC Corp is committed to providing a flexible and productive work environment by \nsupporting remote work arrangements. This policy establishes clear expectations regarding \neligibility, communication, security, performance, and compliance for employees working \nremotely.  \n \n2. Eligibility  \nRemote work eligibility is determined by job function, individual performance, and \nmanagerial discretion. Employees must meet the following crite ria to qualify:  \n1. Completion of at least six months  of employment with satisfactory performance.  \n2. A job role that permits remote execution without compromising productivity.  \n3. A dedicated workspace  that meets security and effi ciency standards.  \n4. Reliable internet connec

In [47]:
def get_metadata(document):
    """Derive file-level metadata from a loaded document's source path.

    File names are expected to follow
    ``CompanyName_DepartmentName_PolicyName_Version.pdf``.

    Args:
        document: Object exposing ``metadata['source']`` (path of the PDF).

    Returns:
        Tuple ``(file_name, version, department)``: the base name without
        directory or extension, its last underscore-separated token, and its
        second underscore-separated token.

    Raises:
        ValueError: If the file name has fewer than two underscore-separated
            tokens, so no department can be read from it.
    """
    source = document.metadata['source']
    # Accept both "/" and "\\" as separators and keep only the last path
    # component. The previous `.split('/')[1]` raised IndexError on
    # backslash-separated paths and returned a directory name for PDFs nested
    # more than one level deep.
    base_name = source.replace('\\', '/').rsplit('/', 1)[-1]
    file_name = os.path.splitext(base_name)[0]
    tokens = file_name.split("_")
    if len(tokens) < 2:
        raise ValueError(
            f"Cannot parse department/version from file name {file_name!r}"
        )
    return file_name, tokens[-1], tokens[1]

In [48]:
# Stamp every document of the new version with the metadata parsed from its
# file name.
for doc in new_documents:
    filename, version, department = get_metadata(doc)
    doc.metadata.update(
        {'filename': filename, 'department': department, 'version': version}
    )

In [49]:
new_documents

[Document(metadata={'source': 'HR_data/hrtech_HR_remote_work_policy_v2.pdf', 'page': 0, 'filename': 'hrtech_HR_remote_work_policy_v2', 'department': 'HR', 'version': 'v2'}, page_content='ABC Corp Remote Work Policy  \nCompany Name:  ABC Corp  \nPolicy Name:  Remote Work Policy  \nEffective Date:  [DD/MM/YYYY]  \nVersion:  2.0 \n \n1. Purpose  \nABC Corp is committed to providing a flexible and productive work environment by \nsupporting remote work arrangements. This policy establishes clear expectations regarding \neligibility, communication, security, performance, and compliance for employees working \nremotely.  \n \n2. Eligibility  \nRemote work eligibility is determined by job function, individual performance, and \nmanagerial discretion. Employees must meet the following crite ria to qualify:  \n1. Completion of at least six months  of employment with satisfactory performance.  \n2. A job role that permits remote execution without compromising productivity.  \n3. A dedicated work

In [51]:
# Reuse the splitter configured earlier, so the new version is chunked with
# the same settings as the original documents.
new_chunks = text_splitter.split_documents(documents=new_documents)

In [52]:
insert_metadata(new_chunks)

In [53]:
new_chunks

[Document(metadata={'source': 'HR_data/hrtech_HR_remote_work_policy_v2.pdf', 'page': 0, 'filename': 'hrtech_HR_remote_work_policy_v2', 'department': 'HR', 'version': 'v2', 'keywords': ['Remote Work', 'Policy', 'ABC Corp', 'Flexibility', 'Productivity'], 'summary': "ABC Corp's Remote Work Policy aims to provide a flexible work environment by establishing clear expectations for remote work arrangements, including eligibility, communication, and performance. The policy supports remote work to increase productivity.", 'chunk_id': 'remote_work_chunk_1'}, page_content='ABC Corp Remote Work Policy  \nCompany Name:  ABC Corp  \nPolicy Name:  Remote Work Policy  \nEffective Date:  [DD/MM/YYYY]  \nVersion:  2.0 \n \n1. Purpose  \nABC Corp is committed to providing a flexible and productive work environment by \nsupporting remote work arrangements. This policy establishes clear expectations regarding \neligibility, communication, security, performance, and compliance for employees working \nremot

In [54]:
# Collect the ids assigned to the new version's chunks.
# NOTE(review): per the output below, the ids start again at
# remote_work_chunk_1 — the same names the v1 chunks were stored under.
new_ids = [chunk.metadata['chunk_id'] for chunk in new_chunks]
new_ids

['remote_work_chunk_1',
 'remote_work_chunk_2',
 'remote_work_chunk_3',
 'remote_work_chunk_4',
 'remote_work_chunk_5',
 'remote_work_chunk_6',
 'remote_work_chunk_7',
 'remote_work_chunk_8',
 'remote_work_chunk_9',
 'remote_work_chunk_10',
 'remote_work_chunk_11',
 'remote_work_chunk_12',
 'remote_work_chunk_13']

In [55]:
# Add the new version's chunks to the vector store under their ids.
vector_store.add_documents(documents=new_chunks, ids=new_ids)

['remote_work_chunk_1',
 'remote_work_chunk_2',
 'remote_work_chunk_3',
 'remote_work_chunk_4',
 'remote_work_chunk_5',
 'remote_work_chunk_6',
 'remote_work_chunk_7',
 'remote_work_chunk_8',
 'remote_work_chunk_9',
 'remote_work_chunk_10',
 'remote_work_chunk_11',
 'remote_work_chunk_12',
 'remote_work_chunk_13']