In [1]:
import os

# Opt out of Chroma's anonymous usage telemetry. The variable Chroma reads is
# ANONYMIZED_TELEMETRY; it must be set before chromadb is imported/configured,
# which is why this lives in the very first cell.
os.environ["ANONYMIZED_TELEMETRY"] = "False"

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma


## Loading PDF

In [3]:
# Source document, resolved relative to the notebook's working directory.
PDF_PATH = "HR-Policy-Revised-JUNE-2022.pdf"

# Build the loader, then read the PDF into `pages` (iterated page by page below).
loader = PyPDFLoader(PDF_PATH)
pages = loader.load()

## Splitting PDF into Chunks

In [4]:
# Chunking parameters passed to the splitter: target chunk size and the
# overlap carried between consecutive chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [5]:
# Split each page's text independently and flatten the per-page results into
# one list of chunk strings, preserving page order.
all_chunks = [
    piece
    for page in pages
    for piece in text_splitter.split_text(page.page_content)
]

print(f" Total Chunks Created: {len(all_chunks)}")

 Total Chunks Created: 183


## Generating Embeddings

In [6]:
# Sentence-transformers model used to embed both the chunks and the queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# Bare expression so the notebook displays the configured embedding object.
embeddings

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

## Store Chunks in VectorStore

In [7]:
# Give every chunk a stable id derived from its position in `all_chunks`.
# Without explicit ids a fresh random id is generated for each text on every
# run, so re-running this cell against the already-persisted ./chroma_db
# appends another full copy of the document. With stable ids the same records
# are written again instead of being duplicated.
# NOTE(review): the two identical hits printed for the first test query look
# like exactly this duplication; delete ./chroma_db once to clear copies that
# were stored before ids were introduced. If the chunking parameters change and
# fewer chunks are produced, stale higher-numbered records will also remain
# until the directory is cleared.
chunk_ids = [f"chunk-{i}" for i in range(len(all_chunks))]

vectorstore = Chroma.from_texts(
    all_chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


## Persisting Database

In [8]:
# Ask the vector store to write itself to disk (it was created with
# persist_directory="./chroma_db").
# NOTE(review): the cell output echoes this statement the way a Python warning
# does; newer Chroma releases are understood to persist automatically and to
# deprecate this call - confirm against the installed version before removing.
vectorstore.persist()

  vectorstore.persist()


In [9]:
def retrieve_query(query, k=2):
    """Return the text of the `k` stored chunks most similar to `query`.

    Args:
        query: Question or search string handed to the vector store.
        k: Number of results to request (defaults to 2).

    Returns:
        A list with the ``page_content`` of each hit, in the order the
        vector store returned them.
    """
    return [
        hit.page_content
        for hit in vectorstore.similarity_search(query=query, k=k)
    ]

In [10]:
# Sample HR-policy questions used to smoke-test retrieval; each one is passed
# to retrieve_query() by the loop in the following cell.
queries = [
    "What is the maternity leave policy?",
    "How many paid leaves do employees get annually?",
    "What is the procedure for leave application?"
]

In [11]:
# Run every sample question through the retriever and print each retrieved
# chunk under the question it answers.
for q in queries:
    # Blank line + header so consecutive queries are visually separated.
    print(f"\n Query: {q}")
    for ans in retrieve_query(q):
        print(f"answer: {ans}")

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



 Query: What is the maternity leave policy?
answer: exceptional circumstances. 
 
10.7   MATERNITY LEAVE 
 
 Leave of up to 12 weeks is allowed upon presentation of doctor’s report. Approving    
authority is CEO.  
 
10.8    SPECIAL/ ACCIDENTAL LEAVE 
 
Special /Accident Leave shall be admissible to the employees suffering from  T.B., Cancer, 
Paralysis, Mental illness, Cardiac disease, renal dise ases, other complicated/high risk 
diseases, surgery and serious accident, disabling injuries resulting in complete bed rest for
answer: exceptional circumstances. 
 
10.7   MATERNITY LEAVE 
 
 Leave of up to 12 weeks is allowed upon presentation of doctor’s report. Approving    
authority is CEO.  
 
10.8    SPECIAL/ ACCIDENTAL LEAVE 
 
Special /Accident Leave shall be admissible to the employees suffering from  T.B., Cancer, 
Paralysis, Mental illness, Cardiac disease, renal dise ases, other complicated/high risk 
diseases, surgery and serious accident, disabling injuries resulting in c