In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

## Loading PDF

In [2]:
# Source document for the index; the path is relative to the notebook's working directory.
pdf_path = "HR-Policy-Revised-JUNE-2022.pdf"

# Build the loader, then read the PDF into `pages` (each item exposes `.page_content`,
# which the chunking cell reads).
loader = PyPDFLoader(pdf_path)
pages = loader.load()

## Splitting PDF into Chunks

In [3]:
# Splitter shared by the chunking cell: chunks of size 500 with an overlap of 100
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [5]:
# Split every page's text independently and flatten the pieces into a single
# list of strings, keeping page order.
all_chunks = [
    piece
    for page in pages
    for piece in text_splitter.split_text(page.page_content)
]

print(f" Total Chunks Created: {len(all_chunks)}")

 Total Chunks Created: 183


## Generating Embeddings

In [6]:
# Sentence-transformers checkpoint used to embed both the chunks and the queries.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Bare last expression so the notebook displays the configured embedder.
embeddings

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

## Store Chunks in VectorStore

In [10]:
# Build the Chroma collection from the text chunks and store it under ./chroma_db.
#
# Each chunk gets a stable, position-based id. Without explicit ids the store
# assigns new ones on every call, so re-running this cell against the same
# persist_directory adds another full copy of the chunks and similarity search
# starts returning duplicate hits. With fixed ids a re-run is expected to
# overwrite the existing entries instead.
# NOTE(review): this relies on the installed LangChain/Chroma versions upserting
# on matching ids -- confirm. Entries written by earlier runs (under generated
# ids) are not removed by this change; delete ./chroma_db once to clear them.
chunk_ids = [f"chunk-{position}" for position in range(len(all_chunks))]

vectorstore = Chroma.from_texts(
    all_chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


## Persisting Database

In [11]:
# Ask the vector store to write its contents to the persist_directory it was created with.
# NOTE(review): the cell output shows a warning pointing at this line; newer Chroma
# releases presumably persist automatically, which would make this call redundant --
# confirm against the installed version before removing it.
vectorstore.persist()

  vectorstore.persist()


In [15]:
# Smoke-test the index: fetch the closest chunks for a sample question and print their text.
query = "How many free leaves ?"
top_k = 5  # number of nearest chunks to retrieve
results = vectorstore.similarity_search(query, k=top_k)

print("\n🔹 Similarity Search Results:")
for match in results:
    print(match.page_content)


🔹 Similarity Search Results:
Department:     _______________        
 Casual 
  
 
 
No of Leaves:  _______________        From:                  __________    to    __________   
 
 
Reasons:      
                        
 
                        
 
                        
 
 
Contact No During Leave:    ____________________________        
    
 
 
Current Leave Balance Available: 
                         
                 _____________________      _____________________
[Annexure IX]. One day casual leave will be charged against four short leaves in a 
month. Similarly, two half day leaves in a month will be charged as one day 
casual leave in a month.  
 
In case any employee fails to submit his/her appro ved short/half/day leave, as 
required above, his salary of that day will be withheld.  
3)  Late Arrival; One day salary/wage will be deducted on three consecutive late 
arrivals of an employee without approved short/half/ -leave thereof.. Further,
PASDEC  HR Policy (Last R