In [7]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

import numpy as np
import faiss
import pdfplumber


In [2]:
# Hugging Face identifier of the sentence-embedding checkpoint used throughout the notebook.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Shared encoder instance; later cells and search_faiss() read this global to embed text.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [9]:
def create_index_add_embedding(embeddings):
    """Build a flat L2 FAISS index and load the given embeddings into it.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). It is
            converted to a C-contiguous float32 NumPy array before being
            handed to FAISS, so lists and float64 arrays are accepted too.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Same float32 coercion that search_faiss() applies to the query vector.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dimension); got shape {vectors.shape}"
        )

    embedding_dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(embedding_dimension)
    index.add(vectors)
    print(f"Number of vectors in FAISS index: {index.ntotal}")
    return index

def search_faiss(index,query, top_k,document):
    """Embed a query, search the FAISS index, and print the closest chunks.

    Args:
        index: Object exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair, e.g. the index built by
            ``create_index_add_embedding``.
        query: Query text to embed with the notebook-level ``model``.
        top_k: Number of nearest neighbours to request from the index.
        document: Sequence of text chunks, positionally aligned with the
            vectors stored in ``index``.

    Returns:
        None. The distances, indices and matching chunks are printed.
    """
    # encode() works on batches, so the single query is wrapped in a list.
    query_embedding = np.asarray(model.encode([query]), dtype='float32')

    # Ask the index for the top_k nearest stored vectors.
    distances, indices = index.search(query_embedding, top_k)

    # Print the raw search results
    print(f"Distances: {distances}")
    print(f"Indices: {indices}")

    # Retrieve and print the most relevant documents
    for idx in indices[0]:
        # FAISS fills missing neighbours with -1 when fewer than top_k vectors
        # exist; document[-1] would wrongly report the last chunk as a hit.
        if idx < 0:
            continue
        print(f"Relevant Document: {document[idx]}")


def pdf_to_text(path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        path: Location of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts concatenated in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    # Open the file named by the argument rather than a notebook-level global,
    # so the function works for any PDF and on a freshly started kernel.
    with pdfplumber.open(path) as pdf:
        # extract_text() may yield None for a page without extractable text;
        # treat that as an empty string instead of failing on concatenation.
        return "".join(page.extract_text() or "" for page in pdf.pages)
    

In [17]:

# Location of the PDF to index, and its text as extracted by pdf_to_text()
pdf_file_path = "media/Renuka/TDS/23042500058723MAHB_ChallanReceipt.pdf"
extracted_text = pdf_to_text(pdf_file_path)

# Chunking parameters: maximum characters per chunk, and how many characters
# neighbouring chunks share so context is not lost at the boundaries
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

# Break the extracted text into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_text(extracted_text)

In [18]:
# Each text chunk is treated as one retrievable document; `document` refers to
# the same list object as `chunks`.
document = chunks
# Embed the chunks with the sentence-transformer model loaded earlier.
embeddings = model.encode(document)
# Build the FAISS index over those embeddings (prints the stored vector count).
faiss_index = create_index_add_embedding(embeddings)

Number of vectors in FAISS index: 3


In [19]:
# Example query
# NOTE(review): the query asks about pre-EMI interest while the indexed PDF
# path suggests a tax challan receipt — confirm this is the intended document.
query = " how much pre-emi interest?"
# Number of nearest chunks to request from the index.
top_k = 2
# Prints the distances, the matching indices and the text of each matched chunk.
search_faiss(faiss_index,query, top_k,document)

Distances: [[1.4821765 1.4882874]]
Indices: [[0 1]]
Relevant Document: INCOME TAX DEPARTMENT
Challan Receipt
PAN : GFVPD8077N
Name : Ankush Sanjayrao Deshmukh
Assessment Year : 2024-25
Financial Year : 2023-24
Major Head : Income Tax (Other than Companies) (0021)
Minor Head : TDS on Sale of Property (800)
Amount (in Rs.) : ₹ 83,214
Amount (in words) : Rupees Eighty Three Thousand Two Hundred And Fourteen Only
CIN : 23042500058723MAHB
Acknowledgement Number : AK00769716
Mode of Payment : Net Banking
Bank Name : Bank Of Maharashtra
Relevant Document: Bank Reference Number : 2023042512013820004575016
Date of Deposit : 25-Apr-2023
BSR code : 0230001 Challan No : 00295 Tender Date : 25/04/2023
Thanks for being a committed taxpayer!
To express gratitude towards committed taxpayers, the Income Tax Department has started a unique
appreciation initiative. It recognises taxpayers’ commitment by awarding certificates of appreciation to
them.Login to e-filing portal and visit Appreciations and Rew