In [7]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

import numpy as np
import faiss
import pdfplumber


In [2]:
# Sentence-embedding model used by the embedding and search cells of this notebook.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [9]:
def create_index_add_embedding(embeddings):
    """Build a flat L2 FAISS index and load the given vectors into it.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). It is
            converted to a C-contiguous float32 array before being added,
            mirroring the float32 cast applied to queries in ``search_faiss``.

    Returns:
        The populated ``faiss.IndexFlatL2`` index.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            when an empty list of chunks was encoded).
    """
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        # Fail with a readable message instead of an IndexError on shape[1].
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    # The index dimensionality is taken from the width of the embedding matrix.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    print(f"Number of vectors in FAISS index: {index.ntotal}")
    return index

def search_faiss(index, query, top_k, document):
    """Print the chunks of ``document`` that are nearest to ``query``.

    The query is embedded with the notebook-level ``model``, looked up in
    ``index``, and the raw distances and indices are printed, followed by
    each matched chunk.

    Args:
        index: Object exposing a FAISS-style ``search(vectors, k)`` method
            that returns ``(distances, indices)``.
        query: Query text to embed and search for.
        top_k: Number of nearest neighbours to request from the index.
        document: Sequence of text chunks, indexed by the positions the
            index returns.

    Returns:
        None. All results are printed.
    """
    # encode() is given a one-element list so the result is a batch of one
    # vector; FAISS expects float32 input.
    query_embedding = np.array(model.encode([query])).astype('float32')

    distances, indices = index.search(query_embedding, top_k)

    # Print the raw search results
    print(f"Distances: {distances}")
    print(f"Indices: {indices}")

    # Print the matched chunks, best match first
    for idx in indices[0]:
        # FAISS fills missing neighbours with -1 when the index holds fewer
        # than top_k vectors; skip those so document[-1] is never shown as a hit.
        if idx < 0:
            continue
        print(f"Relevant Document: {document[idx]}")


def pdf_to_text(path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        str: The text of all pages concatenated in page order, with no
        separator inserted between pages. A page whose ``extract_text()``
        result is ``None`` or empty contributes nothing.
    """
    # Open the file named by the argument rather than any notebook-level
    # variable, so the function works for whichever PDF the caller passes.
    with pdfplumber.open(path) as pdf:
        # `or ""` guards against a None result from extract_text(), which
        # would otherwise make the string concatenation raise TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)
    

In [10]:

# Source PDF whose text will be indexed
pdf_file_path = "media/Renuka/HOME_LOAN/HELP_IN_DISBURSEMENT_2/disb_advice.pdf"
extracted_text = pdf_to_text(pdf_file_path)

# Splitter configuration: chunks of at most 500 characters, with a
# 20-character overlap so neighbouring chunks share a little context
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)

# Cut the extracted text into chunks
chunks = text_splitter.split_text(extracted_text)

In [11]:
# Embed every chunk; search_faiss later maps index positions back into `document`
embeddings = model.encode(chunks)
document = chunks
faiss_index = create_index_add_embedding(embeddings)

Number of vectors in FAISS index: 5


In [15]:
# Example query
query = " how much pre-emi interest?"
top_k = 2
search_faiss(faiss_index,query, top_k,document)

Distances: [[0.9477917 1.147608 ]]
Indices: [[1 0]]
Relevant Document: _______________________
Total: 3716
_______________________
As per your request to commence repayment of principal prior to the loan being fully disbursed,
this repayment option is being given till any further disbursement of the loan is made.
EQUATED MONTHLY INSTALLMENT(EMI)
EMI - From 01-FEB-2024 Rs. 5850 **
till you avail of further disbursement.
The EMI/s are payable during the month or on or before the due date.
The rate of interest (ROI) will be subjected to reset*** on 01-MAR-24.
Relevant Document: PART DISBURSEMENT ADVICE CUM INSTALLMENT ADVICE Date : 11-JAN-24 03:53 P.M.
Loan Account No : 686857777
Loan Product : RESIDENT HOME LOAN-VARIABLE RATE-MONTHLY REST
Name : MR DESHMUKH ANKUSH SANJAYRAO
Loan Sanctioned : Rs. 5990001
Current Disbursement : Rs. 764287
Total Loan Disbursed : Rs. 764287
Loan Yet to be Disbursed : Rs. 5225714
PRE-EMI Interest * @8.45% for the month of JAN-2024 . Rs. 3,716
