# Step 1: Text Extraction from the PDF's #

<ul>
    <li>
        In this step there are 2 functions which make the logic of <b>extract_text_from_pdf.py.</br>
    </li>
</ul>
<ol>
    <li>extract_chunks_from_page: This function extracts chunks from a single page</li>
    <li>extract_text_from_pdf: This function is responsible for extracting text, calling the extract_chunks_from_page function to extract chunks in a loop</li>
</ol>
    </li>


In [1]:
import os
import pdfplumber
import pandas as pd
from tqdm import tqdm
from pathlib import Path

#--------------Directory paths---------------------------------
BASE_DIR = Path().resolve().parent
RAW_DIR = BASE_DIR / "data" / "raw_docs"
OUTPUT_FILE = BASE_DIR / "data" / "extracted_chunks.csv"
ERROR_LOG_FILE = BASE_DIR / "data" / "error_log.csv"


# Extracts text from a single page.
# Since this will get called in the below function, chunks from all the pages gets generated.

def extract_chunks_from_page(text, filename, page_num, chunk_size=300, overlap=50):
    words = text.strip().split()
    chunks = [] # In this variable all the extracted chunks will get stored of the specific page
    for x in range(0, len(words), chunk_size - overlap):
        chunk_words = words[x:x + chunk_size]
        if len(chunk_words) < 30:
            continue  # skip very short fragments
        chunk_text = " ".join(chunk_words)
        chunk_id = f"{filename}_p{page_num}_c{x}"
        chunks.append({
            "filename": filename,
            "page": page_num,
            "chunk_id": chunk_id,
            "text": chunk_text
        })
    return chunks

def extract_text_from_pdf(raw_dir):
    all_chunks = [] # Chunks from each page extracted using the function "extract_chunks_from_page" gets stored here.
    error_log = []
    pdf_files = [f for f in os.listdir(raw_dir) if f.lower().endswith(".pdf")]

    for filename in tqdm(pdf_files, desc="Extracting PDFs"):
        file_path = os.path.join(raw_dir, filename)

        try:
            with pdfplumber.open(file_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    try:
                        text = page.extract_text()
                        if not text:
                            continue
                        text = text.replace("\n", " ").strip()
                        chunks = extract_chunks_from_page(text, filename, page_num)
                        all_chunks.extend(chunks)
                    except Exception as page_err:
                        error_log.append({
                            "filename": filename,
                            "page": page_num,
                            "error": str(page_err)
                        })
        except Exception as file_err:
            error_log.append({
                "filename": filename,
                "page": -1,
                "error": f"Failed to open PDF: {file_err}"
            })

    return all_chunks, error_log

if __name__ == "__main__":
    chunks, errors = extract_text_from_pdf(RAW_DIR)
    
    # Saving the extracted chunks
    df = pd.DataFrame(chunks)
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"Done. Extracted {len(df)} chunks to {OUTPUT_FILE}")
    
    # Saving any errors if they occur
    if errors:
        error_df = pd.DataFrame(errors)
        error_df.to_csv(ERROR_LOG_FILE, index=False)
        print(f"Logged {len(error_df)} errors to {ERROR_LOG_FILE}")
    else:
        print("No extraction errors encountered.")


Extracting PDFs: 100%|███████████████████████████████████████████████████████████████████| 8/8 [08:45<00:00, 65.70s/it]


Done. Extracted 3311 chunks to C:\Users\DELL\Data Science\Deep Learning\NLP\Projects\financial-qa\data\extracted_chunks.csv
No extraction errors encountered.


### Inspection of Chunks ###

In [2]:
#--- Chunk length and distribution ----

df['word_count'] = df['text'].str.split().apply(len)
df['word_count'].describe()

count    3311.000000
mean      232.308064
std        86.817985
min        30.000000
25%       166.000000
50%       300.000000
75%       300.000000
max       300.000000
Name: word_count, dtype: float64

<b> Models usually can process around ~512 tokens. Here in the chunks that I've created doesnot exceed that limit as max=300.

In [3]:
#--- Empty and Garbage chunk -------

df[df['text'].str.strip() == '']

Unnamed: 0,filename,page,chunk_id,text,word_count


In [4]:
#--- Chunks with only Numbers, Non-word Characters and Punchuations -------

import re

pattern = r'^[\W\d\s]+$'  # non-word characters, digits, and whitespace
garbage_chunks = df[df['text'].str.match(pattern)]
garbage_chunks

Unnamed: 0,filename,page,chunk_id,text,word_count


In [5]:
#----- Chunks with very few unique words --------

df['unique_words'] = df['text'].apply(lambda x: len(set(x.lower().split())))
df[df['unique_words'] < 10]

Unnamed: 0,filename,page,chunk_id,text,word_count,unique_words


In [6]:
df['text'].duplicated().sum()

0

In [7]:
df.head()

Unnamed: 0,filename,page,chunk_id,text,word_count,unique_words
0,HDFC_AGM_Transcript_Aug2024.pdf,1,HDFC_AGM_Transcript_Aug2024.pdf_p1_c0,"CIN: L65920MH1994PLC080618 HDFC Bank Limited, ...",300,161
1,HDFC_AGM_Transcript_Aug2024.pdf,1,HDFC_AGM_Transcript_Aug2024.pdf_p1_c250,by way of remote e-voting will not be able to ...,143,100
2,HDFC_AGM_Transcript_Aug2024.pdf,2,HDFC_AGM_Transcript_Aug2024.pdf_p2_c0,"resolutions as read. Now, without any further ...",280,156
3,HDFC_AGM_Transcript_Aug2024.pdf,2,HDFC_AGM_Transcript_Aug2024.pdf_p2_c250,"present. With permission of the members, I cal...",30,28
4,HDFC_AGM_Transcript_Aug2024.pdf,3,HDFC_AGM_Transcript_Aug2024.pdf_p3_c0,"Friends, we would like to recall last year whe...",300,178


In [8]:
df.drop(columns=["word_count", "unique_words"], inplace=True)

# Step 2: Generating the Embeddings from the Extracted Chunks #

<ul>
    <li>
        The below cell contains the logic of generate_embeddings.py.
    </li>
    <li>
        Embedding simply means converting pieces of text in Vector of Numbers.
    </li>
    <li>
        We'll be using SentenceTransformer to capture the Semantic Context of the sentences properly and is much better than Word2Vec
    </li>
</ul>

In [9]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path

#--------Directory Paths-----------------------
BASE_DIR = Path().resolve().parent
CHUNKS_CSV = BASE_DIR / "data" / "extracted_chunks.csv"
EMBEDDINGS_OUTPUT = BASE_DIR / "data" / "embedded_chunks.npy"
METADATA_OUTPUT = BASE_DIR / "data" / "embedded_chunks.csv"

# Loading the  data
df_embed = pd.read_csv(CHUNKS_CSV)
texts = df_embed['text'].tolist()

# Loading model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encoding the data, this will create a vector of numbers of each sentence
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)

# Doing vstack is important, because it is important when we're planning to integrate the data with LLM's in the future
# It helps in calculating the cosine similarity and LLM's expect input in a certain way.
embedding_matrix = np.vstack(embeddings)
np.save(EMBEDDINGS_OUTPUT, embedding_matrix)

# Save metadata with index
df_embed['embedding_index'] = range(len(df_embed))
df_embed.to_csv(METADATA_OUTPUT, index=False)

print(f"Saved {embedding_matrix.shape[0]} embeddings to {EMBEDDINGS_OUTPUT}")


Batches:   0%|          | 0/104 [00:00<?, ?it/s]

Saved 3311 embeddings to C:\Users\DELL\Data Science\Deep Learning\NLP\Projects\financial-qa\data\embedded_chunks.npy


In [10]:
df_embed = pd.read_csv(r"C:\Users\DELL\Data Science\Deep Learning\NLP\Projects\financial-qa\data\embedded_chunks.csv")
df_embed.head()

Unnamed: 0,filename,page,chunk_id,text,embedding_index
0,HDFC_AGM_Transcript_Aug2024.pdf,1,HDFC_AGM_Transcript_Aug2024.pdf_p1_c0,"CIN: L65920MH1994PLC080618 HDFC Bank Limited, ...",0
1,HDFC_AGM_Transcript_Aug2024.pdf,1,HDFC_AGM_Transcript_Aug2024.pdf_p1_c250,by way of remote e-voting will not be able to ...,1
2,HDFC_AGM_Transcript_Aug2024.pdf,2,HDFC_AGM_Transcript_Aug2024.pdf_p2_c0,"resolutions as read. Now, without any further ...",2
3,HDFC_AGM_Transcript_Aug2024.pdf,2,HDFC_AGM_Transcript_Aug2024.pdf_p2_c250,"present. With permission of the members, I cal...",3
4,HDFC_AGM_Transcript_Aug2024.pdf,3,HDFC_AGM_Transcript_Aug2024.pdf_p3_c0,"Friends, we would like to recall last year whe...",4


# Step 3: Vector Search #
<ul>
    <li>
        The function below takes a user query, creates an embedding for it, and compares it with the precomputed embedding matrix (from document chunks).
It returns the top-k most relevant chunks from extracted_chunks.csv based on cosine similarity.
    </li>
</ul>

In [11]:
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

#------------Loading the Model-------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")  

BASE_DIR = Path().resolve().parent
embeddings = np.load(BASE_DIR/ "data" / "embedded_chunks.npy")
df_chunks = pd.read_csv(BASE_DIR / "data" / "extracted_chunks.csv")

def get_top_k_similar_chunks(question, k=5):
    question_embedding = model.encode([question])
    similarity = cosine_similarity(question_embedding, embeddings)[0]
    top_indices = similarity.argsort()[::-1][:k]

    top_chunks = []
    for index in top_indices:
        chunk = df_chunks.iloc[index].to_dict()
        chunk["similarity"] = similarity[index]
        top_chunks.append(chunk)
    return top_chunks



if __name__ == "__main__":
    user_prompt = "What was the net profit in FY 2022?"
    result = get_top_k_similar_chunks(user_prompt, k=5)
    df_ex = pd.DataFrame(result)
    print(df_ex)


                                     filename  page  \
0      HDFC_Integrated_Annual_Report_2022.pdf    17   
1  HDFC_Integrated_Annual_Report_2024.pdf.pdf   287   
2      HDFC_Integrated_Annual_Report_2022.pdf   218   
3  HDFC_Integrated_Annual_Report_2024.pdf.pdf   216   
4      HDFC_Integrated_Annual_Report_2022.pdf   129   

                                            chunk_id  \
0    HDFC_Integrated_Annual_Report_2022.pdf_p17_c250   
1  HDFC_Integrated_Annual_Report_2024.pdf.pdf_p28...   
2     HDFC_Integrated_Annual_Report_2022.pdf_p218_c0   
3  HDFC_Integrated_Annual_Report_2024.pdf.pdf_p21...   
4   HDFC_Integrated_Annual_Report_2022.pdf_p129_c250   

                                                text  similarity  
0  been increased to 72.7% and the coverage of to...    0.588073  
1  STANDALONE PROFIT AND LOSS ACCOUNT For the yea...    0.556045  
2  Profit and Loss Account For the year ended Mar...    0.535019  
3  finally paid off for some critical sectors may...    0.512183

## What's New Compared to Traditional Search ##

Feature                       | Ctrl+F in PDF | Current System                             |
----------------------------- | ------------- | --------------------------------------- |
Requires exact words          | Yes           | No                                      |
Understands synonyms          | No            | Yes (e.g., *"profit"* ≈ *"net income"*) |
Works across multiple files   | No            | Yes                                     |
Ranks based on meaning        | No            | Yes                                     |
Ready for summarization or QA | No            | Yes                                     |



# Step 4: Prompt Construction #

<ul> 
    <li> 
        This function takes the top-k retrieved chunks and the user query, then builds a structured prompt with chunk metadata and clear instructions for the LLM to follow. 
    </li> 
</ul>

In [12]:
def system_prompt(context_chunks, question):
    context = ""
    for index, chunk in enumerate(context_chunks):
        context += f"### Chunk {index+1} (Page {chunk['page']} of {chunk['filename']}):\n{chunk['text']}\n\n"

    prompt = f"""
You are a financial question-answering assistant working with company annual reports.

Only use the information provided in the context below to answer the question. Do not guess. 
If the answer cannot be found, respond with: "Not found in the documents."

-----------------------
Context:
{context}
-----------------------

Question: {question}

Answer:"""

    return prompt.strip()
    


# Step 5: Answer Generation via Local LLM #

<ul> 
    <li> 
        This function sends the constructed prompt to a local language model [Mistral via Ollama] and returns a grounded answer based strictly on the given context. 
    </li> 
</ul>

In [13]:
import requests


def local_generate(prompt):
    response = requests.post(
        'http://localhost:11434/api/generate',
        json={
            "model": "mistral",
            "prompt": prompt,
            "stream": False
        }
    )
    return response.json()["response"]

# Step 6: Response Generation and Display #

In [14]:
question = "Give me insights about the Education Loan, is the demand for such loans increasing every year? Whats the general trend happening?"
top_k_chunks = get_top_k_similar_chunks(question, k=5)


prompt = system_prompt(top_k_chunks, question)
response = local_generate(prompt)



print("Answer:", response)

Answer:  From the provided context, it is stated that HDFC Bank has been a key player in extending the reach of Government education programmes and enhancing its education loan offering. The bank has removed the cap on committed state education loan values and widened the range of courses for which students can avail loans. There is no explicit mention of the demand for these loans increasing every year, but the context suggests that the bank's focus on education loans indicates an increased emphasis or effort towards it. Therefore, it can be inferred that there might be an increasing trend in demand for education loans, but direct numerical data is not provided in the context to support this conclusion.


In [15]:
question = "What was the net profit in FY 2022?"
top_k_chunks = get_top_k_similar_chunks(question, k=5)


prompt = system_prompt(top_k_chunks, question)
response = local_generate(prompt)


print("Answer:", response)

Answer:  The net profit in FY 2022 was ` 36,961.3 crore. (Refer to Chunk 3 from HDFC_Integrated_Annual_Report_2022.pdf)


In [15]:
User_Prompts = []

print("Hi, Im Your Finacial QA Assisatnt. How can i help you?")
while True:
    Question = input(">>")
    if Question.lower() == "quit":
        print("All queries have been taken. Please Wait for a while")
        break
    else:
        User_Prompts.append(Question)

Hi, Im Your Finacial QA Assisatnt. How can i help you?


>> Net Profit FY 2022?
>> Best Performing Investment in the year 2024?
>> quit


All queries have been taken. Please Wait for a while


In [16]:
User_Prompts

['Net Profit FY 2022?', 'Best Performing Investment in the year 2024?']

In [17]:
for question in User_Prompts:
    top_k_chunks = get_top_k_similar_chunks(question, k=5)
    prompt = system_prompt(top_k_chunks, question)
    response = local_generate(prompt)
    print(question)
    print("Answer:", response)
    print("==================================")

Net Profit FY 2022?
Answer:  The Net Profit for FY 2022 can be found in the Chunk 2 of the provided context. It is 369,613,552 according to the report. This amount is in '000 and should be multiplied by 1000 to get the actual value. So, the Net Profit for FY 2022 is approximately 369,613,552 * 1000 = 369,613,552 (INR).
Best Performing Investment in the year 2024?
Answer:  Based on the provided data from the annual report for the year 2024 (HDFC_Integrated_Annual_Report_2024.pdf), we can find the best performing investment category under "Major categories of plan assets as a percentage of fair value of total plan assets" section:

1. Equity shares with 30.34% of fair value to total plan assets.

This indicates that equity shares had the highest proportion of the fair value of total plan assets in 2024, suggesting they were likely the best performing investment category compared to government securities, debenture and bonds, and others during that year. However, it is important to note t

In [18]:
import pandas
import numpy
import pdfplumber
import requests
import sentence_transformers
import fastapi
import tqdm
import sklearn
import re
import requests


print("Pandas", pandas.__version__)
print("Numpy", numpy.__version__)
print("Pdfplumber", pdfplumber.__version__)
print("requests", requests.__version__)
print("sentence_transformers", sentence_transformers.__version__)
print("fastapi", fastapi.__version__)
print("tqdm", tqdm.__version__)
print("sklearn", sklearn.__version__)
print("re", re.__version__)
print("requests", requests.__version__)

Pandas 2.2.2
Numpy 1.26.4
Pdfplumber 0.11.4
requests 2.32.3
sentence_transformers 5.0.0
fastapi 0.115.13
tqdm 4.66.5
sklearn 1.5.1
re 2.2.1
requests 2.32.3
