In [2]:
# Install required libraries
# %% [code]
# Install dependencies
!pip install pdfplumber sentence-transformers faiss-cpu transformers torch pandas


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5

In [3]:
# %% [code]
import pdfplumber
import pandas as pd
import numpy as np
from io import BytesIO
from google.colab import files
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

In [4]:
# %% [code]
class VectorStore:
    """Local vector store using FAISS"""
    def __init__(self, model_name='all-mpnet-base-v2'):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []

    def add_documents(self, chunks):
        embeddings = self.model.encode(chunks)
        self.documents.extend(chunks)

        # Create FAISS index if not exists
        if self.index is None:
            d = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(d)

        self.index.add(embeddings.astype('float32'))

    def search(self, query, k=3):
        query_embed = self.model.encode([query]).astype('float32')
        distances, indices = self.index.search(query_embed, k)
        return [self.documents[i] for i in indices[0]]


In [5]:
class QAModel:
    """Question Answering model using Hugging Face"""
    def __init__(self):
        self.pipeline = pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2"
        )

    def get_answer(self, context, question):
        result = self.pipeline(question=question, context=context)
        return result['answer']

In [10]:
# %% [code]
def process_pdf(file_bytes):
    """Extract text and tables from PDF with duplicate handling"""
    with pdfplumber.open(BytesIO(file_bytes)) as pdf:
        text = []
        tables = []
        for page in pdf.pages:
            # Extract text
            text.append(page.extract_text())

            # Extract and process tables
            table = page.extract_table()
            if table:
                # Clean duplicate column names
                headers = table[0]
                seen = {}
                unique_headers = []
                for h in headers:
                    if h in seen:
                        seen[h] += 1
                        unique_headers.append(f"{h}_{seen[h]}")
                    else:
                        seen[h] = 0
                        unique_headers.append(h)

                # Create DataFrame with unique headers
                df = pd.DataFrame(table[1:], columns=unique_headers)
                tables.append(df.reset_index(drop=True))

        # Concatenate tables safely
        if tables:
            combined_tables = pd.concat(tables, ignore_index=True)
        else:
            combined_tables = pd.DataFrame()

        return "\n".join(text), combined_tables

In [11]:
# %% [code]
# Upload PDF file
print("Please upload your P&L statement PDF:")
uploaded = files.upload()
pdf_bytes = next(iter(uploaded.values()))

Please upload your P&L statement PDF:


Saving Sample Financial Statement-1.pdf to Sample Financial Statement-1 (1).pdf


In [12]:
# Process PDF
full_text, tables = process_pdf(pdf_bytes)
chunks = [full_text[i:i+1000] for i in range(0, len(full_text), 1000)]  # Simple chunking


In [13]:
# Initialize components
vector_store = VectorStore()
vector_store.add_documents(chunks)
qa_model = QAModel()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [14]:
# %% [code]
# Ask questions
question = "What was the total revenue for Q3 2024?"

In [15]:
# Retrieve context
context_chunks = vector_store.search(question)
context = "\n".join(context_chunks)

In [16]:
# Generate answer
answer = qa_model.get_answer(context, question)

In [17]:
# Display results
print(f"\n📝 Question: {question}")
print(f"💡 Answer: {answer}")
print("\n🔍 Relevant Context Excerpt:")
print(context[:500] + "...")  # Show first 500 chars of context
print("\n📊 Sample Table Data:")
display(tables.head())


📝 Question: What was the total revenue for Q3 2024?
💡 Answer: 37,923

🔍 Relevant Context Excerpt:
ther income, net (Refer to Note 2.17) 4,711
2,701
Finance cost 470
284
Profit before tax 35,988
33,322
Income tax expense 9,740
9,214
Net Profit 26,248
24,108
Depreciation and amortization expense 4,678
4,225
Non-cash expenses other than depreciation and amortization —
—
(1) Financial Services include enterprises in Financial Services and Insurance
(2) Retail includes enterprises in Retail, Consumer Packaged Goods and Logistics
(3) Communication includes enterprises in Communication, Telecom OEM...

📊 Sample Table Data:


Unnamed: 0,Deferred Contract Cost,Unnamed: 2,None,"35,199","145,285","137,575",None_1
0,,,"Revenue from products and platforms 1,859 2,24...",,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,


In [18]:
# %% [code]
# Ask questions
question = "Show the operating margin for the past 6 months."

In [19]:
# Retrieve context
context_chunks = vector_store.search(question)
context = "\n".join(context_chunks)

In [20]:
# Generate answer
answer = qa_model.get_answer(context, question)

In [21]:
# Display results
print(f"\n📝 Question: {question}")
print(f"💡 Answer: {answer}")
print("\n🔍 Relevant Context Excerpt:")
print(context[:500] + "...")  # Show first 500 chars of context
print("\n📊 Sample Table Data:")
display(tables.head())


📝 Question: Show the operating margin for the past 6 months.
💡 Answer: 2024 2023 2024

🔍 Relevant Context Excerpt:
 WISE CLASSIFICATION OF CONDENSED CONSOLIDATED STATEMENT OF PROFIT AND LOSS
(In ₹ crore)
Particulars Note No. Three months ended March 31, Year ended March 31,
2024 2023 2024 2023
Revenue from operations 2.16 37,923 37,441 153,670 146,767
Cost of Sales 26,748 26,011 107,413 102,353
Gross profit 11,175 11,430 46,257 44,414
Operating expenses
Selling and marketing expenses 1,735 1,659 6,973 6,249
General and administration expenses 1,819 1,894 7,537 7,260
Total operating expenses 3,554 3,553 14,51...

📊 Sample Table Data:


Unnamed: 0,Deferred Contract Cost,Unnamed: 2,None,"35,199","145,285","137,575",None_1
0,,,"Revenue from products and platforms 1,859 2,24...",,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
