# **Document creation and Vector database for DMART Annual Report**

In [None]:
import re
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader

# Location of the annual-report PDF that this cell loads and splits into sections.
# NOTE(review): the folder is spelled "anual_reports" — presumably this matches the
# on-disk directory name; confirm before "fixing" the spelling.
pdf_path = "./anual_reports/DMart/Annual Report 2023-24.pdf"

def clean_and_extract_index_lines(raw_text, max_title_words=6):
    """Parse the text of a table-of-contents page into section entries.

    Everything up to and including the first line that reads "Contents"
    (case-insensitive, ignoring surrounding whitespace) is discarded. Each
    following line that starts with a page number (optionally zero-padded)
    opens a new entry; subsequent lines that do not start with a page number
    are treated as a continuation of that entry's title.

    Args:
        raw_text: Full text of the contents page.
        max_title_words: Maximum number of whitespace-separated words kept in
            each title. Defaults to 6; pass ``None`` to keep the whole title.

    Returns:
        A list of ``{"page_start": str, "title": str}`` dicts in document
        order. ``page_start`` has leading zeros removed.

    Raises:
        ValueError: If no line equal to "Contents" is found.
    """
    # A single pattern decides both "this line opens an entry" and "this line
    # ends the previous entry's continuation". Using two different patterns
    # let zero-padded numbers such as "0012" be accepted as an entry start
    # but still be swallowed into the previous entry's title.
    entry_pattern = re.compile(r"^(0*\d{1,3})(\s+.+)?$")

    lines = raw_text.strip().splitlines()

    # 1. Locate the "Contents" heading
    try:
        start_idx = next(i for i, line in enumerate(lines) if line.strip().lower() == "contents")
    except StopIteration:
        raise ValueError("No se encontró una línea que diga 'Contents'.")

    lines = [line.strip() for line in lines[start_idx + 1:]]

    # 2. Walk the remaining lines, merging continuation lines into their entry
    entries = []
    i = 0
    while i < len(lines):
        match = entry_pattern.match(lines[i])
        if not match:
            # Text before the first numbered line (or stray noise) is skipped
            i += 1
            continue

        page = match.group(1)
        title = match.group(2).strip() if match.group(2) else ""

        # Absorb following lines until the next line that opens an entry
        i += 1
        while i < len(lines) and not entry_pattern.match(lines[i]):
            title += " " + lines[i]
            i += 1

        page = str(int(page))  # drop leading zeros
        title = ' '.join(title.strip().split()[0:max_title_words])

        entries.append({"page_start": page, "title": title.strip()})

    return entries

def assign_end_pages(sections):
    """Attach an inclusive end page to every section entry.

    A section ends on the page just before the next section starts. The last
    section has no successor, so its ``page_end`` is ``None``.

    Args:
        sections: Ordered sequence of dicts with ``"page_start"`` (anything
            ``int()`` accepts) and ``"title"`` (str).

    Returns:
        A new list of ``{"title", "page_start", "page_end"}`` dicts where the
        title is stripped and ``page_start`` is an int.
    """
    first_pages = [int(entry["page_start"]) for entry in sections]
    # Pair each section with the start page of its successor (None for the last)
    following_pages = first_pages[1:] + [None]

    return [
        {
            "title": entry["title"].strip(),
            "page_start": first,
            "page_end": None if following is None else following - 1,
        }
        for entry, first, following in zip(sections, first_pages, following_pages)
    ]


def extract_section_documents(pdf_path, year, type, sections, ticker):
    """Build one Document per report section from the pages of a PDF.

    The PDF is loaded page by page with ``PyPDFLoader``. For every section,
    the pages whose numeric ``page_label`` metadata lies between the section's
    ``page_start`` and ``page_end`` (inclusive) are concatenated, separated by
    a blank line. Pages without a ``page_label`` or with a non-numeric label
    are ignored. When ``page_end`` is ``None`` only the page equal to
    ``page_start`` is selected.

    Args:
        pdf_path: Path of the PDF; also stored as ``source`` metadata.
        year: Stored as ``year`` metadata.
        type: Stored as ``type`` metadata.
        sections: Iterable of dicts with ``page_start``, ``page_end`` and
            ``title`` keys.
        ticker: Stored as ``ticker`` metadata.

    Returns:
        A list with one ``Document`` per section, in the order of ``sections``.
    """
    pages = PyPDFLoader(pdf_path).load()

    def _section_text(first_page, last_page):
        # Collect the text of every numerically-labelled page inside the range
        chunks = []
        for page in pages:
            label = page.metadata.get("page_label")
            if label is None or not label.isdigit():
                continue
            if first_page <= int(label) <= (int(last_page) if last_page is not None else int(first_page)):
                chunks.append(page.page_content)
        return "\n\n".join(chunks)

    section_documents = []
    for section in sections:
        start = int(section["page_start"])
        end = section["page_end"]

        # One Document per section, carrying the identifying metadata
        section_documents.append(
            Document(
                page_content=_section_text(start, end),
                metadata={
                    "year": year,
                    'type': type,
                    "ticker": ticker,
                    "section_title": section["title"],
                    "page_start": start,
                    "page_end": end,
                    "source": pdf_path,
                },
            )
        )

    return section_documents

# Load every page of the PDF so the contents page can be located
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# The page labelled "B" is the one handed to the contents parser below.
# `.get` avoids a KeyError on pages without a label, and the explicit check
# avoids a NameError (or a stale value from an earlier run) when no page matches.
index_doc = None
for document in docs:
    if document.metadata.get('page_label') == "B":
        index_doc = document

if index_doc is None:
    raise ValueError(f"No page labelled 'B' was found in {pdf_path}; cannot locate the contents page.")
index_page = index_doc.page_content

# Contents text -> section entries -> entries with end pages
clean_content = clean_and_extract_index_lines(index_page)
sections = assign_end_pages(clean_content)

# One Document per section, tagged with year / report type / ticker
section_documents = extract_section_documents(pdf_path, 2024, 'Annual Report', sections, 'DMART')

In [100]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer

# Tokenizer of the embedding model, so chunk sizes are measured in that model's tokens
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
max_length = tokenizer.model_max_length

# NOTE(review): the all-MiniLM-L6-v2 model card reportedly truncates inputs longer
# than 256 word pieces — confirm that chunks of `max_length` tokens are embedded in full.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=tokenizer,
        chunk_size=max_length,
        chunk_overlap=int(max_length / 10),
        add_start_index=True,
        strip_whitespace=True)

# Split the section documents once. The previous version re-extracted the sections,
# built a second character-based splitter that was never used, and then repeated this
# exact split; that dead code is removed.
docs_split = text_splitter.split_documents(section_documents)

# Embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Vector store
# NOTE(review): re-running this cell presumably adds the same chunks to the existing
# collection again — verify, or bump the collection name per run.
vector_db = Chroma.from_documents(
    documents=docs_split,
    embedding=embedding_model,
    persist_directory="./chroma_db",
    collection_name='section_anual_report-2024_v5'
)

# Persist to disk
vector_db.persist()

Token indices sequence length is longer than the specified maximum sequence length for this model (948 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Restrict retrieval to one section of the report through a metadata filter.
# The value must equal the stored `section_title` metadata exactly.
retriever = vector_db.as_retriever(
    search_kwargs={
        "k": 5,
        "filter": {
            "section_title": {"$eq": "Consolidated Balance Sheet"}
        }
    }
)

# `invoke` is the current retriever entry point; `get_relevant_documents` is deprecated in LangChain.
results = retriever.invoke("What is the company's assets and liabilities?")
for doc in results:
    print(f"[{doc.metadata['section_title']}] {doc.page_content}\n")

[Consolidated Balance Sheet] Consolidated Balance Sheet
as at 31st March, 2024
 (` in crore)
Particulars Notes As at  
31st March, 2024
As at  
31st March, 2023
Assets
Non-current assets
(a)  Property, plant and equipment 2  11,759.19  9,725.61 
(b)  Capital work-in-progress 2  935.22  829.16 
(c)  Right-of-use assets 3  1,539.10  1,504.88 
(d)  Investment properties 4  8.09  8.54 
(e) Goodwill  78.27  78.27 
(f)  Intangible assets 5  30.35  23.18 
(g)  Financial assets
 (i)  Investments 6  0.01  0.01 
 (ii)  Other non-current financial assets 7  234.76  108.55 
(h)  Income tax assets (net)  17.56  17.50 
(i) Deferred tax assets (net) 8  4.72  1.77 
(j)  Other non-current assets 9  367.90  360.43 
Total non-current assets  14,975.17  12,657.90 
Current assets
(a)  Inventories 10  3,927.31  3,243.48 
(b)  Financial assets
 (i)    Investments 11  106.66  202.19 
 (ii) Trade receivables 12  166.37  62.16 
 (iii) Cash and cash equivalents 13  337.12  207.15 
 (iv) Bank balances other than 

In [89]:
from transformers import AutoTokenizer

# Load the tokenizer published with the all-MiniLM-L6-v2 sentence-embedding checkpoint
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# Show the maximum sequence length the tokenizer reports (the recorded output of this cell is 512)
print(tokenizer.model_max_length)  # Usually prints 512


512


In [None]:
import streamlit as st
from chromadb import Client
from transformers import pipeline
from your_embedding_utils import load_vectorstore

# Load the text-generation model (swap in any preferred checkpoint)
# NOTE(review): Streamlit re-executes the script on every interaction, so this model is
# presumably reloaded each time — consider caching it once the app is moved to a .py file.
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct", tokenizer="HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Load the persisted vector store
vectorstore = load_vectorstore("./chroma_db")

st.title("📚 Chat sobre reportes anuales")

# Search filters
company = st.selectbox("Selecciona la empresa", ["Dmart", "Otra"])
year = st.selectbox("Selecciona el año", [2021, 2022, 2023, 2024])
section = st.text_input("¿Buscar dentro de una sección específica? (opcional)")

# Question input
query = st.text_input("Haz tu pregunta")

if query:
    # Filter on the metadata keys the section documents are actually stored with
    # ("ticker", "year", "section_title"); the previous keys "company" and "section"
    # are never written, so they could not match anything. Tickers are stored
    # upper-case (e.g. 'DMART'), hence the `.upper()`.
    conditions = [{"ticker": company.upper()}, {"year": year}]
    if section:
        conditions.append({"section_title": section})
    # Assumes load_vectorstore returns a Chroma-backed store, whose `where` filter
    # needs several conditions wrapped in "$and" — TODO confirm.
    filters = {"$and": conditions}

    docs = vectorstore.similarity_search(query, k=4, filter=filters)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Eres un asistente que responde basándote en el siguiente contexto:

{context}

Pregunta: {query}
Respuesta:"""

    # return_full_text=False returns only the completion; by default the pipeline
    # echoes the whole prompt (context included) in front of the answer.
    result = pipe(prompt, max_new_tokens=200, return_full_text=False)[0]["generated_text"]
    st.write(result.strip())


## Stock API for financials table retrieval

In [88]:
import requests
import os
from dotenv import load_dotenv

# Read the API key from the environment (after loading .env) so it never lives in the notebook
load_dotenv()
secret = os.environ['INDIAN_STOCK_API']
stock = "DMART"

base_url = f"https://stock.indianapi.in/stock?name={stock}"
header = {"X-Api-Key":secret }

# A timeout keeps a stalled connection from hanging the kernel indefinitely, and
# raise_for_status surfaces HTTP errors here rather than as a confusing JSON or
# KeyError failure in a later cell.
response = requests.get(base_url, headers=header, timeout=30)
response.raise_for_status()

data = response.json()


In [64]:
import json
import pandas as pd

# Parse the balance-sheet line items of a single filing
def parse_balance_sheet(data):
    """Convert a list of balance-sheet line items into a ``{name: value}`` dict.

    Each item is a mapping with a ``displayName`` and a ``value``. Items whose
    value is ``None`` are skipped. Values are converted to ``float`` when
    possible and kept unchanged otherwise.

    Args:
        data: Iterable of item dicts, or ``None``.

    Returns:
        A dict keyed by the stripped ``displayName`` (an empty string when the
        name is missing or ``None``), or ``None`` when ``data`` is ``None``.
    """
    if data is None:
        return None

    balance_sheet = {}
    for item in data:
        # `or ''` also covers an explicit None name, which would crash on .strip()
        key = (item.get('displayName') or '').strip()
        value = item.get('value', None)
        if value is None:
            continue

        # Convert to float when possible; float() raises TypeError (not ValueError)
        # for non-scalar values such as lists or dicts, so both are handled.
        try:
            value = float(value)
        except (TypeError, ValueError):
            pass
        balance_sheet[key] = value

    return balance_sheet

# Select the annual filings that actually carry balance-sheet data ...
annual_filings = (
    entry
    for entry in data['financials']
    if entry['stockFinancialMap']['BAL'] is not None and entry['Type'] == 'Annual'
)

# ... and flatten each one into a record with its parsed balance sheet
parsed_balance_sheets = [
    {
        'FiscalYear': entry['FiscalYear'],
        'EndDate': entry['EndDate'],
        'StatementDate': entry['StatementDate'],
        'Type': entry['Type'],
        'BalanceSheet': parse_balance_sheet(entry['stockFinancialMap']['BAL']),
    }
    for entry in annual_filings
]

# Display the records as a table
pd.DataFrame(parsed_balance_sheets)



Unnamed: 0,FiscalYear,EndDate,StatementDate,Type,BalanceSheet
0,2025,2025-03-31,2025-03-31,Annual,"{'Cash Equivalents': 358.2, 'Short Term Invest..."
1,2024,2024-03-31,2024-03-31,Annual,"{'Cash': 285.84, 'Cash Equivalents': 351.4, 'S..."
2,2023,2023-03-31,2023-03-31,Annual,"{'Cash': 203.05, 'Cash Equivalents': 1204.38, ..."
3,2022,2022-03-31,2022-03-31,Annual,"{'Cash': 95.12, 'Cash Equivalents': 202.6, 'Sh..."
4,2021,2021-03-31,2021-03-31,Annual,"{'Cash': 191.5, 'Cash Equivalents': 1253.2, 'S..."
5,2020,2020-03-31,2020-03-31,Annual,"{'Cash': 105.87, 'Cash Equivalents': 1.15, 'Sh..."


In [65]:
# Expand the nested BalanceSheet dicts into columns next to the filing metadata
filings = pd.DataFrame(parsed_balance_sheets)
metadata_columns = ['FiscalYear', 'EndDate', 'StatementDate', 'Type']
df = pd.concat(
    [filings[metadata_columns], pd.json_normalize(filings['BalanceSheet'])],
    axis=1,
)

# Pivot so each line item is a row and each filing a column: the first transposed
# row (FiscalYear) supplies the column labels and the four metadata rows are dropped
transposed = df.T
columns = transposed.iloc[0, :].values
df = transposed.iloc[4:, :]
df.columns = list(columns)

In [87]:
# 
# Load the MRF annual report page by page
loader = PyPDFLoader("./anual_reports/MRF/MRF-Annual-Report-2024-Final.pdf")
docs = loader.load()

# Show a page count and a short sample instead of dumping every page into the output.
# NOTE(review): the recorded output shows empty page_content for the first pages —
# this PDF may not expose extractable text; confirm before building on it.
print(f"{len(docs)} pages loaded")
docs[:3]

[Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024-07-06T14:19:07+05:30', 'title': '', 'trapped': '/False', 'source': './anual_reports/MRF/MRF-Annual-Report-2024-Final.pdf', 'total_pages': 208, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024-07-06T14:19:07+05:30', 'title': '', 'trapped': '/False', 'source': './anual_reports/MRF/MRF-Annual-Report-2024-Final.pdf', 'total_pages': 208, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024-07-06T14:19:07+05:30', 'title': '', 'trapped': '/False', 'source': './anual_reports/MRF/MRF-Annual-Report-2024-Final.pdf'