# **Document creation and Vector database for DMART Annual Report**

In [20]:
import re
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader


def clean_and_extract_index_lines(raw_text, max_title_words=6):
    """Parse the table-of-contents text of a report into index entries.

    Everything up to and including the first line that reads "Contents"
    (case-insensitive) is discarded. Each following line that starts with a
    page number (up to three significant digits, optionally zero-padded)
    opens a new entry; subsequent lines that do not start an entry are
    appended to that entry's title.

    Args:
        raw_text: Text of the page that holds the table of contents.
        max_title_words: Maximum number of words kept in each title.
            Defaults to 6; pass None to keep the whole title.

    Returns:
        A list of dicts ``{"page_start": str, "title": str}`` in document
        order. ``page_start`` has any leading zeros removed.

    Raises:
        ValueError: If no "Contents" line is found.
    """
    # A single pattern decides both "this line opens an entry" and "this line
    # ends the previous title", so zero-padded page numbers (e.g. "0102 Notes")
    # are never swallowed into the preceding title.
    entry_pattern = re.compile(r"^(0*\d{1,3})(\s+.+)?$")

    lines = raw_text.strip().splitlines()

    # 1. Locate the "Contents" heading
    start_idx = next(
        (idx for idx, line in enumerate(lines) if line.strip().lower() == "contents"),
        None,
    )
    if start_idx is None:
        raise ValueError("No se encontró una línea que diga 'Contents'.")

    lines = [line.strip() for line in lines[start_idx + 1:]]

    # 2. Walk the lines, merging wrapped titles into their entry
    entries = []
    i = 0
    while i < len(lines):
        match = entry_pattern.match(lines[i])
        i += 1
        if not match:
            # Text before the first numbered entry is ignored.
            continue

        title_parts = [match.group(2).strip()] if match.group(2) else []

        # Consume continuation lines until the next entry starts
        while i < len(lines) and not entry_pattern.match(lines[i]):
            title_parts.append(lines[i])
            i += 1

        words = " ".join(title_parts).split()
        entries.append({
            "page_start": str(int(match.group(1))),  # drop zero padding
            "title": " ".join(words[:max_title_words]),
        })

    return entries

def assign_end_pages(sections):
    """Attach an end page to every index entry.

    A section ends on the page before the next section starts. When the next
    section starts on the same page (or earlier), the end page is clamped to
    the section's own start page so the range is never inverted.

    Args:
        sections: Ordered list of dicts with "page_start" and "title" keys.

    Returns:
        A new list of dicts with "title" (stripped), "page_start" (int) and
        "page_end" (int, or None for the last section, whose end is unknown).
    """
    result = []

    for i, section in enumerate(sections):
        start = int(section["page_start"])
        if i + 1 < len(sections):
            next_start = int(sections[i + 1]["page_start"])
            # Never let a section end before it begins.
            end = max(start, next_start - 1)
        else:
            end = None

        result.append({
            "title": section["title"].strip(),
            "page_start": start,
            "page_end": end
        })

    return result


def extract_section_documents(pdf_path, year, type, sections, ticker):
    """Build one Document per report section from a PDF.

    The PDF is loaded page by page; pages whose "page_label" metadata is a
    digit string are assigned to the section whose page range contains that
    label. The text of a section's pages is joined with blank lines.

    Args:
        pdf_path: Path of the PDF to load.
        year: Value stored under the "year" metadata key.
        type: Value stored under the "type" metadata key.
        sections: List of dicts with "title", "page_start" and "page_end".
            A "page_end" of None means the section runs to the last
            numbered page of the document.
        ticker: Value stored under the "ticker" metadata key.

    Returns:
        A list of Document objects, one per entry in ``sections`` (a section
        with no matching pages yields a Document with empty content).
    """
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # Parse the printed page label of every page once, instead of once per
    # section. Pages without a numeric label (covers, roman numerals) are
    # skipped.
    numbered_pages = []
    for doc in docs:
        label = doc.metadata.get("page_label")
        if label is not None and label.isdigit():
            numbered_pages.append((int(label), doc))

    section_documents = []

    for section in sections:
        start = int(section["page_start"])
        end = section["page_end"]
        # None marks an open-ended section (the last one in the index); it
        # must not be truncated to its first page.
        last_page = int(end) if end is not None else None

        # Select the pages that belong to this section
        pages_in_section = [
            doc for page_number, doc in numbered_pages
            if page_number >= start and (last_page is None or page_number <= last_page)
        ]

        # Concatenate the text of every page in the section
        section_text = "\n\n".join([page.page_content for page in pages_in_section])

        # One Document per section, carrying the filter metadata
        section_doc = Document(
            page_content=section_text,
            metadata={
                "year": year,
                'type': type,
                "ticker": ticker,
                "section_title": section["title"],
                "page_start": start,
                "page_end": end,
                "source": pdf_path,
            }
        )
        section_documents.append(section_doc)

    return section_documents

# PDF Loader

pdf_path = "./data/anual_reports/DMART/Annual Report 2020.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# The table of contents is taken from the page whose 'page' metadata is 1.
# NOTE(review): PyPDFLoader page indices are presumably 0-based, making this
# the second physical page — confirm for each report.
index_doc = next(
    (document for document in docs if document.metadata.get('page') == 1),
    None,
)
if index_doc is None:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"No index page (metadata page == 1) found in {pdf_path}")
index_page = index_doc.page_content

clean_content = clean_and_extract_index_lines(index_page)
sections = assign_end_pages(clean_content)

#section_documents = extract_section_documents(pdf_path, 2024, 'Annual Report', sections, 'DMART')


In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer
from embedding.documents import DMartARExtractor

# Local sentence-transformers embedding model. It is only referenced by the
# commented-out Chroma build at the bottom of this cell.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # fast, accurate

# Extract document sections
# NOTE(review): the path points at the 2024 report while year=2020 is stored in
# the metadata — one of the two is wrong; confirm before building the index.
docs_extractor = DMartARExtractor(pdf_path="./data/anual_reports/DMART/Annual Report 2024.pdf", ticker='DMART', year=2020)
section_documents = docs_extractor.extract_section_documents()

# Splitter
# NOTE(review): chunk_size is presumably measured in characters, not tokens —
# verify that chunks fit the embedding model's input limit.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
docs_split = text_splitter.split_documents(section_documents)

## Vector
#vector_db = Chroma.from_documents(
#    documents=docs_split,
#    embedding=embedding_model,
#    persist_directory="./api/chroma_db",
#    collection_name='DMART'
#)

# Save to disk
#vector_db.persist()

In [16]:
from langchain_together import TogetherEmbeddings
from dotenv import load_dotenv
import os 

# Load environment variables from a .env file.
# NOTE(review): TogetherEmbeddings presumably reads its API key from the
# environment — confirm the variable name it expects.
load_dotenv()

# Hosted embedding model used for this test collection.
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-2k-retrieval",
)

# Embed the chunks produced by the splitter cell (docs_split) and store them in
# a Chroma collection under ./testdb. Requires that cell to have been run first.
vector_db = Chroma.from_documents(
    documents=docs_split,
    embedding=embeddings,
    persist_directory="./testdb/chroma_db",
    collection_name='section_anual_report-2020_v1'
)

In [17]:
# Write the collection to its persist directory.
# NOTE(review): newer Chroma/LangChain releases may persist automatically and
# no longer expose persist() — confirm against the installed version.
vector_db.persist()


In [2]:
# List the section titles parsed from the report index, to eyeball the parsing.
[section['title'] for section in docs_extractor.sections]

['Good Products Great Value',
 'About DMart',
 'Vision, Mission and Core Values',
 'Presence and Expansion Strategy',
 'Key Product Categories',
 'Key Performance Indicators',
 'Message from the Chairman',
 'Message from the Managing Director &',
 'Sustainability at DMart',
 'Corporate Social Responsibility',
 'Board of Directors',
 'Senior Leadership Team',
 'Corporate Information Statutory Reports',
 'Management Discussion & Analysis',
 'Directors’ Report',
 'Corporate Governance Report',
 'Business Responsibility Report Financial Statements Standalone',
 'Independent Auditor’s Report',
 'Annexure 1 to Independent Auditor’s Report',
 'Annexure 2 to Independent Auditor’s Report',
 'Standalone Balance Sheet',
 'Statement of Standalone Profit and Loss',
 'Statement of Standalone Cash Flows',
 'Statement of Changes in Equity',
 'Notes Consolidated Accounts',
 'Independent Auditor’s Report',
 'Annexure 1 to Independent Auditor’s Report',
 'Consolidated Balance Sheet',
 'Statement of Conso

In [34]:
# Wrap the vector store in a retriever that returns the 5 closest chunks.
retriever = vector_db.as_retriever(
    search_kwargs={
        "k": 5
    }
)

# NOTE(review): the recorded output of this cell is an empty list — check that
# vector_db points at a populated collection before relying on this query.
results = retriever.get_relevant_documents("What are DMARt's risks and concerns?")
results

[]

In [89]:
from transformers import AutoTokenizer

# Load the tokenizer of the embedding model to inspect its maximum input length.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
print(tokenizer.model_max_length)  # Usually prints 512


512


In [None]:
import streamlit as st
from chromadb import Client
from transformers import pipeline
from your_embedding_utils import load_vectorstore

# Load the generation model (swap in your preferred one)
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct", tokenizer="HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Load your vector store
vectorstore = load_vectorstore("./chroma_db")

st.title("📚 Chat sobre reportes anuales")

# Search filters
company = st.selectbox("Selecciona la empresa", ["Dmart", "Otra"])
year = st.selectbox("Selecciona el año", [2021, 2022, 2023, 2024])
section = st.text_input("¿Buscar dentro de una sección específica? (opcional)")

# Question input
query = st.text_input("Haz tu pregunta")

if query:
    # Filter on the metadata keys the section documents are stored with
    # ("ticker", "year", "section_title"). Several conditions have to be
    # combined under a single "$and" operator for Chroma.
    # NOTE(review): tickers are stored upper-case elsewhere in this notebook
    # ("DMART"); confirm the mapping from the display name.
    filter_conditions = [
        {"ticker": {"$eq": company.upper()}},
        {"year": {"$eq": year}},
    ]
    if section:
        filter_conditions.append({"section_title": {"$eq": section}})
    filters = {"$and": filter_conditions}

    docs = vectorstore.similarity_search(query, k=4, filter=filters)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Eres un asistente que responde basándote en el siguiente contexto:

{context}

Pregunta: {query}
Respuesta:"""

    # return_full_text=False keeps only the completion; otherwise the whole
    # prompt (context included) is echoed back in the answer.
    result = pipe(prompt, max_new_tokens=200, return_full_text=False)[0]["generated_text"]
    st.write(result.strip())


## Stock API for financials table retrieval

In [22]:
import requests
import os
from dotenv import load_dotenv

# Read the API key from the environment (.env file) instead of hard-coding it.
load_dotenv()
secret = os.environ['INDIAN_STOCK_API']
stock = "MRF"

base_url = "https://stock.indianapi.in/stock"
header = {"X-Api-Key": secret}

# Let requests build and URL-encode the query string; bound the wait so a
# stalled connection cannot hang the kernel.
response = requests.get(base_url, params={"name": stock}, headers=header, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error body as data.
response.raise_for_status()

data = response.json()


In [23]:
import json
import pandas as pd

# Función para leer y parsear el fichero
def parse_balance_sheet(data):
    """Flatten a list of balance-sheet line items into a name -> value dict.

    Args:
        data: Iterable of dicts carrying 'displayName' and 'value' keys, or
            None.

    Returns:
        A dict mapping each stripped display name to its value, converted to
        float when possible and left untouched otherwise. Items whose value
        is None are skipped. Returns None when ``data`` is None.
    """
    if data is None:
        return None

    balance_sheet = {}
    for item in data:
        # 'displayName' may be present but null; treat it like a missing name.
        key = (item.get('displayName') or '').strip()
        value = item.get('value', None)
        if value is None:
            continue
        # Convert to float when possible; keep the raw value otherwise
        try:
            value = float(value)
        except (TypeError, ValueError):
            # TypeError covers non-scalar values such as lists or dicts.
            pass
        balance_sheet[key] = value

    return balance_sheet

# Collect one record per annual statement that carries a balance sheet, then
# show the records as a table.

parsed_balance_sheets = []
for financial in data['financials']:
    raw_balance = financial['stockFinancialMap']['BAL']
    # Skip statements without a balance sheet and anything that is not annual
    if raw_balance is None or financial['Type'] != 'Annual':
        continue
    parsed_balance_sheets.append({
        'FiscalYear': financial['FiscalYear'],
        'EndDate': financial['EndDate'],
        'StatementDate': financial['StatementDate'],
        'Type': financial['Type'],
        'BalanceSheet': parse_balance_sheet(raw_balance),
    })


pd.DataFrame(parsed_balance_sheets)



Unnamed: 0,FiscalYear,EndDate,StatementDate,Type,BalanceSheet
0,2025,2025-03-31,2025-03-31,Annual,"{'Cash Equivalents': 376.49, 'Short Term Inves..."
1,2024,2024-03-31,2024-03-31,Annual,"{'Cash': 287.76, 'Cash Equivalents': 15.31, 'S..."
2,2023,2023-03-31,2023-03-31,Annual,"{'Cash': 199.54, 'Cash Equivalents': 48.97, 'S..."
3,2022,2022-03-31,2022-03-31,Annual,"{'Cash': 164.6, 'Cash Equivalents': 14.09, 'Sh..."
4,2021,2021-03-31,2021-03-31,Annual,"{'Cash': 129.92, 'Cash Equivalents': 36.93, 'S..."
5,2020,2020-03-31,2020-03-31,Annual,"{'Cash': 1.06, 'Cash Equivalents': 1177.46, 'S..."


In [24]:
# Reshape the records into a line-item x fiscal-year table.
balance_records = pd.DataFrame(parsed_balance_sheets)

# Statement metadata side by side with one column per balance-sheet line item
metadata_part = balance_records[['FiscalYear', 'EndDate', 'StatementDate', 'Type']]
line_items_part = pd.json_normalize(balance_records['BalanceSheet'])
transposed = pd.concat([metadata_part, line_items_part], axis=1).T

# Row 0 of the transposed frame holds the fiscal years, which become the column
# labels; rows 0-3 are the metadata rows and are dropped from the body.
columns = transposed.iloc[0, :].values
df = transposed.iloc[4:, :]
df.columns = list(columns)
df

Unnamed: 0,2025,2024,2023,2022,2021,2020
Cash Equivalents,376.49,15.31,48.97,14.09,36.93,1177.46
Short Term Investments,3606.07,2443.5,2037.69,3121.44,4744.25,1513.75
Cashand Short Term Investments,3982.56,2746.57,2286.2,3300.13,4911.1,2692.27
Accounts Receivable- Trade Net,3370.25,2953.61,2543.56,2384.37,2295.03,2302.29
Total Receivables Net,3375.27,3179.47,2712.87,2553.58,2450.11,2464.6
Total Inventory,5624.78,4468.58,4141.05,4129.67,2938.81,2905.18
Other Current Assets Total,413.79,90.05,53.22,162.07,29.12,52.31
Total Current Assets,13396.4,10549.55,9243.68,10187.95,10367.18,8151.04
Property/ Plant/ Equipment Total- Net,14368.08,14405.01,13137.89,10733.66,10418.65,10582.21
Intangibles Net,21.33,25.74,25.94,21.23,24.33,28.49


In [26]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Open the persisted collection with the same embedding model family used to
# build it, then run a metadata-filtered similarity search.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_db = Chroma(
    embedding_function=embeddings,
    collection_name="Indian_Companies_Annual_Reports",
    persist_directory="../api/chroma_db",
)

# Individual metadata conditions, combined below under a single "$and"
filter_conditions = [
    {"ticker": {"$eq": "DMART"}},
    {"year": {"$eq": 2024}},
]
filters = {"$and": filter_conditions}

# Top-5 chunks for the question, restricted to the ticker/year above
matched_docs = vector_db.similarity_search(
    "What are the risks and concerns for DMART in 2024?",
    k=5,
    filter=filters,
)

matched_docs

[]

In [3]:
# Test API

import requests

# Request parameters for the local /query endpoint
query = "What are the company's risks and concerns?"
ticker = "DMART"
url = "http://127.0.0.1:8000/query"
year = 2024
# NOTE(review): the parsed index title uses "&" ("Management Discussion &
# Analysis") — confirm how the API matches section names.
section = "Management Discussion and Analysis"
k = 5

payload = {
    "question": query,
    "ticker": ticker,
    "year": year,
    "section": section,
    "k": k
}

# Bound the wait so an unresponsive server cannot hang the kernel.
response = requests.post(url, json=payload, timeout=60)
if response.status_code == 200:
    print("Response:", response.json())
else:
    # Surface failures (e.g. a 500 from the server) instead of staying silent.
    print(f"Request failed with status {response.status_code}: {response.text}")

Response: {'chunks': ['Management team, led by the Managing Director, is responsible for \nproactively managing risks with appropriate mitigation measures \nand ensuring their implementation thereof.\nBelow are some of the key risks and concerns in our business:\n \x97  If we are unable to continue to offer daily low prices pursuant \nto our EDLC/EDLP pricing strategy, we risk losing our distinct \nadvantage and a substantial portion of our customers, which \nwill adversely affect our business, financial condition, and \nresults of operations \n \x97  Availability of commercially viable real estate properties at \nsuitable locations for our new stores, timely execution of sale \ndeeds/leave and license registrations and getting regulatory \napprovals for these properties \n \x97  Our ability to attract, hire, train and retain skilled employees\n \x97  Our inability to maintain an optimal level of inventory in our \nstores may impact our operations adversely \n \x97  Our continued under

In [39]:
# Show the raw Response object from the last request to inspect its status code.
response

<Response [500]>