# **Document creation and Vector database for DMART Annual Report**

In [20]:
import re
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader


def clean_and_extract_index_lines(raw_text):
    lines = raw_text.strip().splitlines()

    # 1. Encontrar la línea "Contents"
    try:
        start_idx = next(i for i, line in enumerate(lines) if line.strip().lower() == "contents")
    except StopIteration:
        raise ValueError("No se encontró una línea que diga 'Contents'.")

    lines = lines[start_idx + 1:]

    # 2. Procesar y concatenar secciones
    entries = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        match = re.match(r"^(0*\d{1,3})(\s+.+)?$", line)

        if match:
            page = match.group(1)
            title = match.group(2).strip() if match.group(2) else ""

            # Buscar líneas siguientes que no empiecen con número
            i += 1
            while i < len(lines) and not re.match(r"^\d{1,3}(\s|$)", lines[i].strip()):
                title += " " + lines[i].strip()
                i += 1

            page = str(int(page))
            title = ' '.join(title.strip().split()[0:6])

            entries.append({"page_start": page, "title": title.strip()})
        else:
            i += 1

    return entries

def assign_end_pages(sections):
    result = []

    for i, section in enumerate(sections):
        start = int(section["page_start"])
        end = int(sections[i + 1]["page_start"]) - 1 if i + 1 < len(sections) else None

        result.append({
            "title": section["title"].strip(),
            "page_start": start,
            "page_end": end
        })

    return result


def extract_section_documents(pdf_path, year, type, sections, ticker):
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    section_documents = []

    for section in sections:
        start = int(section["page_start"])
        end = section["page_end"] 

        # Seleccionar páginas de esta sección
        pages_in_section = [
            doc for doc in docs
            if doc.metadata.get("page_label") is not None and doc.metadata.get("page_label").isdigit()
            and start <= int(doc.metadata["page_label"]) <= (int(end) if end is not None else int(start))
        ]

        # Concatenar el texto de todas las páginas de esta sección
        section_text = "\n\n".join([page.page_content for page in pages_in_section])

        # Crear un nuevo Document por sección
        section_doc = Document(
            page_content=section_text,
            metadata={
                "year": year,
                'type':type,
                "ticker": ticker,
                "section_title": section["title"],
                "page_start": start,
                "page_end": end,
                "source": pdf_path,
            }
        )
        section_documents.append(section_doc)

    return section_documents

# PDF Loader

pdf_path = "./data/anual_reports/DMART/Annual Report 2020.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

for document in docs:
    if document.metadata['page'] == 1:
        index_doc = document
        index_page = document.page_content

clean_content = clean_and_extract_index_lines(index_page)
sections = assign_end_pages(clean_content)

#section_documents = extract_section_documents(pdf_path, 2024, 'Annual Report', sections, 'DMART')


In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer
from embedding.documents import DMartARExtractor

# Choose your model for tokenizer (matching your embedding/model)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # rápido, preciso

# Extract document sections
docs_extractor = DMartARExtractor(pdf_path="./data/anual_reports/DMART/Annual Report 2024.pdf", ticker='DMART', year=2020)
section_documents = docs_extractor.extract_section_documents()

# Splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
docs_split = text_splitter.split_documents(section_documents)

## Vector
#vector_db = Chroma.from_documents(
#    documents=docs_split,
#    embedding=embedding_model,
#    persist_directory="./api/chroma_db",
#    collection_name='DMART'
#)

# Guardar en disco
#vector_db.persist()

In [16]:
from langchain_together import TogetherEmbeddings
from dotenv import load_dotenv
import os 

load_dotenv()

embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-2k-retrieval",
)

vector_db = Chroma.from_documents(
    documents=docs_split,
    embedding=embeddings,
    persist_directory="./testdb/chroma_db",
    collection_name='section_anual_report-2020_v1'
)

In [17]:
vector_db.persist()


In [2]:
[section['title'] for section in docs_extractor.sections]

['Good Products Great Value',
 'About DMart',
 'Vision, Mission and Core Values',
 'Presence and Expansion Strategy',
 'Key Product Categories',
 'Key Performance Indicators',
 'Message from the Chairman',
 'Message from the Managing Director &',
 'Sustainability at DMart',
 'Corporate Social Responsibility',
 'Board of Directors',
 'Senior Leadership Team',
 'Corporate Information Statutory Reports',
 'Management Discussion & Analysis',
 'Directors’ Report',
 'Corporate Governance Report',
 'Business Responsibility Report Financial Statements Standalone',
 'Independent Auditor’s Report',
 'Annexure 1 to Independent Auditor’s Report',
 'Annexure 2 to Independent Auditor’s Report',
 'Standalone Balance Sheet',
 'Statement of Standalone Profit and Loss',
 'Statement of Standalone Cash Flows',
 'Statement of Changes in Equity',
 'Notes Consolidated Accounts',
 'Independent Auditor’s Report',
 'Annexure 1 to Independent Auditor’s Report',
 'Consolidated Balance Sheet',
 'Statement of Conso

In [34]:
retriever = vector_db.as_retriever(
    search_kwargs={
        "k": 5
    }
)

results = retriever.get_relevant_documents("What are DMARt's risks and concerns?")
results

[]

In [89]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
print(tokenizer.model_max_length)  # Usually prints 512


512


In [None]:
import streamlit as st
from chromadb import Client
from transformers import pipeline
from your_embedding_utils import load_vectorstore

# Cargar modelo (puedes cambiar a tu preferido)
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct", tokenizer="HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Cargar tu vectorstore
vectorstore = load_vectorstore("./chroma_db")

st.title("📚 Chat sobre reportes anuales")

# Filtros para buscar
company = st.selectbox("Selecciona la empresa", ["Dmart", "Otra"])
year = st.selectbox("Selecciona el año", [2021, 2022, 2023, 2024])
section = st.text_input("¿Buscar dentro de una sección específica? (opcional)")

# Input de la pregunta
query = st.text_input("Haz tu pregunta")

if query:
    # Consulta con filtros
    filters = {"company": company, "year": year}
    if section:
        filters["section"] = section

    docs = vectorstore.similarity_search(query, k=4, filter=filters)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Eres un asistente que responde basándote en el siguiente contexto:

{context}

Pregunta: {query}
Respuesta:"""

    result = pipe(prompt, max_new_tokens=200)[0]["generated_text"]
    st.write(result.strip())


## Stock API for financials table retrieval

In [16]:
import requests
import os
from dotenv import load_dotenv

load_dotenv()
secret = os.environ['INDIAN_STOCK_API']
stock = "MRF"

base_url = f"https://stock.indianapi.in/stock?name={stock}"
header = {"X-Api-Key":secret }

response = requests.get(base_url, headers=header)

data = response.json()


In [None]:
import json
import pandas as pd

# Función para leer y parsear el fichero
def parse_balance_sheet(data):
    if data is not None:
        balance_sheet = {}
        for item in data:
            key = item.get('displayName', '').strip()
            value = item.get('value', None)
            if value is not None:
                # Convertir a float si es posible
                try:
                    value = float(value)
                except ValueError:
                    pass
                balance_sheet[key] = value
            
        return balance_sheet

# Llamar a la función y mostrar resultados

parsed_balance_sheets = [ {
    'FiscalYear': financial['FiscalYear'],
    'EndDate': financial['EndDate'],
     'StatementDate': financial['StatementDate'],
     'Type':financial['Type'],
     'BalanceSheet': parse_balance_sheet(financial['stockFinancialMap']['BAL'])

}  for financial in data['financials'] if financial['stockFinancialMap']['BAL'] is not None and financial['Type'] == 'Annual']


pd.DataFrame(parsed_balance_sheets)



Unnamed: 0,FiscalYear,EndDate,StatementDate,Type,BalanceSheet
0,2025,2025-03-31,2025-03-31,Annual,"{'Cash Equivalents': 376.49, 'Short Term Inves..."
1,2024,2024-03-31,2024-03-31,Annual,"{'Cash': 287.76, 'Cash Equivalents': 15.31, 'S..."
2,2023,2023-03-31,2023-03-31,Annual,"{'Cash': 199.54, 'Cash Equivalents': 48.97, 'S..."
3,2022,2022-03-31,2022-03-31,Annual,"{'Cash': 164.6, 'Cash Equivalents': 14.09, 'Sh..."
4,2021,2021-03-31,2021-03-31,Annual,"{'Cash': 129.92, 'Cash Equivalents': 36.93, 'S..."
5,2020,2020-03-31,2020-03-31,Annual,"{'Cash': 1.06, 'Cash Equivalents': 1177.46, 'S..."


In [18]:
df = pd.DataFrame(parsed_balance_sheets)

df = pd.concat(
    [df[['FiscalYear', 'EndDate', 'StatementDate', 'Type']], 
     pd.json_normalize(df['BalanceSheet'])
     ], axis=1
)

columns = df.T.iloc[0,:].values
df = df.T.iloc[4:,:]
df.columns = list(columns)
df

Unnamed: 0,2025,2024,2023,2022,2021,2020
Cash Equivalents,376.49,15.31,48.97,14.09,36.93,1177.46
Short Term Investments,3606.07,2443.5,2037.69,3121.44,4744.25,1513.75
Cashand Short Term Investments,3982.56,2746.57,2286.2,3300.13,4911.1,2692.27
Accounts Receivable- Trade Net,3370.25,2953.61,2543.56,2384.37,2295.03,2302.29
Total Receivables Net,3375.27,3179.47,2712.87,2553.58,2450.11,2464.6
Total Inventory,5624.78,4468.58,4141.05,4129.67,2938.81,2905.18
Other Current Assets Total,413.79,90.05,53.22,162.07,29.12,52.31
Total Current Assets,13396.4,10549.55,9243.68,10187.95,10367.18,8151.04
Property/ Plant/ Equipment Total- Net,14368.08,14405.01,13137.89,10733.66,10418.65,10582.21
Intangibles Net,21.33,25.74,25.94,21.23,24.33,28.49


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

vector_db = Chroma(
    persist_directory="./api/chorma_db",
    collection_name="Indian_Companies_Annual_Reports",
    embedding_function=embeddings
)


# Build Chroma filter using $and
filter_conditions = [
    {
        "ticker": 
        {"$eq": "DMART"}     
    },
    {
        "year": 
        {"$eq": 2023}
    }
]


# Create the filter with proper ChromaDB structure
filters = {"$and": filter_conditions}

# Perform the vector similarity search
matched_docs = vector_db.similarity_search(
    query="What are the company's risks and concerns?",
    k=5,
    filter=filters
)

matched_docs

[Document(id='cfd7fb89-03aa-4c36-8d90-2419801a428e', metadata={'page_start': 33, 'ticker': 'DMART', 'source': 'Annual Report 2023.pdf', 'year': 2023, 'section_title': 'Directors’ Report', 'page_end': 53}, page_content='The assessment also enlightens the fact that centre possesses all \nrequired permissions from the Atomic Energy Regulatory Board \n(AERB) for using the CT Simulator. The medical staff handling \nthe CT simulator also adhered to all radiation safety protocol \nmeasures like using a TLD badge. No major challenges related to \nproject execution were reported or observed.\nExpansion of Dr. Hedgewar Hospital in Aurangabad  \n(Dr. Babasaheb Ambedkar Vaidyakiya Pratishthan)\nDMart collaborated with Dr. Babasaheb Ambedkar Vaidyakiya \nPratishthan to provide support for the expansion of Dr. Hedgewar \nHospital in Aurangabad. The project included support for \nbuilding expansion and expansion of oxygen pipeline to beds \nnot equipped with oxygen facility. This project is in alignm

In [None]:
vector_db.get(
    where={"ticker": "DMART", "year": 2023}
)


ValueError: Expected where to have exactly one operator, got {'ticker': 'DMART', 'year': 2023} in get.

In [21]:
from langchain_community.document_loaders import PyPDFLoader

# Cargar el archivo PDF
loader = PyPDFLoader("C:/Users/esteb/OneDrive/Escritorio/Cursos y Materias/AnualReportRAG/data/anual_reports/MRF/Annual-Report-2024.pdf")

# Extraer las páginas como objetos Document
documents = loader.load()

# `documents` es una lista de `Document` con metadata por página
print(documents[0].page_content[:500])  # Mostrar contenido de la primera página





In [22]:
documents

[Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024-07-06T14:19:07+05:30', 'title': '', 'trapped': '/False', 'source': 'C:/Users/esteb/OneDrive/Escritorio/Cursos y Materias/AnualReportRAG/data/anual_reports/MRF/Annual-Report-2024.pdf', 'total_pages': 208, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024-07-06T14:19:07+05:30', 'title': '', 'trapped': '/False', 'source': 'C:/Users/esteb/OneDrive/Escritorio/Cursos y Materias/AnualReportRAG/data/anual_reports/MRF/Annual-Report-2024.pdf', 'total_pages': 208, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Windows)', 'creationdate': '2024-07-05T17:16:03+05:30', 'moddate': '2024

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

vector_db = Chroma(
    persist_directory="./api/chroma_db",
    collection_name="Indian_Companies_Annual_Reports",
    embedding_function=embeddings
)


all_data = vector_db._collection.get(include=["documents", "metadatas"])

filtered = [
    doc for doc, meta in zip(all_data['documents'], all_data['metadatas'])
    if meta.get("ticker") == "MRF" and meta.get("year") == 2024 and meta.get('section_title') == 'Management Discussion and Analysis'.upper()]

print('\n'.join(filtered))

40
MANAGEMENT DISCUSSION AND ANALYSIS
(Within the limits set by Company’s competitive position)
In the year gone by, Global economy delivered better than expected 
growth despite geopolitical issues casting a shadow on the world. World 
economy moved towards a soft landing with growth holding up and 
inflation declining. MRF continued its industry leading growth in the 
domestic market.
Economic growth was steady during the year despite monetary tightening, 
was muted with increased geo economic fragmentation, trade restrictions 
and lower consumption arising from tight financial conditions. Emerging 
economies performed better than the developed world. There was 
steady reduction in inflation in both developed and emerging economies 
prompting Central Banks to pause interest rate hikes, though inflation is still 
above the target in most economies. Geo political issues continued causing 
disturbance to the world with the continuing Ukraine war, tensions in West 
Asia and disturbances 

In [7]:
import requests

payload = {
    "ticker": "MRF",
    "year": 2023
}
ticker ="MRF"
section = 'Management Discussion and Analysis' 

if ticker == "MRF":
    section = section.upper()

#payload['section'] = section
    

response = requests.post('https://container-finrobot.dc4gg5b1dous0.eu-west-3.cs.amazonlightsail.com//extract_section_names', json=payload)
print(response.status_code)
print(response.json())

200
{'year': 2023, 'ticker': 'MRF', 'section_names': ['NEW PRODUCT LAUNCH', 'BOARD OF DIRECTORS', 'MOTORSPORT', '10', 'MRF T & S', 'REPORT ON CORPORATE GOVERNANCE', 'NOTES FORMING PART OF THE FINANCIAL STATEMENTS', 'BUSINESS RESPONSIBILITY AND SUSTAINABILITY REPORT', 'STANDALONE FINANCIAL STATEMENTS', 'AWARDS AND ACCOLADES', 'STATEMENT OF PROFIT AND LOSS', 'FORM AOC-1', '11', 'ESG', 'GROWTH STORY', 'MRF CORP', "CHAIRMAN'S MESSAGE", 'BALANCE SHEET', 'CONSOLIDATED FINANCIAL STATEMENTS']}


In [1]:
from database.documents import DMartARExtractor
import os 
api_key = os.getenv('OPENAI_API_KEY')

extractor = DMartARExtractor(pdf_path='C:/Users/esteb/OneDrive/Escritorio/Cursos y Materias/AnualReportRAG/data/anual_reports/DMART/Annual Report 2024.pdf', year=2024, ticker='DMART',api_key=api_key )


In [6]:
extractor.assign_end_pages_dmart()

In [2]:
extractor.extract_section_documents()

Good Products Great Value
Correctly created document Good Products Great Value
About DMart
Correctly created document About DMart
Core Values, Vision and Mission
Correctly created document Core Values, Vision and Mission
Presence and Expansion Strategy
Correctly created document Presence and Expansion Strategy
Key Product Categories
Correctly created document Key Product Categories
Key Performance Indicators
Correctly created document Key Performance Indicators
Message from the Chairman
Correctly created document Message from the Chairman
Environmental Focus at DMart
Correctly created document Environmental Focus at DMart
Corporate Social Responsibility
Correctly created document Corporate Social Responsibility
Board of Directors
Correctly created document Board of Directors
Senior Leadership Team
Correctly created document Senior Leadership Team
Corporate Information Statutory Reports
Correctly created document Corporate Information Statutory Reports
Management Discussion and Analysis

[Document(metadata={'year': 2024, 'ticker': 'DMART', 'section_title': 'Good Products Great Value', 'page_start': 1, 'page_end': 1, 'source': 'Annual Report 2024.pdf'}, page_content='Good Products  \nGreat Value \nDMart has continued in its mission to fulfill customers’ everyday needs by providing good \nquality products at great value. Our customer-centric approach has helped us achieve \ncredible growth thus far.'),
 Document(metadata={'year': 2024, 'ticker': 'DMART', 'section_title': 'About DMart', 'page_start': 2, 'page_end': 2, 'source': 'Annual Report 2024.pdf'}, page_content='CORPORATE OVERVIEW \nSTATUTORY REPORTS\nFINANCIAL STATEMENTS\nNOTICE OF THE AGM\n2\nOUR BEGINNING\nAbout DMart\nBy the late 1990s, our founder,  \nMr. Radhakishan Damani, was already \nestablished as one of the more \nsuccessful and well-known value \ninvestors in the Indian equity markets. \nThrough his investing style, he had \ndeveloped a very keen understanding \nof the Indian consumer sector and \nits p

In [1]:
# Test API

import requests

query = "What are the company's risks and concerns?"
ticker = "DMART"
url = "http://127.0.0.1:8000/query"
year = 2024
section = "Management Discussion and Analysis"
k = 5

payload = {
    "question": query,
    "ticker": ticker,
    "year": year,
    "section": section,
    "k": k
}

response = requests.post(url, json=payload)
if response.status_code == 200:
    data = response.json()
    for chunk in data['chunks']:
        print(chunk)

Management team, led by the Managing Director, is responsible for 
proactively managing risks with appropriate mitigation measures 
and ensuring their implementation thereof.
Below are some of the key risks and concerns in our business:
   If we are unable to continue to offer daily low prices pursuant 
to our EDLC/EDLP pricing strategy, we risk losing our distinct 
advantage and a substantial portion of our customers, which 
will adversely affect our business, financial condition, and 
results of operations 
   Availability of commercially viable real estate properties at 
suitable locations for our new stores, timely execution of sale 
deeds/leave and license registrations and getting regulatory 
approvals for these properties 
   Our ability to attract, hire, train and retain skilled employees
   Our inability to maintain an optimal level of inventory in our 
stores may impact our operations adversely 
   Our continued understanding and prediction of consumers’ 
changing needs 

In [4]:
# Usage of openai 
import os
from dotenv import load_dotenv  
load_dotenv()
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")
from agent.rag_agent import build_section_selection_prompt, rephrase_question, select_sections_with_llm

rephrased_question = rephrase_question("What are the company's risks and concerns?", "DMART", 2024)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [15]:
from openai import OpenAI
client = OpenAI()


def rephrase_question(question: str, ticker, year) -> str:
    instrctions = f"""You are an expert financial analyst. Given a user's question about a company's annual report, rephrase the question to make it more specific and relevant for analysis.
        ### Company Ticker: {ticker}
        ### Year: {year}"""

    response = client.responses.create(
        model="gpt-4o-mini",
        instructions=instrctions,
        input=question,
        temperature=0
    )
    return response.output[0].content[0].text.strip()


question = "What are the company's risks and concerns mentioned in the Management Discussion and Analysis?"
rephrased_question = rephrase_question(question, "DMART", 2024)


In [34]:
import requests
from typing import List, Tuple
from agent.rag_agent import QueryResponse

query = rephrased_question
ticker = "DMART"
url = "http://127.0.0.1:8000/query"
year = 2024
section = "Management Discussion and Analysis"
k = 5

payload = {
    "question": query,
    "ticker": ticker,
    "year": year,
    "section": section,
    "k": k
}

response = requests.post(url, json=payload)
if response.status_code == 200:
    data = response.json()
    results = QueryResponse(
        chunks= data.get('chunks', []),
        source_documents = data.get('source_documents', []),
        source_sections=data.get('source_sections', []),
    )


def build_final_answer(question: str, results: QueryResponse) -> str:

    section= results.source_sections[0]
    documents = results.source_documents[0]
    chunks_by_section = [f"Section: {section} Document:{documents} Content: {chunk}" for chunk in results.chunks]

    instrctions = f"""You are an expert financial analyst. Given a user's question about a company's annual report, answer the question given the information given as context extracted from the annual report.
        ### Company Ticker: {ticker}
        ### Year: {year}
        ### Context:
        ### {"\n\n" + "\n\n".join(chunks_by_section)}"""

    response = client.responses.create(
        model="gpt-4o-mini",
        instructions=instrctions,
        input=question,
        temperature=0
    )
    return response.output[0].content[0].text.strip()

final_answer = build_final_answer(rephrased_question, results)
print(final_answer)

The Management Discussion and Analysis section of DMART's 2024 annual report highlights several specific risks and concerns related to operational, financial, and market conditions:

### Operational Risks:
1. **Pricing Strategy**: Inability to maintain daily low prices under the EDLC/EDLP strategy could lead to loss of customers and competitive advantage.
2. **Real Estate Availability**: Challenges in securing commercially viable real estate for new stores, including timely execution of legal documents and obtaining regulatory approvals.
3. **Employee Management**: Difficulty in attracting, hiring, training, and retaining skilled employees.
4. **Inventory Management**: Inability to maintain optimal inventory levels may adversely impact operations.
5. **Store Expansion**: Effective management of store expansion and operations in new locations is crucial.

### Financial Risks:
1. **Cybersecurity**: Breaches in cybersecurity could disrupt business operations, leading to financial losses.


In [52]:
# Generar section description:
import json
from openai import OpenAI
load_dotenv()

client = OpenAI()

def generate_section_description(section: str, documents: str) -> str:
    context = "\n\n".join(documents)
    
    prompt = f"""You are an expert financial analyst. Given a section title and the content of the section, generate a concise description of the section. 
        The description should be clear, concise, and informative, summarizing the key points of the section max 200 words
        ### Section Title: {section}"""

    response = client.responses.create(
        model="gpt-4o-mini",
        instructions=prompt,
        input=context,
        temperature=0
    )
    description = response.output[0].content[0].text.strip()

    return {"section":section, "description": description}
# Example usage
section = "Management Discussion and Analysis" 
documents = [
    "This section discusses the company's performance, risks, and future outlook.",
    "The company has faced challenges in the market but remains optimistic about growth."
]   
description_json = generate_section_description(section=section, documents=documents)
print(description_json)

{'section': 'Management Discussion and Analysis', 'description': 'The "Management Discussion and Analysis" section provides an overview of the company\'s recent performance, highlighting both challenges and opportunities in the market. It addresses key risks that could impact future operations while maintaining an optimistic outlook for growth. The management emphasizes strategic initiatives aimed at overcoming current obstacles and capitalizing on potential market trends. Overall, the section reflects a balanced perspective on the company\'s current standing and future prospects, underscoring a commitment to navigating challenges effectively.'}


In [None]:
def add_new_section_description(self, description_dict: dict) -> None:
    """
    Adds a new section description to the section descriptions file.

    Args:
        section (str): The title of the section.
        description (str): The description of the section.

    """

    file_path = self.section_descriptions_path
    # Asegúrate de que el archivo existe
    if not os.path.exists(file_path):
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, "w") as file:
            json.dump({}, file)  # Crear archivo vacío si no existe

    # Leer el contenido existente
    with open(file_path, "r") as file:
        data = json.load(file)

        if self.ticker not in data:
            data[self.ticker] = {}
            
        if self.year not in data[self.ticker]:
            data[self.ticker][self.year] = {}

        data[self.ticker][self.year][section] = description

    # Escribir el nuevo contenido   
    with open(file_path, "w", encoding='utf-8') as file:
        json.dump(data, file, indent=4)

In [5]:
from database.documents import MRFExtractor
import os
from dotenv import load_dotenv
from database.generate_vdb import generate_vdb
load_dotenv()

extractor = MRFExtractor(pdf_path="./data/anual_reports/TATAMOTORS/Annual Report 2024.pdf", ticker='TATAMOTORS', year=2024, api_key=os.getenv("OPENAI_API_KEY"), index_page=4)
#extractor.clean_and_extract_index_mrf()
#extractor.sections = [{'title':'Bussiness Overview', 'page_start':4, 'page_end': 71}, {'title':'Risk Management', 'page_start':72, 'page_end': 77}, {'title':'Management Discussion and Analysis', 'page_start': 210, 'page_end': 231}, {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
# {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}]

extractor.docs

[Document(metadata={'producer': 'iLovePDF', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-05-31T00:18:18+05:30', 'moddate': '2024-05-31T01:27:01+05:30', 'trapped': '/False', 'source': './data/anual_reports/TATAMOTORS/Annual Report 2024.pdf', 'total_pages': 530, 'page': 0, 'page_label': 'A'}, page_content='www.tatamotors.com\nBombay House, 24 Homi Mody Street, Mumbai 400 001, India\nTataMotorsGroup tatamotors company/tata-motors\nuser/TataMotorsGroup tatamotorsgroup\nConcept, content and design at                         |  whatsup@stirrup.works\nGROWING RESPONSIBLY\n79TH INTEGRATED ANNUAL REPORT 2023-24\nGROWING RESPONSIBLY'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'Adobe InDesign 19.3 (Windows)', 'creationdate': '2024-05-31T00:18:18+05:30', 'moddate': '2024-05-31T01:27:01+05:30', 'trapped': '/False', 'source': './data/anual_reports/TATAMOTORS/Annual Report 2024.pdf', 'total_pages': 530, 'page': 1, 'page_label': 'B'}, page_content='At Tata Motors Lim

In [None]:
sections_dict = {
    "2024":[{'title':'Bussiness Overview', 'page_start':4, 'page_end': 71}, 
            {'title':'Risk Management', 'page_start':72, 'page_end': 77}, 
            {'title':'Management Discussion and Analysis', 'page_start': 210, 'page_end': 231}, 
            {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
            {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}],

    "2023": [{'title':'Bussiness Overview', 'page_start':4, 'page_end': 71}, 
            {'title':'Risk Management', 'page_start':72, 'page_end': 77}, 
            {'title':'Management Discussion and Analysis', 'page_start': 210, 'page_end': 231}, 
            {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
            {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}],

    "2022": [{'title':'Bussiness Overview', 'page_start':4, 'page_end': 71}, 
            {'title':'Risk Management', 'page_start':72, 'page_end': 77}, 
            {'title':'Management Discussion and Analysis', 'page_start': 210, 'page_end': 231}, 
            {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
            {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}],

            
    "2021": [{'title':'Bussiness Overview', 'page_start':4, 'page_end': 71}, 
            {'title':'Risk Management', 'page_start':72, 'page_end': 77}, 
            {'title':'Management Discussion and Analysis', 'page_start': 210, 'page_end': 231}, 
            {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
            {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}]

}

In [2]:
extractor.sections

[{'title': 'Bussiness Overview', 'page_start': 4, 'page_end': 71},
 {'title': 'Risk Management', 'page_start': 72, 'page_end': 77},
 {'title': 'Management Discussion and Analysis',
  'page_start': 210,
  'page_end': 231},
 {'title': 'Consolidated financials', 'page_start': 262, 'page_end': 382},
 {'title': 'Standalone financials', 'page_start': 383, 'page_end': 477}]

In [3]:
generate_vdb(pdf_path="./data/anual_reports/TATAMOTORS/Annual Report 2024.pdf", year=2024, ticker='TATAMOTORS', index_page=4, sections=extractor.sections)

NameError: name 'vector_db' is not defined

Management Discussion and Analysis
Economy Overview
India
The GDP growth estimate for FY24 has been revised upwards 
from 7.3% to 7.6% as per the estimates, highlighting the 
enduring strength of the Indian economy. India grew above 
8% for three consecutive quarters, reaffirming as a standout 
performer amidst sluggish global growth trends.
Indian economy witness strong growth momentum
5.5%
Q2 Q3 Q4 Q1 Q2 Q3
2023-242022-23
Source: MoSPI
Real GDP Growth
4.3%
6.2%
8.2% 8.1% 8.4%
Capital expenditure outlay for the next year is going to be 
increased by 11.1%, which would be 3.4% of India’s total GDP.
Per cent of GDO (Current Pricess)
10.0
2020-21 2021-22
(2nd RE)
2022-23
(1st RE)
10.4
10.9
Stengthening Investment by the Private Corporate Sector
Source: MoSPI
Asian economies such as China, Japan, India, and South 
Korea are among the largest net oil importers globally. Hence 
continued shipping disruptions could hit Asia. A rise in oil 
prices may pose upside risks to inflation and, conse

In [15]:
from agent.rag_agent import select_sections_with_llm, build_section_selection_prompt, rephrase_question, handle_question, api_call, ask_sections
import os 
import json
from dotenv import load_dotenv
load_dotenv()

path = "./database/section_descriptions.json"
with open(path, 'r', encoding='utf-8') as file:
    section_descriptions = json.load(file)

print(handle_question(
    question="What are the company's operational risks?",
    ticker="MRF",
    year=2024,
    section_descriptions=section_descriptions["MRF"]["2024"]
    ) )

Selected Sections: ['MANAGEMENT DISCUSSION AND ANALYSIS', 'REPORT ON CORPORATE GOVERNANCE']
Final response:
The company's operational risks include:

1. **Geopolitical and Monetary Issues**: Weak exports due to geopolitical tensions and monetary problems in various markets can impact sales, particularly in commercial vehicles, two-wheelers, and three-wheelers (Source: Annual-Report-2024.pdf, Content: 41).

2. **Regulatory Changes**: Upcoming regulatory norms expected in 2024-25 could affect the tyre performance criteria for Original Equipment Manufacturers (OEMs), which may impact production and sales (Source: Annual-Report-2024.pdf, Content: 41).

3. **Global Shipping Challenges**: The Red Sea crisis led to increased ocean freight costs and container availability issues, affecting export shipments (Source: Annual-Report-2024.pdf, Content: 42).

4. **Market Competition**: Intense price competition in key markets can hinder growth and profitability (Source: Annual-Report-2024.pdf, Conte

In [14]:
section_descriptions['MRF']['2024']

build_section_selection_prompt(section_descriptions['MRF']['2024']
)

'\nYou are an assistant specialized in analyzing companies\' annual reports. Given a user\'s question and a list of available sections (with descriptions), select the 2 most relevant sections to answer the question.\n\n### Available Sections:\n- BOARD\'S REPORT: The "Board\'s Report" section of MRF\'s annual report provides a comprehensive overview of the company\'s financial performance over the past year, as well as a ten-year financial summary. The purpose of this section is to present a clear and concise summary of the company\'s financial health and performance to shareholders and potential investors. Key data points typically found in this section include revenue from operations, other income, total income, profit before and after taxation, share capital, reserves, net worth, and fixed assets. It also includes a performance overview, detailing the company\'s total income, profit before tax, provision for taxation, and net profit for the year. Other important information includes 

In [12]:

print(handle_question(
    question="Profit and loss statements for the year 2024 compared to 2023.",
    ticker="DMART",
    year=2024,
    section_descriptions=section_descriptions["DMART"]["2024"]
    ) )

Selected Sections: ['Statement of Standalone Profit and Loss', 'Statement of Consolidated Profit and Loss']
Final response:
Here's a comparison of the Profit and Loss statements for Avenue Supermarts Limited (DMART) for the fiscal years ended March 31, 2024, and March 31, 2023:

### Standalone Profit and Loss Statement
1. **Revenue from Operations**:
   - 2024: ₹49,532.95 crore
   - 2023: ₹41,833.25 crore
   - **Increase**: ₹7,699.70 crore

2. **Total Income**:
   - 2024: ₹49,722.00 crore
   - 2023: ₹41,996.34 crore
   - **Increase**: ₹7,725.66 crore

3. **Total Expenses**:
   - 2024: ₹46,110.62 crore
   - 2023: ₹38,765.18 crore
   - **Increase**: ₹7,345.44 crore

4. **Profit Before Tax**:
   - 2024: ₹3,611.38 crore
   - 2023: ₹3,231.16 crore
   - **Increase**: ₹380.22 crore

5. **Net Profit After Tax**:
   - 2024: ₹2,694.92 crore
   - 2023: ₹2,556.40 crore
   - **Increase**: ₹138.52 crore

6. **Earnings Per Share (Basic)**:
   - 2024: ₹41.43
   - 2023: ₹39.46
   - **Increase**: ₹1.97


In [2]:
selected_sections

['Management Discussion and Analysis', 'Directors’ Report']

In [42]:
import json
import os

descriptions = {
    "DMART": {
        "2024": {
            "Section A": "Description of Section A",
            "Description B": "Description of Section B"
        }
    }
}

# Crear carpeta si no existe
os.makedirs("./database", exist_ok=True)

# Verificar si el archivo no existe
if not os.path.exists("./database/section_descriptions.json"):
    with open("./database/section_descriptions.json", "w") as file:
        json.dump(descriptions, file, indent=4)


In [None]:
import os
from database.documents import MRFExtractor

mrf_extractor = MRFExtractor(pdf_path="./data/anual_reports/MRF/Annual-Report-2024.pdf", ticker='MRF', year=2024, api_key=os.getenv("OPENAI_API_KEY"))
sections = mrf_extractor.extract_section_documents()
sections

Detected image-based TOC, using OCR...
Section CHAIRMAN'S MESSAGE is empty or contains no valid text.
Section NEW PRODUCT LAUNCH is empty or contains no valid text.
Section BHARAT MOBILITY & MRF AERO MUSCLE is empty or contains no valid text.
Section OEMRECOGNITION is empty or contains no valid text.
Section ESG is empty or contains no valid text.
Section MRF CORP is empty or contains no valid text.
Section MOTORSPORTS is empty or contains no valid text.
Section GROWTH STORY is empty or contains no valid text.
Section BOARD OF DIRECTORS is empty or contains no valid text.


[Document(metadata={'year': 2024, 'ticker': 'MRF', 'section_title': "BOARD'S REPORT", 'page_start': 13, 'page_end': 39, 'source': 'Annual-Report-2024.pdf', 'report_type': 'annual'}, page_content='\n\n\n\n13\nTen Year Financial Summary (Standalone) 2024 2023 2022 2021 2020 2019 2018 2017 2014-16 2014\n Revenue from Operations 24674 22578 18989 15922 15991 15837 15181 14749 22162 13198\n  Other Income 312 248 315 207 331 417 329 329 321 65\n  Total Income 24986 22826 19304 16129 16322 16254 15510 15078 22483 13263\n  Profit Before Taxation 2739 1119 879 1700 1399 1609 1602 2066 3606 1339\n  Provision for Taxation 698 303 232 451 4 512 510 615 1132 441\n  Profit after Taxation 2041 816 647 1249 1395 1097 1092 1451 2474 898\n  Share Capital 4.24 4.24 4.24 4.24 4.24 4.24 4.24 4.24 4.24 4.24\n  Reserves* 16436 14505 13773 13175 12000 10649 9600 8540 7157 4513\n  Net Worth 16441 14509 13777 13179 12004 10653 9604 8544 7161 4518\n  Fixed Assets Gross 22496 19930 16442 15018 14133 10780 9028 75

In [None]:
import json 

with open('./section_descriptions.json', 'r') as file:
    sections_ = json.load(file)
    print(sections_['MRF']['2024'].keys())

In [4]:
import json

raw_string = '[\n    {"title": "CHAIRMAN’S MESSAGE", "page_start": 1, "page_end": 1},\n    {"title": "NEW PRODUCT LAUNCH", "page_start": 2, "page_end": 4},\n    {"title": "BHARAT MOBILITY & MRF AERO MUSCLE", "page_start": 5, "page_end": 5},\n    {"title": "OEMRECOGNITION", "page_start": 6, "page_end": 7},\n    {"title": "ESG", "page_start": 7, "page_end": 7},\n    {"title": "MRF CORP", "page_start": 8, "page_end": 8},\n    {"title": "MOTORSPORTS", "page_start": 9, "page_end": 9},\n    {"title": "GROWTH STORY", "page_start": 10, "page_end": 11},\n    {"title": "BOARD OF DIRECTORS", "page_start": 12, "page_end": 12},\n    {"title": "BOARD’S REPORT", "page_start": 13, "page_end": 39},\n    {"title": "MANAGEMENT DISCUSSION AND ANALYSIS", "page_start": 40, "page_end": 44},\n    {"title": "REPORT ON CORPORATE GOVERNANCE", "page_start": 45, "page_end": 63},\n    {"title": "STANDALONE FINANCIAL STATEMENTS", "page_start": 64, "page_end": 77},\n    {"title": "BALANCE SHEET", "page_start": 78, "page_end": 78},\n    {"title": "STATEMENT OF PROFIT AND LOSS", "page_start": 79, "page_end": 87},\n    {"title": "NOTES FORMING PART OF THE FINANCIAL STATEMENTS", "page_start": 88, "page_end": 140},\n    {"title": "CONSOLIDATED FINANCIAL STATEMENTS", "page_start": 141, "page_end": 203},\n    {"title": "FORM AOC-1", "page_start": 204, "page_end": 204}\n]'

# Asegúrate de que las comillas sean compatibles con JSON
raw_string = text.replace("’", "'")  # Reemplazar comillas tipográficas si es necesario

toc_list = json.loads(raw_string)

print(toc_list)


[{'title': "CHAIRMAN'S MESSAGE", 'page_start': 1, 'page_end': 1}, {'title': 'NEW PRODUCT LAUNCH', 'page_start': 2, 'page_end': 4}, {'title': 'BHARAT MOBILITY & MRF AERO MUSCLE', 'page_start': 5, 'page_end': 5}, {'title': 'OEMRECOGNITION', 'page_start': 6, 'page_end': 7}, {'title': 'ESG', 'page_start': 7, 'page_end': 7}, {'title': 'MRF CORP', 'page_start': 8, 'page_end': 8}, {'title': 'MOTORSPORTS', 'page_start': 9, 'page_end': 9}, {'title': 'GROWTH STORY', 'page_start': 10, 'page_end': 11}, {'title': 'BOARD OF DIRECTORS', 'page_start': 12, 'page_end': 12}, {'title': "BOARD'S REPORT", 'page_start': 13, 'page_end': 39}, {'title': 'MANAGEMENT DISCUSSION AND ANALYSIS', 'page_start': 40, 'page_end': 44}, {'title': 'REPORT ON CORPORATE GOVERNANCE', 'page_start': 45, 'page_end': 63}, {'title': 'STANDALONE FINANCIAL STATEMENTS', 'page_start': 64, 'page_end': 77}, {'title': 'BALANCE SHEET', 'page_start': 78, 'page_end': 78}, {'title': 'STATEMENT OF PROFIT AND LOSS', 'page_start': 79, 'page_end'

In [6]:
import easyocr
from pdf2image import convert_from_path
from PIL import Image
import tempfile


reader = easyocr.Reader(['en'], gpu = True) # this needs to run only once to load the model into memory
path = "./data/anual_reports/MRF/Annual-Report-2024.pdf"

with tempfile.TemporaryDirectory() as temp_dir:
    # Convert the specific page to image with high DPI
    images = convert_from_path(
        path, 
        first_page=2,
        end_page=2, 
        dpi=300,
        poppler_path='C:/poppler/Release-24.08.0-0/poppler-24.08.0/Library/bin' # Add path if needed: r'C:\path\to\poppler-xx\bin'
    )
    
    # Save temp image and perform OCR
    temp_img_path = os.path.join(temp_dir, "toc_page.png")
    images[0].save(temp_img_path, 'PNG')
    
    result = reader.readtext(temp_img_path)

    print(result)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


[([[np.int32(1145), np.int32(78)], [np.int32(1651), np.int32(78)], [np.int32(1651), np.int32(174)], [np.int32(1145), np.int32(174)]], 'COntEnt', np.float64(0.276963794096284)), ([[np.int32(1174), np.int32(461)], [np.int32(1274), np.int32(461)], [np.int32(1274), np.int32(500)], [np.int32(1174), np.int32(500)]], 'PAGE', np.float64(0.960416409867735)), ([[np.int32(2523), np.int32(460)], [np.int32(2625), np.int32(460)], [np.int32(2625), np.int32(502)], [np.int32(2523), np.int32(502)]], 'PAGE', np.float64(0.999393880367279)), ([[np.int32(120), np.int32(597)], [np.int32(145), np.int32(597)], [np.int32(145), np.int32(626)], [np.int32(120), np.int32(626)]], '1.', np.float64(0.9988565789729192)), ([[np.int32(184), np.int32(591)], [np.int32(599), np.int32(591)], [np.int32(599), np.int32(631)], [np.int32(184), np.int32(631)]], "CHAIRMAN'S MESSAGE", np.float64(0.999381452444439)), ([[np.int32(1439), np.int32(591)], [np.int32(1488), np.int32(591)], [np.int32(1488), np.int32(631)], [np.int32(1439), 

In [9]:
import requests
from typing import List, Tuple
from agent.rag_agent import QueryResponse

question = "What are the company's risks and concerns mentioned in the Management Discussion and Analysis?"
ticker = "MRF"
url = "http://127.0.0.1:8000/query"
year = 2024
section = "MANAGEMENT DISCUSSION AND ANALYSIS"
k = 5

payload = {
    "question": question,
    "ticker": ticker,
    "year": year,
    "section": section,
    "k": k
}

response = requests.post(url, json=payload)

In [8]:
print(response.content)

b'{"chunks":["of events etc.\\n5. The compliance of the provisions of Corporate and other \\napplicable laws, rules, regulations, standards is the responsibility \\nof management. My examination was limited to the verification of \\nprocedures on test basis.\\n6. The Secretarial Audit report is neither an assurance as to the future \\nviability of the company nor of the efficacy or effectiveness with \\nwhich the management has conducted the affairs of the company.\\nK ELANGOVAN\\nCompany Secretary in Practice\\nPlace: Chennai FCS No.1808, CP No. 3552, P R No. 892/2020\\nDate: 10th May, 2022 UDIN: F001808D000277281","33\\nthe size and operations of the Company to monitor and ensure compliance \\nwith applicable laws, rules, regulations and guidelines:\\n1. Factories Act, 1948;\\n2. Labour laws and other incidental laws related to labour and \\nemployees appointed by the Company including those on \\ncontractual basis as relating to wages, gratuity, prevention of sexual \\nharassment, d