In [8]:
!pip install bs4 selenium pandas rapidfuzz



In [9]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs, unquote
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


class SEBIScraper:
    """Search SEBI public-issue filings for a company and download their PDFs.

    The workflow is: ``search_company`` finds filing pages, ``extract_pdfs``
    collects PDF links from one filing page, ``download_pdf`` stores a single
    PDF in ``folder``, and ``download_all`` chains the three steps.
    """

    BASE = "https://www.sebi.gov.in"
    SEARCH_URL = BASE + "/sebiweb/home/HomeAction.do"
    HEADERS = {"User-Agent": "Mozilla/5.0"}

    # (needle, label) pairs checked in order against the lower-cased page name.
    # "drhp" must come before "rhp": "drhp" contains "rhp", so testing "rhp"
    # first would label every DRHP filing as an RHP.
    _FILING_TYPES = (
        ("drhp", "DRHP"),
        ("rhp", "RHP"),
        ("addendum", "ADDENDUM"),
        ("corrigendum", "CORRIGENDUM"),
    )

    def __init__(self, folder="SEBI_RHPs", max_retries=3, backoff=1):
        """Create the download folder and a session that retries 5xx errors.

        Args:
            folder: Directory where PDFs are stored (created if missing).
            max_retries: Total retry attempts passed to ``Retry``.
            backoff: Backoff factor passed to ``Retry``.
        """
        self.folder = folder
        os.makedirs(folder, exist_ok=True)

        # Create a resilient session
        self.session = requests.Session()
        retries = Retry(
            total=max_retries,
            backoff_factor=backoff,
            status_forcelist=[500, 502, 503, 504],
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retries))

    # -------------------------------
    # Utility: safe filename
    # -------------------------------
    def _safe_filename(self, name: str) -> str:
        """Replace every character outside ``[a-zA-Z0-9._-]`` with ``_``."""
        return re.sub(r"[^a-zA-Z0-9._-]", "_", name)

    # -------------------------------
    # Utility: unwrap viewer links
    # -------------------------------
    def _unwrap_file_param(self, url: str) -> str:
        """Return the target of a ``?file=`` query parameter, else ``url``.

        A relative ``file`` target is resolved against ``BASE`` so the result
        can always be requested directly; absolute targets are unchanged.
        """
        query = parse_qs(urlparse(url).query)
        if "file" in query:
            return urljoin(self.BASE, unquote(query["file"][0]))
        return url

    # -------------------------------
    # Utility: filing type from URL
    # -------------------------------
    @staticmethod
    def _infer_filing_type(filing_url: str) -> str:
        """Infer DRHP / RHP / ADDENDUM / CORRIGENDUM from the page name.

        Returns ``"FILING"`` when none of the known markers is present.
        """
        page_name = os.path.basename(urlparse(filing_url).path).lower()
        for needle, label in SEBIScraper._FILING_TYPES:
            if needle in page_name:
                return label
        return "FILING"

    # -------------------------------
    # Step 1: Search company filings
    # -------------------------------
    def search_company(self, company_name: str):
        """Return de-duplicated public-issue filing page URLs for a company.

        Raises:
            requests.HTTPError: If the search request returns an error status.
        """
        params = {
            "doListing": "yes",
            "sid": "3",
            "ssid": "15",
            "smid": "11",
            "search": company_name,
        }
        r = self.session.get(self.SEARCH_URL, params=params,
                             headers=self.HEADERS, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        results = [
            urljoin(self.BASE, a["href"])
            for a in soup.find_all("a", href=True)
            if "/filings/public-issues/" in a["href"]
        ]
        return list(dict.fromkeys(results))  # deduplicate, keep page order

    # -------------------------------
    # Step 2: Extract PDFs from filing page
    # -------------------------------
    def extract_pdfs(self, filing_url: str):
        """Return de-duplicated PDF URLs referenced by a filing page.

        Raises:
            requests.HTTPError: If the page request returns an error status.
        """
        r = self.session.get(filing_url, headers=self.HEADERS, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        pdfs = []

        # Case 1: direct <a href="...pdf"> or attachdocs links
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.lower().endswith(".pdf") or "attachdocs" in href or "file=" in href:
                pdfs.append(self._unwrap_file_param(urljoin(self.BASE, href)))

        # Case 2: iframe/embed with ?file=
        for tag in soup.find_all(["iframe", "embed"], src=True):
            src = tag["src"]
            if "file=" in src:
                pdfs.append(self._unwrap_file_param(urljoin(self.BASE, src)))

        return list(dict.fromkeys(pdfs))

    # -------------------------------
    # Step 3: Download PDF
    # -------------------------------
    def download_pdf(self, pdf_url: str, prefix="RHP"):
        """Download one PDF into ``self.folder`` and return its local path.

        An existing file is reused instead of being downloaded again.

        Raises:
            requests.HTTPError: If the download returns an error status.
        """
        # Unwrap ?file= link if needed
        pdf_url = self._unwrap_file_param(pdf_url)

        # A URL path ending in "/" has an empty basename; use a fixed name.
        filename = os.path.basename(urlparse(pdf_url).path) or "document.pdf"
        safe_name = self._safe_filename(f"{prefix}__{filename}")
        path = os.path.join(self.folder, safe_name)

        if os.path.exists(path):
            print(f"‚ÑπÔ∏è Already exists: {path}")
            return path

        # Stream into a temporary file and rename only on success, so an
        # interrupted download is never mistaken for a finished one later.
        tmp_path = path + ".part"
        try:
            with self.session.get(pdf_url, headers=self.HEADERS, stream=True, timeout=30) as r:
                r.raise_for_status()
                # Ensure it's a PDF
                if "application/pdf" not in r.headers.get("Content-Type", ""):
                    print(f"‚ö†Ô∏è Not a PDF link (may be wrapper): {pdf_url}")
                with open(tmp_path, "wb") as out:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            out.write(chunk)
            os.replace(tmp_path, path)
        finally:
            # Only present here if the download or rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"‚úÖ Saved: {path}")
        return path

    # -------------------------------
    # Step 4: Orchestrator
    # -------------------------------
    def download_all(self, company_name: str):
        """Download every PDF from every filing page found for a company.

        Returns:
            A list of dicts with keys ``company``, ``filing_url``, ``pdf_url``,
            ``local_path`` and ``type``; empty when no filing page is found.
        """
        filings = self.search_company(company_name)
        if not filings:
            print(f"‚ùå No filings found for {company_name}")
            return []

        print(f"Filing pages for {company_name}:")
        for f in filings:
            print(" ‚Ä¢", f)

        results = []
        for f in filings:
            pdfs = self.extract_pdfs(f)
            print(f"\nüîó PDFs from {f}:")
            for p in pdfs:
                print("   ", p)

            # Infer filing type from URL (RHP, DRHP, Addendum, Corrigendum)
            filing_type = self._infer_filing_type(f)
            prefix = f"{company_name.replace(' ', '_')}_{filing_type}"

            for pdf in pdfs:
                local = self.download_pdf(pdf, prefix=prefix)
                results.append({
                    "company": company_name,
                    "filing_url": f,
                    "pdf_url": pdf,
                    "local_path": local,
                    "type": filing_type,
                })

        return results



Company=Lenskart Solutions Limited

In [10]:
# -------------------------------
# Example usage
# -------------------------------
if __name__ == "__main__":
    # Download every filing PDF found for one company and list the records.
    company = "Tata Capital Limited"  # company name passed to the SEBI search
    scraper = SEBIScraper()
    all_files = scraper.download_all(company)
    print("\n‚úÖ Completed. Files downloaded:")
    for record in all_files:
        print(record)

Filing pages for Tata Capital Limited:
 ‚Ä¢ https://www.sebi.gov.in/filings/public-issues/sep-2025/tata-capital-limited-rhp_96899.html

üîó PDFs from https://www.sebi.gov.in/filings/public-issues/sep-2025/tata-capital-limited-rhp_96899.html:
    https://www.sebi.gov.in/sebi_data/attachdocs/sep-2025/1759136346791.pdf
‚ÑπÔ∏è Already exists: SEBI_RHPs\Tata_Capital_Limited_RHP__1759136346791.pdf

‚úÖ Completed. Files downloaded:
{'company': 'Tata Capital Limited', 'filing_url': 'https://www.sebi.gov.in/filings/public-issues/sep-2025/tata-capital-limited-rhp_96899.html', 'pdf_url': 'https://www.sebi.gov.in/sebi_data/attachdocs/sep-2025/1759136346791.pdf', 'local_path': 'SEBI_RHPs\\Tata_Capital_Limited_RHP__1759136346791.pdf', 'type': 'RHP'}


In [11]:
!pip install -q langchain langchain-core langchain-community langchain-groq pypdf pandas rapidfuzz langchain-text-splitters

import os
import re
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
import pandas as pd

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ======================= CONFIG ===========================
# Set your key: export GROQ_API_KEY in the environment before running this cell.
# SECURITY: never hard-code the key in source. A key that has been committed
# must be treated as leaked and revoked with the provider.
if not os.environ.get("GROQ_API_KEY"):
    import warnings

    warnings.warn(
        "GROQ_API_KEY is not set; LLM calls are expected to fail until it is exported.",
        RuntimeWarning,
    )
MODEL_NAME = "llama-3.1-8b-instant"
TEMPERATURE = 0.2


# ======================= HELPERS ==========================
def pick_best_pdf(results: List[Dict[str, Any]]) -> str:
    """Return the local path of the preferred filing: RHP, then DRHP, then rest.

    Ties keep the order of ``results``.

    Raises:
        RuntimeError: If ``results`` is empty or the chosen entry has no
            ``local_path``.
    """
    if not results:
        raise RuntimeError("No filings found.")

    def _rank(entry):
        # Lower rank wins; the type comparison is case-insensitive.
        kind = entry.get("type", "").upper()
        if kind == "RHP":
            return 0
        if kind == "DRHP":
            return 1
        return 2

    # min() returns the first entry with the lowest rank, like a stable sort.
    chosen = min(results, key=_rank)
    local_path = chosen.get("local_path")
    if not local_path:
        raise RuntimeError("Best filing has no local_path.")
    return local_path


def load_pdf_chunks(path: str, chunk_size=1600, chunk_overlap=200) -> List[str]:
    """Load a PDF, tag each page with a ``[PAGE n]`` marker and split the text.

    Args:
        path: Location of the PDF file.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The text chunks produced by the splitter.
    """
    pages = PyPDFLoader(path).load()
    # The page number comes from the loader metadata and defaults to 0.
    # NOTE(review): presumably a 0-based index — confirm against PyPDFLoader.
    tagged_pages = [
        f"[PAGE {page.metadata.get('page', 0)}]\n{page.page_content}"
        for page in pages
    ]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text("\n".join(tagged_pages))


def batch_chunks(chunks: List[str], max_chars: int = 4500) -> List[str]:
    """Greedily join consecutive chunks into batches of about ``max_chars``.

    Each chunk is charged its length plus 2 for the blank-line separator. A
    batch is closed before it would exceed ``max_chars``; a single chunk larger
    than the limit still forms a batch of its own.
    """
    packed = []
    pending = []
    pending_size = 0
    for piece in chunks:
        cost = len(piece) + 2
        if pending and pending_size + cost > max_chars:
            packed.append("\n\n".join(pending))
            pending = []
            pending_size = 0
        pending.append(piece)
        pending_size += cost
    if pending:
        packed.append("\n\n".join(pending))
    return packed


# Matches "FY" followed by an optional whitespace character and a four-digit
# year in the 1900s or 2000s (case-insensitive); used to prefer newer snippets.
FY_PATTERN = re.compile(r"FY\s?(20\d{2}|19\d{2})", re.IGNORECASE)


def newest_fy_score(text: str) -> int:
    """Return the latest fiscal year mentioned in ``text``, or 0 if none.

    ``None`` and empty strings are treated as having no year.
    """
    found_years = (int(year) for year in FY_PATTERN.findall(text or ""))
    return max(found_years, default=0)


# ======================= DATA MODEL =======================
@dataclass
class IPOExtract:
    """Typed container for the sections extracted from an IPO prospectus.

    The field names match the top-level keys returned by ``empty_extract()``.
    Every field has an empty default, so ``IPOExtract()`` is a blank record.
    """

    # Free-text sections.
    company_overview: str = ""
    industry_market: str = ""
    # Keyed sections; each gets its own fresh dict via default_factory.
    financial_performance: Dict[str, Any] = field(default_factory=dict)
    ipo_structure: Dict[str, Any] = field(default_factory=dict)
    promoters_management: Dict[str, Any] = field(default_factory=dict)
    # List of risk statements.
    risk_factors: List[str] = field(default_factory=list)
    valuation_peers: Dict[str, Any] = field(default_factory=dict)
    legal_regulatory: Dict[str, Any] = field(default_factory=dict)
    growth_metrics: Dict[str, Any] = field(default_factory=dict)
    investment_summary: Dict[str, Any] = field(default_factory=dict)
    # List of source references.
    sources: List[str] = field(default_factory=list)


def empty_extract() -> Dict[str, Any]:
    """Return a fresh, blank extraction record.

    Text sections start as ``""``, list sections as ``[]``, and every entry of
    a keyed section starts as ``"N/A"``. Each call builds new containers.
    """

    def _blank(*labels):
        # Keyed section whose entries are all still unknown.
        return dict.fromkeys(labels, "N/A")

    return {
        "company_overview": "",
        "industry_market": "",
        "financial_performance": _blank(
            "Revenue trend (last 3 years)",
            "EBITDA trend (last 3 years)",
            "Net Profit trend (last 3 years)",
            "Return on Equity",
            "Return on Assets",
            "Net Profit Margin",
            "Debt-to-Equity",
            "Cash flow trends",
            "Capital expenditure",
        ),
        "ipo_structure": _blank(
            "Total issue size",
            "Price band",
            "Lot size",
            "Fresh issue",
            "Offer for Sale",
            "Use of proceeds",
        ),
        "promoters_management": _blank(
            "Promoters",
            "Management",
            "Shareholding (pre vs post IPO)",
        ),
        "risk_factors": [],
        "valuation_peers": _blank(
            "Valuation multiples",
            "Peer comparison",
            "Why valuation may be N/A",
        ),
        "legal_regulatory": _blank(
            "Litigations",
            "Regulatory dependencies",
        ),
        "growth_metrics": _blank(
            "Revenue Growth",
            "Profit Growth",
            "Interpretation of growth",
            "Why RG/PG may be N/A",
        ),
        "investment_summary": _blank(
            "Strengths",
            "Concerns",
            "Overall IPO view",
        ),
        "sources": [],
    }


# ======================= LLM CHAIN ========================
def build_chain():
    """Build the extraction pipeline: prompt -> Groq chat model -> JSON parser.

    The model name and temperature come from the module-level ``MODEL_NAME``
    and ``TEMPERATURE``. The prompt has a single input variable, ``context``,
    which receives the prospectus text to analyse.

    Returns:
        The composed ``prompt | llm | parser`` pipeline. Callers in this file
        invoke it with ``{"context": <text>}``.
        NOTE(review): the parser presumably yields a dict matching the schema
        in the prompt — confirm against the JsonOutputParser documentation.
    """
    llm = ChatGroq(model=MODEL_NAME, temperature=TEMPERATURE)
    parser = JsonOutputParser()

    # NOTE: Escape braces to avoid KeyError with prompt variables
    # (literal JSON braces are doubled; only {context} is a template variable).
    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "You are a financial analyst assistant. Use ONLY the provided RHP text. "
         "Be concise, structured, and context-aware. If data is missing or typical pre-IPO N/A, explain why."),
        ("user",
         """Extract JSON exactly in this schema (keys must match; do not add or remove keys).
 Return ONLY JSON. Use full-form metric names (no bare acronyms). Include page markers like [PAGE 123] you used.

 {{
   "company_overview": "<business model, products/services, target markets, revenue model, edge>",
   "industry_market": "<sector, industry trends, competition, positioning>",
   "financial_performance": {{
     "Revenue trend (last 3 years)": "",
     "EBITDA trend (last 3 years)": "",
     "Net Profit trend (last 3 years)": "",
     "Return on Equity": "",
     "Return on Assets": "",
     "Net Profit Margin": "",
     "Debt-to-Equity": "",
     "Cash flow trends": "",
     "Capital expenditure": ""
   }},
   "ipo_structure": {{
     "Total issue size": "",
     "Price band": "",
     "Lot size": "",
     "Fresh issue": "",
     "Offer for Sale": "",
     "Use of proceeds": ""
   }},
   "promoters_management": {{
     "Promoters": "",
     "Management": "",
     "Shareholding (pre vs post IPO)": ""
   }},
   "risk_factors": ["", "", "", "", ""],
   "valuation_peers": {{
     "Valuation multiples": "",
     "Peer comparison": "",
     "Why valuation may be N/A": ""
   }},
   "legal_regulatory": {{
     "Litigations": "",
     "Regulatory dependencies": ""
   }},
   "growth_metrics": {{
     "Revenue Growth": "",
     "Profit Growth": "",
     "Interpretation of growth": "",
     "Why RG/PG may be N/A": ""
   }},
   "investment_summary": {{
     "Strengths": "",
     "Concerns": "",
     "Overall IPO view": ""
   }},
   "sources": ["[PAGE ...]", "[PAGE ...]"]
 }}

 Guidelines:
 - Keep each field terse but informative (1‚Äì3 sentences), except risk_factors as 5‚Äì7 bullets.
 - If "Price band" or market-derived multiples are not in RHP, set "N/A" and explain under the relevant 'why' fields.
 - If ratios are absent, say "N/A ‚Äì not disclosed in RHP financials section" (typical pre-IPO).
 - Use exact wording from the RHP when citing quantitative facts.

 TEXT:
 {context}
 """)
    ])
    # Pipe the stages together so one invoke() runs all three in order.
    return prompt | llm | parser


# ======================= MAP / REDUCE =====================
# Top-level section names, in the order produced by empty_extract().
SECTION_KEYS = list(empty_extract())


def merge_text(best: str, candidate: str) -> str:
    """Pick the more informative of two text values for the same field.

    Rules, in order:
      1. An empty value always loses to a non-empty one.
      2. A placeholder (blank, or starting with "N/A") loses to real content.
         Without this rule a long "N/A - not disclosed ..." answer from a batch
         that lacks the data would overwrite a shorter real value, because
         rule 4 prefers longer text.
      3. The value mentioning the newer fiscal year wins.
      4. With equal fiscal-year scores, the longer value wins.
      5. Otherwise ``best`` is kept.

    Args:
        best: Value accumulated so far.
        candidate: Newly extracted value.

    Returns:
        Either ``best`` or ``candidate``, unchanged.
    """
    if not candidate:
        return best
    if not best:
        return candidate

    def _is_placeholder(text):
        stripped = text.strip()
        return not stripped or stripped.upper().startswith("N/A")

    best_is_placeholder = _is_placeholder(best)
    candidate_is_placeholder = _is_placeholder(candidate)
    if candidate_is_placeholder and not best_is_placeholder:
        return best
    if best_is_placeholder and not candidate_is_placeholder:
        return candidate

    # Both real or both placeholders: fall back to recency, then length.
    best_fy = newest_fy_score(best)
    candidate_fy = newest_fy_score(candidate)
    if candidate_fy > best_fy:
        return candidate
    if candidate_fy == best_fy and len(candidate) > len(best):
        return candidate
    return best


def merge_field(best: Any, candidate: Any) -> Any:
    """Merge a newly extracted value into the accumulated one, by type of ``best``.

    * ``str``  - delegated to ``merge_text``; a non-string candidate is ignored.
    * ``list`` - keeps the non-blank strings of ``best`` and appends unseen
      non-blank strings from ``candidate``, capped at 7 items. A string
      candidate counts as one item (it is not split into characters); any
      other non-sequence candidate is ignored.
    * ``dict`` - merges key by key over the keys of ``best``; keys present only
      in ``candidate`` are dropped, and a non-dict candidate is ignored
      instead of raising.
    * anything else - ``best`` if truthy, otherwise ``candidate``.

    Neither argument is mutated.
    """
    if isinstance(best, str):
        return merge_text(best, candidate if isinstance(candidate, str) else "")

    if isinstance(best, list):
        if isinstance(candidate, str):
            # Model returned one bullet as plain text instead of a list.
            incoming = [candidate]
        elif isinstance(candidate, (list, tuple)):
            incoming = candidate
        else:
            incoming = []
        merged = [item for item in best if isinstance(item, str) and item.strip()]
        for item in incoming:
            if isinstance(item, str) and item.strip() and item not in merged:
                merged.append(item)
        return merged[:7]  # cap

    if isinstance(best, dict):
        incoming = candidate if isinstance(candidate, dict) else {}
        return {key: merge_field(value, incoming.get(key, "")) for key, value in best.items()}

    return best or candidate


def safe_parse_json(s: str) -> Optional[Dict[str, Any]]:
    """Decode ``s`` as JSON when it is a string; pass other values through.

    Returns ``None`` when a string cannot be decoded.
    """
    if not isinstance(s, str):
        # Already-parsed values (and None) are returned untouched.
        return s
    try:
        return json.loads(s)
    except Exception:
        return None


def extract_map_reduce(batches: List[str]) -> Dict[str, Any]:
    """Run the extraction chain on every batch and merge the partial results.

    Processing is best-effort: a batch whose model call fails, or whose output
    is not a JSON object, is skipped. Skips are reported rather than hidden,
    so a systematic failure (bad API key, rate limit) is visible to the caller
    instead of silently producing an empty record.

    Args:
        batches: Text batches to send to the model, one call per batch.

    Returns:
        A record shaped like ``empty_extract()`` with merged values and a
        sorted, de-duplicated ``sources`` list.
    """
    chain = build_chain()
    agg = empty_extract()
    total = len(batches)
    skipped = 0

    for index, batch in enumerate(batches, 1):
        try:
            partial = chain.invoke({"context": batch})
        except Exception as exc:
            skipped += 1
            print(f"Skipping batch {index}/{total}: {type(exc).__name__}: {exc}")
            continue

        partial = safe_parse_json(partial) or partial  # parser already returns dict
        if not isinstance(partial, dict):
            skipped += 1
            print(f"Skipping batch {index}/{total}: output is not a JSON object")
            continue

        for k in SECTION_KEYS:
            fallback = "" if isinstance(agg[k], str) else {}
            try:
                agg[k] = merge_field(agg[k], partial.get(k, fallback))
            except Exception as exc:
                # One malformed section must not discard the other sections.
                print(f"Batch {index}/{total}: ignoring section {k!r}: {type(exc).__name__}: {exc}")

    if total and skipped == total:
        print("Warning: every batch was skipped; the extraction is empty.")

    # Clean up sources (dedupe)
    agg["sources"] = sorted(set(agg.get("sources", [])))
    return agg


# ======================= WEIGHT TABLE ======================
def weights_df():
    """Return the suggested scoring weights as a three-column DataFrame.

    Columns are ``Category``, ``Metric`` and ``Suggested Weight``, one row per
    metric.
    """
    rows = [
        ("Profitability", "Return on Equity (ROE)", 0.15),
        ("Profitability", "Net Profit Margin (NPM)", 0.10),
        ("Efficiency", "Return on Assets (ROA)", 0.10),
        ("Leverage", "Debt-to-Equity (D/E)", 0.10),
        ("Valuation", "Price-to-Earnings (P/E)", 0.20),
        ("Valuation", "Price-to-Book (P/B)", 0.10),
        ("Growth", "Revenue Growth (RG)", 0.15),
        ("Growth", "Profit Growth (PG)", 0.10),
    ]
    # Transpose the rows into one list per column.
    categories, metrics, weights = (list(column) for column in zip(*rows))
    return pd.DataFrame({
        "Category": categories,
        "Metric": metrics,
        "Suggested Weight": weights,
    })


# ======================= ORCHESTRATOR ======================
def analyze(company: str, scraper, max_chars: int = 4000, max_batches: int = 60) -> Dict[str, Any]:
    """Download a company's filings, extract the best PDF and summarise it.

    Args:
        company: Company name passed to ``scraper.download_all``.
        scraper: Object exposing ``download_all(company)`` that returns filing
            records with ``type`` and ``local_path`` keys.
        max_chars: Approximate character budget per batch sent to the model.
        max_batches: Upper bound on batches processed (guards against
            tokens-per-minute limits); batches beyond it are dropped.

    Returns:
        A dict with ``company``, ``extracted`` (the merged record) and
        ``weights`` (the suggested-weights DataFrame).

    Raises:
        RuntimeError: If no filing with a local PDF is available.
    """
    filings = scraper.download_all(company)
    pdf = pick_best_pdf(filings)
    chunks = load_pdf_chunks(pdf)

    # Heuristic: prioritize chunks likely to contain structured data
    priority_words = ("financial", "revenue", "profit", "risk", "promoter",
                      "offer", "issue", "objects", "use of proceeds", "capital",
                      "management", "industry", "market", "valuation", "peers")

    # Single pass: membership depends only on the chunk text, so this yields
    # the same two lists as filtering twice, without an O(n^2) list lookup.
    prioritized, rest = [], []
    for chunk in chunks:
        lowered = chunk.lower()
        if any(word in lowered for word in priority_words):
            prioritized.append(chunk)
        else:
            rest.append(chunk)

    # Batch them, priority chunks first so they survive the cap below.
    batches = batch_chunks(prioritized + rest, max_chars=max_chars)
    extracted = extract_map_reduce(batches[:max_batches])  # safety cap to avoid TPM
    return {
        "company": company,
        "extracted": extracted,
        "weights": weights_df()
    }


# %%
# Example
if __name__ == "__main__":
    # Run the full pipeline for one company and print a sectioned report.
    scraper = SEBIScraper(folder="SEBI_RHPs")
    company = "Tata Capital Limited"  # change as needed
    result = analyze(company, scraper)
    extracted = result["extracted"]

    def _show_mapping(heading, key):
        # Print one keyed section as "- label: value" lines.
        print(f"\n=== {heading} ===")
        for label, value in extracted[key].items():
            print(f"- {label}: {value}")

    print("\n=== COMPANY ===")
    print(result["company"])

    for heading, key in (
        ("1) Company Overview", "company_overview"),
        ("2) Industry & Market Position", "industry_market"),
    ):
        print(f"\n=== {heading} ===")
        print(extracted[key])

    for heading, key in (
        ("3) Financial Performance", "financial_performance"),
        ("4) IPO Structure", "ipo_structure"),
        ("5) Promoters & Management", "promoters_management"),
    ):
        _show_mapping(heading, key)

    print("\n=== 6) Risk Factors ===")
    for number, risk in enumerate(extracted["risk_factors"][:7], start=1):
        print(f"{number}. {risk}")

    for heading, key in (
        ("7) Valuation & Peer Benchmarking", "valuation_peers"),
        ("8) Legal & Regulatory Disclosures", "legal_regulatory"),
        ("9) Growth Metrics", "growth_metrics"),
        ("10) Investment Summary", "investment_summary"),
    ):
        _show_mapping(heading, key)

    print("\n=== SOURCES (Pages) ===")
    print(", ".join(extracted["sources"]) or "N/A")

    print("\n=== SUGGESTED WEIGHTS ===")
    print(result["weights"])


Filing pages for Tata Capital Limited:
 ‚Ä¢ https://www.sebi.gov.in/filings/public-issues/sep-2025/tata-capital-limited-rhp_96899.html

üîó PDFs from https://www.sebi.gov.in/filings/public-issues/sep-2025/tata-capital-limited-rhp_96899.html:
    https://www.sebi.gov.in/sebi_data/attachdocs/sep-2025/1759136346791.pdf
‚ÑπÔ∏è Already exists: SEBI_RHPs\Tata_Capital_Limited_RHP__1759136346791.pdf

=== COMPANY ===
Tata Capital Limited

=== 1) Company Overview ===
The company operates in the financial services sector, providing digital lending services to customers. Its business model involves offering loans to individuals and small businesses through its online platform. The company's target market includes individuals and small businesses seeking quick and convenient loan options. Its revenue model is based on interest income from loans disbursed. The company's edge lies in its use of artificial intelligence and machine learning algorithms to assess creditworthiness and provide personalize