In [None]:
!pip install bs4 selenium pandas rapidfuzz

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading selenium-4.38.0-py3-none-any.whl (9.7 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs, unquote
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


class SEBIScraper:
    """Search sebi.gov.in for a company's public-issue filings and download their PDFs.

    The pipeline is ``search_company`` -> ``extract_pdfs`` -> ``download_pdf``;
    ``download_all`` chains the three steps for one company name.

    Args:
        folder: Directory the PDFs are written to (created if missing).
        max_retries: Total retry budget handed to urllib3's ``Retry``.
        backoff: ``backoff_factor`` handed to urllib3's ``Retry``.
    """

    BASE = "https://www.sebi.gov.in"
    SEARCH_URL = BASE + "/sebiweb/home/HomeAction.do"
    HEADERS = {"User-Agent": "Mozilla/5.0"}

    # (substring of the filing page's basename, label). Order matters: "drhp"
    # contains "rhp", so it has to be tested first or it can never match.
    # NOTE(review): "rhp"/"drhp" still win over "addendum"/"corrigendum" when a
    # basename contains both (e.g. "...-addendum-to-rhp"), as before — confirm
    # that this precedence is the intended one.
    _FILING_TYPES = (
        ("drhp", "DRHP"),
        ("rhp", "RHP"),
        ("addendum", "ADDENDUM"),
        ("corrigendum", "CORRIGENDUM"),
    )

    def __init__(self, folder="SEBI_RHPs", max_retries=3, backoff=1):
        self.folder = folder
        os.makedirs(folder, exist_ok=True)

        # Create a resilient session: retry transient 5xx responses with backoff.
        self.session = requests.Session()
        retries = Retry(
            total=max_retries,
            backoff_factor=backoff,
            status_forcelist=[500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retries)
        # Mount for both schemes so an unwrapped http:// target gets the same policy.
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

    # -------------------------------
    # Utility: safe filename
    # -------------------------------
    def _safe_filename(self, name: str) -> str:
        """Replace every character outside ``[a-zA-Z0-9._-]`` with ``_``."""
        return re.sub(r"[^a-zA-Z0-9._-]", "_", name)

    # -------------------------------
    # Utility: unwrap "?file=<url>" viewer links
    # -------------------------------
    def _unwrap_file_param(self, url: str) -> str:
        """Return the target of a ``?file=<target>`` link, or ``url`` unchanged.

        A relative target is resolved against ``url`` itself, so the result is
        absolute whenever ``url`` is.
        """
        wrapped = parse_qs(urlparse(url).query).get("file")
        if not wrapped:
            return url
        # parse_qs already percent-decodes once; the extra unquote is kept so
        # doubly-encoded targets behave exactly as they did before.
        return urljoin(url, unquote(wrapped[0]))

    # -------------------------------
    # Utility: filing type from URL
    # -------------------------------
    def _infer_filing_type(self, filing_url: str) -> str:
        """Classify a filing page by keywords in the last path segment of its URL.

        Returns one of ``"DRHP"``, ``"RHP"``, ``"ADDENDUM"``, ``"CORRIGENDUM"``,
        or ``"FILING"`` when no keyword is present.
        """
        basename = os.path.basename(urlparse(filing_url).path).lower()
        for keyword, label in self._FILING_TYPES:
            if keyword in basename:
                return label
        return "FILING"

    # -------------------------------
    # Step 1: Search company filings
    # -------------------------------
    def search_company(self, company_name: str):
        """Return de-duplicated absolute URLs of filing pages matching ``company_name``.

        Only links whose href contains ``/filings/public-issues/`` are kept.

        Raises:
            requests.HTTPError: if the search request returns an error status.
        """
        params = {
            "doListing": "yes",
            "sid": "3",
            "ssid": "15",
            "smid": "11",
            "search": company_name,
        }
        r = self.session.get(self.SEARCH_URL, params=params,
                             headers=self.HEADERS, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        results = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if "/filings/public-issues/" in href:
                # Resolve against the page that carried the link, not the site root.
                results.append(urljoin(r.url, href))
        return list(dict.fromkeys(results))  # deduplicate, keep first-seen order

    # -------------------------------
    # Step 2: Extract PDFs from filing page
    # -------------------------------
    def extract_pdfs(self, filing_url: str):
        """Return de-duplicated absolute PDF URLs referenced by a filing page.

        Collects ``<a href>`` values that end in ``.pdf`` or contain
        ``attachdocs`` / ``file=``, then ``<iframe>``/``<embed>`` ``src`` values
        containing ``file=``. ``?file=`` wrappers are unwrapped to their target.

        Raises:
            requests.HTTPError: if the page request returns an error status.
        """
        r = self.session.get(filing_url, headers=self.HEADERS, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        links = []

        # Case 1: direct <a href="...pdf"> or attachdocs links
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.lower().endswith(".pdf") or "attachdocs" in href or "file=" in href:
                links.append(href)

        # Case 2: iframe/embed with ?file=
        for tag in soup.find_all(["iframe", "embed"], src=True):
            src = tag["src"]
            if "file=" in src:
                links.append(src)

        # Make each link absolute relative to the page, then unwrap viewer links.
        pdfs = [self._unwrap_file_param(urljoin(r.url, link)) for link in links]
        return list(dict.fromkeys(pdfs))

    # -------------------------------
    # Step 3: Download PDF
    # -------------------------------
    def download_pdf(self, pdf_url: str, prefix="RHP"):
        """Download ``pdf_url`` into ``self.folder`` and return the local path.

        The file is named ``<prefix>__<basename of the URL path>`` after
        sanitising. If that path already exists nothing is downloaded. A
        response whose Content-Type is not PDF is still saved, with a warning.

        Raises:
            requests.HTTPError: if the download request returns an error status.
        """
        # Unwrap ?file= link if needed
        pdf_url = self._unwrap_file_param(pdf_url)

        filename = os.path.basename(urlparse(pdf_url).path)
        safe_name = self._safe_filename(f"{prefix}__{filename}")
        path = os.path.join(self.folder, safe_name)

        if os.path.exists(path):
            print(f"‚ÑπÔ∏è Already exists: {path}")
            return path

        # Stream into a temporary sibling and rename on success, so an
        # interrupted download never leaves a truncated file at `path` that the
        # existence check above would accept on the next run.
        tmp_path = path + ".part"
        try:
            with self.session.get(pdf_url, headers=self.HEADERS, stream=True, timeout=30) as r:
                r.raise_for_status()
                # Ensure it's a PDF (header values are case-insensitive)
                if "application/pdf" not in r.headers.get("Content-Type", "").lower():
                    print(f"‚ö†Ô∏è Not a PDF link (may be wrapper): {pdf_url}")
                with open(tmp_path, "wb") as out:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            out.write(chunk)
            os.replace(tmp_path, path)
        finally:
            # After a successful rename the temp file is gone and this is a no-op.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        print(f"‚úÖ Saved: {path}")
        return path

    # -------------------------------
    # Step 4: Orchestrator
    # -------------------------------
    def download_all(self, company_name: str):
        """Search, extract and download every PDF for ``company_name``.

        Returns:
            A list with one dict per downloaded PDF, with keys ``company``,
            ``filing_url``, ``pdf_url``, ``local_path`` and ``type``; an empty
            list when the search finds no filing pages.
        """
        filings = self.search_company(company_name)
        if not filings:
            print(f"‚ùå No filings found for {company_name}")
            return []

        print(f"Filing pages for {company_name}:")
        for f in filings:
            print(" ‚Ä¢", f)

        results = []
        for f in filings:
            pdfs = self.extract_pdfs(f)
            print(f"\nüîó PDFs from {f}:")
            for p in pdfs:
                print("   ", p)

            # Infer filing type from URL (RHP, DRHP, Addendum, Corrigendum)
            filing_type = self._infer_filing_type(f)

            prefix = f"{company_name.replace(' ', '_')}_{filing_type}"

            for pdf in pdfs:
                local = self.download_pdf(pdf, prefix=prefix)
                results.append({
                    "company": company_name,
                    "filing_url": f,
                    "pdf_url": pdf,
                    "local_path": local,
                    "type": filing_type,
                })

        return results



In [None]:
# -------------------------------
# Example usage
# -------------------------------
# Performs live HTTP requests against sebi.gov.in: searches for one company,
# downloads every PDF found on the matching filing pages into the scraper's
# folder, then prints one result record per PDF.
if __name__ == "__main__":
    scraper = SEBIScraper()  # defaults: folder="SEBI_RHPs", max_retries=3, backoff=1
    company = "Waaree"  # try also "Tata Capital Limited"
    # List of dicts with keys: company, filing_url, pdf_url, local_path, type
    # (empty list when the search finds no filing pages).
    all_files = scraper.download_all(company)
    print("\n‚úÖ Completed. Files downloaded:")
    for f in all_files:
        print(f)

Filing pages for Waaree:
 ‚Ä¢ https://www.sebi.gov.in/filings/public-issues/oct-2024/waaree-energies-limited-rhp_87562.html

üîó PDFs from https://www.sebi.gov.in/filings/public-issues/oct-2024/waaree-energies-limited-rhp_87562.html:
    https://www.sebi.gov.in/sebi_data/attachdocs/oct-2024/1728986428790.pdf
‚úÖ Saved: SEBI_RHPs/Waaree_RHP__1728986428790.pdf

‚úÖ Completed. Files downloaded:
{'company': 'Waaree', 'filing_url': 'https://www.sebi.gov.in/filings/public-issues/oct-2024/waaree-energies-limited-rhp_87562.html', 'pdf_url': 'https://www.sebi.gov.in/sebi_data/attachdocs/oct-2024/1728986428790.pdf', 'local_path': 'SEBI_RHPs/Waaree_RHP__1728986428790.pdf', 'type': 'RHP'}
