# Introduction

In this notebook we will scrape CBSL website and download daily price reports

In [5]:
import asyncio
import re
from datetime import datetime
from urllib.parse import urljoin
from playwright.async_api import async_playwright

In [6]:
BASE_URL = 'https://www.cbsl.gov.lk'
LIST_URL   = f"{BASE_URL}/daily-price-report"
START_YEAR = 2020

In [8]:
async def main():
    print("Navigating to:", LIST_URL)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)  # open a visible browser for debugging
        page = await browser.new_page()
        await page.goto(LIST_URL, timeout=60000)

        print("✅ Page loaded:", page.url)

        html = await page.content()
        print("🔍 HTML snippet:\n", html[:500])

if __name__ == "__main__":
    await main()

Navigating to: https://www.cbsl.gov.lk/daily-price-report
✅ Page loaded: https://www.cbsl.gov.lk/daily-price-report
🔍 HTML snippet:
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr" class="js"><head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<link rel="shortcut icon" href="https://www.cbsl.gov.lk/sites/default/files/logo_0_0.jpg" type="image/jpeg">
<meta name="generator" content="Drupal 7 (http://drupal.org)">
<link rel="canonical" href="https://www.


In [15]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

BASE = "https://www.cbsl.gov.lk"
AJAX_URL = f"{BASE}/en/views/ajax"

async def fetch_page(session, page_num):
    payload = {
        "view_name": "price_report",
        "view_display_id": "block_1",
        "view_path": "node/144",
        "view_base_path": "publications/price",
        "page": page_num,
        "pager_element": 0,
    }
    async with session.post(AJAX_URL, data=payload) as resp:
        js = await resp.json()
        html_fragment = None
        # find the "insert" command object and extract its data
        for item in js:
            if isinstance(item, dict) and item.get("command") == "insert":
                html_fragment = item.get("data")
                break
        if not html_fragment:
            return []

        soup = BeautifulSoup(html_fragment, "html.parser")
        rows = []
        for a in soup.select("a[href$='.pdf']"):
            href = urljoin(BASE, a["href"])
            text = a.get_text(strip=True)
            try:
                date = datetime.strptime(text.split("-")[-1].strip(), "%d %B %Y")
            except Exception:
                date = None
            rows.append((date, href))
        return rows


async def main():
    async with aiohttp.ClientSession() as session:
        all_links = []
        page = 0
        while True:
            rows = await fetch_page(session, page)
            if not rows:
                break
            print(f"📄 Page {page}: {len(rows)} reports")
            all_links.extend(rows)
            page += 1
        print(f"✅ Collected {len(all_links)} total reports")
        for d, u in all_links[:5]:
            print(d, "→", u)

if __name__ == "__main__":
    await main()

📄 Page 0: 24 reports
📄 Page 1: 24 reports
📄 Page 2: 24 reports
📄 Page 3: 24 reports
📄 Page 4: 24 reports
📄 Page 5: 24 reports
📄 Page 6: 24 reports
📄 Page 7: 24 reports
📄 Page 8: 24 reports
📄 Page 9: 24 reports
📄 Page 10: 24 reports
📄 Page 11: 24 reports
📄 Page 12: 24 reports
📄 Page 13: 24 reports
📄 Page 14: 24 reports
📄 Page 15: 24 reports
📄 Page 16: 24 reports
📄 Page 17: 24 reports
📄 Page 18: 24 reports
📄 Page 19: 24 reports
📄 Page 20: 24 reports
📄 Page 21: 24 reports
📄 Page 22: 24 reports
📄 Page 23: 24 reports
📄 Page 24: 24 reports
📄 Page 25: 24 reports
📄 Page 26: 24 reports
📄 Page 27: 24 reports
📄 Page 28: 24 reports
📄 Page 29: 24 reports
📄 Page 30: 24 reports
📄 Page 31: 24 reports
📄 Page 32: 24 reports
📄 Page 33: 24 reports
📄 Page 34: 24 reports
📄 Page 35: 24 reports
📄 Page 36: 24 reports
📄 Page 37: 24 reports
📄 Page 38: 24 reports
📄 Page 39: 24 reports
📄 Page 40: 24 reports


CancelledError: 

In [20]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import os

BASE = "https://www.cbsl.gov.lk"
AJAX_URL = f"{BASE}/en/views/ajax"
OUTPUT_DIR = "cbsldata"
os.makedirs(OUTPUT_DIR, exist_ok=True)


async def fetch_page(session, page_num):
    payload = {
        "view_name": "price_report",
        "view_display_id": "block_1",
        "view_path": "node/144",
        "view_base_path": "publications/price",
        "pager_element": 0,
        "page": page_num,
        "field_year_tid": "88",   # <-- 2025
        "field_month_tid": "All", # <-- no specific month
    }
    async with session.post(AJAX_URL, data=payload) as resp:
        js = await resp.json()
        html_fragment = None
        for item in js:
            if isinstance(item, dict) and item.get("command") == "insert":
                html_fragment = item.get("data")
                break
        if not html_fragment:
            return []

        soup = BeautifulSoup(html_fragment, "html.parser")
        rows = []
        for a in soup.select("a[href$='.pdf']"):
            href = urljoin(BASE, a["href"])
            text = a.get_text(strip=True)
            try:
                date = datetime.strptime(text.split("-")[-1].strip(), "%d %B %Y")
            except Exception:
                date = None
            rows.append((date, href))
        return rows


async def download_pdf(session, date_obj, url):
    fname = f"{date_obj.strftime('%Y-%m-%d') if date_obj else 'unknown'}.pdf"
    fpath = os.path.join(OUTPUT_DIR, fname)
    if os.path.exists(fpath):
        print(f"⚪ Skipping {fname} (already exists)")
        return
    try:
        async with session.get(url) as resp:
            if resp.status == 200:
                with open(fpath, "wb") as f:
                    f.write(await resp.read())
                print(f"✅ Saved {fname}")
            else:
                print(f"⚠️ Failed {url} [{resp.status}]")
    except Exception as e:
        print(f"❌ Error {url}: {e}")


async def main():
    async with aiohttp.ClientSession() as session:
        all_links = []
        # only fetch first 5 pages
        for page in range(5):
            rows = await fetch_page(session, page)
            print(f"📄 Page {page}: {len(rows)} reports")
            all_links.extend(rows)

        print(f"🔗 Total {len(all_links)} PDFs to download")

        # download concurrently (limit 5 at a time)
        sem = asyncio.Semaphore(5)
        async def bounded_download(date_obj, url):
            async with sem:
                await download_pdf(session, date_obj, url)

        tasks = [bounded_download(d, u) for d, u in all_links]
        await asyncio.gather(*tasks)

        print("🎉 Done! All files saved in ./cbsldata")


if __name__ == "__main__":
    await main()


📄 Page 0: 24 reports
📄 Page 1: 24 reports
📄 Page 2: 24 reports
📄 Page 3: 24 reports
📄 Page 4: 24 reports
🔗 Total 120 PDFs to download
✅ Saved 2025-10-17.pdf
✅ Saved unknown.pdf
✅ Saved unknown.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved unknown.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-16.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-14.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-15.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-10.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-08.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-09.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-13.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-02.pdf
✅ Saved 2025-10-03.pdf
⚪ Skipping unknown.pdf (already exists)
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-07.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 2025-10-01.pdf
⚪ Skipping unknown.pdf (already exists)
✅ Saved 20

CancelledError: 

In [21]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import os

BASE = "https://www.cbsl.gov.lk"
AJAX_URL = f"{BASE}/en/views/ajax"
OUTPUT_DIR = "cbsldata_2025"
os.makedirs(OUTPUT_DIR, exist_ok=True)


async def fetch_page(session, page_num, year_id="88"):
    """Fetch one paginated batch of report links for a given year."""
    payload = {
        "view_name": "price_report",
        "view_display_id": "block_1",
        "view_path": "node/144",
        "view_base_path": "publications/price",
        "pager_element": 0,
        "page": page_num,
        "field_year_tid": year_id,   # 88 → 2025
        "field_month_tid": "All",
    }
    async with session.post(AJAX_URL, data=payload) as resp:
        if resp.status != 200:
            print(f"⚠️  Page {page_num} failed [{resp.status}]")
            return []

        js = await resp.json()
        html_fragment = next(
            (item.get("data") for item in js if isinstance(item, dict) and item.get("command") == "insert"),
            None,
        )
        if not html_fragment:
            return []

        soup = BeautifulSoup(html_fragment, "html.parser")
        rows = []
        for a in soup.select("a[href$='.pdf']"):
            href = urljoin(BASE, a["href"])
            text = a.get_text(strip=True)
            try:
                date = datetime.strptime(text.split("-")[-1].strip(), "%d %B %Y")
            except Exception:
                date = None
            rows.append((date, href))
        return rows


async def download_pdf(session, date_obj, url):
    """Download one PDF safely."""
    fname = f"{date_obj.strftime('%Y-%m-%d') if date_obj else 'unknown'}.pdf"
    fpath = os.path.join(OUTPUT_DIR, fname)
    if os.path.exists(fpath):
        print(f"⚪ Skipping {fname} (already exists)")
        return

    try:
        async with session.get(url) as resp:
            if resp.status == 200:
                with open(fpath, "wb") as f:
                    f.write(await resp.read())
                print(f"✅ Saved {fname}")
            else:
                print(f"⚠️  Failed {url} [{resp.status}]")
    except Exception as e:
        print(f"❌ Error {url}: {e}")


async def main():
    async with aiohttp.ClientSession() as session:
        all_links = []
        page = 0

        # Keep fetching until CBSL returns no reports
        while True:
            rows = await fetch_page(session, page)
            if not rows:
                print(f"⛔ No more data after page {page - 1}")
                break
            print(f"📄 Page {page}: {len(rows)} reports")
            all_links.extend(rows)
            page += 1
            await asyncio.sleep(1)  # polite pause between requests

        print(f"🔗 Total {len(all_links)} PDFs to download")

        # download concurrently (limit 5 at a time)
        sem = asyncio.Semaphore(5)

        async def bounded_download(date_obj, url):
            async with sem:
                await download_pdf(session, date_obj, url)

        tasks = [bounded_download(d, u) for d, u in all_links]
        await asyncio.gather(*tasks)

        print("🎉 Done! All 2025 reports saved in ./cbsldata_2025")


# In Jupyter or interactive environments
await main()


📄 Page 0: 24 reports
📄 Page 1: 24 reports
📄 Page 2: 24 reports
📄 Page 3: 24 reports


  m = tuple(map(os.fspath, m))


CancelledError: 