In [None]:
"""
gold_scraper.py
Scrape daily Indian gold prices (₹/g & ₹/10 g, End-of-Day / High / Low)
for **any period** of contiguous months and save to CSV.

▶  pip install requests beautifulsoup4 pandas
"""


import calendar
import re
from datetime import date, datetime
from typing import Iterator, List, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL template for one month's page; _url() fills {month} with the lowercase
# English month name and {year} with the year number.
BASE = "https://www.goldpriceindia.com/gold-price-{month}-{year}.php"
# Request headers sent with every page fetch.
# NOTE(review): presumably the site rejects the default python-requests
# User-Agent — confirm before removing.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# One match per daily entry in the page's flattened text.
# Capture groups: 1-3 = day, month name, year; 4-6 = the three ₹/g amounts;
# 7-9 = the three ₹/10 g amounts (_scrape_month reads each triple as
# end-of-day, high, low).  re.S lets ".*?" skip across newlines.
PAT = re.compile(
    r"Gold Price on (\d{1,2}) ([A-Za-z]+) (\d{4}).*?"
    r"₹/g\s*₹([\d,]+)\s*₹([\d,]+)\s*₹([\d,]+).*?"
    r"₹/10 g\s*₹([\d,]+)\s*₹([\d,]+)\s*₹([\d,]+)", flags=re.S
)


def _month_iter(start: date, end: date) -> List[Tuple[int, int]]:
    ym0, ym1 = 12 * start.year + start.month - 1, 12 * end.year + end.month - 1
    for ym in range(ym0, ym1 + 1):
        y, m = divmod(ym, 12)
        yield y, m + 1


def _url(y: int, m: int) -> str:
    """
    Build the page URL for year *y* and month number *m* (1-12).
    """
    # The site's URLs use the full English month name in lowercase.
    month_slug = calendar.month_name[m].lower()
    return BASE.format(year=y, month=month_slug)


def _clean(n: str) -> int:
    return int(n.replace("₹", "").replace(",", "").strip())


def _scrape_month(y: int, m: int) -> pd.DataFrame:
    """
    Download the page for year *y*, month *m* and parse its daily entries.

    Returns a DataFrame with one row per entry matched by ``PAT`` and the
    columns ``date``, ``end_per_g``, ``high_per_g``, ``low_per_g``,
    ``end_per_10g``, ``high_per_10g``, ``low_per_10g``.  When nothing
    matches, the returned DataFrame is empty and has no columns.

    NOTE: the HTTP status code is not checked; a response whose text holds
    no matching entries simply produces an empty DataFrame.
    """
    html = requests.get(_url(y, m), headers=HEADERS, timeout=30).text
    # Flatten the markup to text and turn non-breaking spaces into plain
    # spaces so the literal spaces in PAT can match.
    text = BeautifulSoup(html, "html.parser").get_text("\n").replace("\xa0", " ")
    rows = []
    # The match object gets its own name so it no longer overwrites the
    # month parameter `m`.
    for match in PAT.finditer(text):
        day, mon, yr = match.group(1, 2, 3)
        dt = datetime.strptime(f"{day} {mon} {yr}", "%d %B %Y").date()
        g_end, g_hi, g_lo, x_end, x_hi, x_lo = map(
            _clean, match.group(4, 5, 6, 7, 8, 9)
        )
        rows.append(
            dict(
                date=dt,
                end_per_g=g_end,
                high_per_g=g_hi,
                low_per_g=g_lo,
                end_per_10g=x_end,
                high_per_10g=x_hi,
                low_per_10g=x_lo,
            )
        )
    return pd.DataFrame(rows)


# ---------------------------------------------------------------------
def scrape_gold_prices(
    start: date,
    end: date,
    out_csv: str | None = None,
) -> pd.DataFrame:
    """
    Scrape every month from *start* through *end* (inclusive).
    If `out_csv` is None, an automatic name like
       gold_2009-01_to_2025-06.csv
    is generated.
    """
    if start > end:
        raise ValueError("start must be ≤ end")

    frames = []
    for y, m in _month_iter(start, end):
        print(f"· {calendar.month_abbr[m]} {y}", end=" ", flush=True)
        df_m = _scrape_month(y, m)
        print(f"{len(df_m)} rows")
        frames.append(df_m)

    df = pd.concat(frames, ignore_index=True).sort_values("date").reset_index(drop=True)

    # -------- automatic file name ------------------------------------
    if out_csv is None:
        out_csv = f"gold_{start.year:04d}-{start.month:02d}_to_{end.year:04d}-{end.month:02d}.csv"

    df.to_csv(out_csv, index=False)
    print(f"\n✔ Saved {len(df)} rows to {out_csv}")
    return df


# ------------------ quick demo --------------------------------------
if __name__ == "__main__":
    # Demo run: January 2009 through June 2025, CSV name generated automatically.
    scrape_gold_prices(start=date(2009, 1, 1), end=date(2025, 6, 1))


· Jan 2009 26 rows
· Feb 2009 14 rows
· Mar 2009 31 rows
· Apr 2009 30 rows
· May 2009 31 rows
· Jun 2009 30 rows
· Jul 2009 31 rows
· Aug 2009 31 rows
· Sep 2009 30 rows
· Oct 2009 31 rows
· Nov 2009 30 rows
· Dec 2009 27 rows
· Jan 2010 31 rows
· Feb 2010 28 rows
· Mar 2010 31 rows
· Apr 2010 30 rows
· May 2010 31 rows
· Jun 2010 30 rows
· Jul 2010 31 rows
· Aug 2010 31 rows
· Sep 2010 18 rows
· Oct 2010 19 rows
· Nov 2010 30 rows
· Dec 2010 31 rows
· Jan 2011 31 rows
· Feb 2011 28 rows
· Mar 2011 31 rows
· Apr 2011 30 rows
· May 2011 31 rows
· Jun 2011 30 rows
· Jul 2011 31 rows
· Aug 2011 31 rows
· Sep 2011 30 rows
· Oct 2011 31 rows
· Nov 2011 30 rows
· Dec 2011 31 rows
· Jan 2012 31 rows
· Feb 2012 29 rows
· Mar 2012 31 rows
· Apr 2012 30 rows
· May 2012 31 rows
· Jun 2012 30 rows
· Jul 2012 31 rows
· Aug 2012 31 rows
· Sep 2012 30 rows
· Oct 2012 31 rows
· Nov 2012 30 rows
· Dec 2012 31 rows
· Jan 2013 31 rows
· Feb 2013 24 rows
· Mar 2013 31 rows
· Apr 2013 30 rows
· May 2013 3