In [1]:
pip install selenium webdriver-manager


Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.6.15 (from selenium)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.p



In [9]:
import gzip
import hashlib
import io
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Dict, List, Optional
from xml.sax.saxutils import unescape as xml_unescape

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtparse


# Config
# Timezone used by now_dubai() and aware_dt() for every timestamp they return.
DUBAI_TZ = pytz.timezone("Asia/Dubai")
# Browser-like headers sent with the VisitDubai, sitemap and Platinumlist requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}
# Value passed as `timeout=` to every requests.get call (seconds).
REQ_TIMEOUT = 20
# Value passed to time.sleep() between consecutive requests (seconds).
PAUSE = 0.4  

# Eventbrite
# The API token is read from the environment only; no credential is kept in the
# notebook. A token that was ever committed in source should be treated as
# compromised and rotated.
EVENTBRITE_TOKEN = os.getenv("EVENTBRITE_TOKEN", "").strip()
# The Eventbrite source is enabled only when a non-empty token is configured.
USE_EVENTBRITE = bool(EVENTBRITE_TOKEN)


# Helpers
def now_dubai() -> datetime:
    """Return the current time as a timezone-aware datetime in DUBAI_TZ."""
    current = datetime.now(tz=DUBAI_TZ)
    return current

def aware_dt(s: Optional[str]) -> Optional[datetime]:
    """Parse a date string into a DUBAI_TZ-aware datetime.

    Naive results are localized to DUBAI_TZ; results that already carry a
    timezone are converted to it. Returns None for empty input or when
    parsing/conversion raises.
    """
    if not s:
        return None
    try:
        parsed = dtparse.parse(s)
        is_naive = parsed.tzinfo is None
        return DUBAI_TZ.localize(parsed) if is_naive else parsed.astimezone(DUBAI_TZ)
    except Exception:
        # Best-effort: any unparsable value is reported as "no date".
        return None

def to_iso(d: Optional[datetime]) -> Optional[str]:
    """Return ``d.isoformat()``, or None when ``d`` is falsy."""
    if not d:
        return None
    return d.isoformat()

def make_uid(parts) -> str:
    """Return the SHA-1 hex digest of ``parts`` joined with ``|``.

    Falsy parts (None, empty string) contribute an empty field.
    """
    joined = "|".join(p if p else "" for p in parts)
    digest = hashlib.sha1(joined.encode("utf-8"))
    return digest.hexdigest()

def norm_row(source, source_id, title, start_dt, end_dt,
             url=None, venue=None, address=None, city="Dubai", country="UAE",
             status="scheduled", category=None, organizer=None, price=None, image=None) -> Dict:
    """Build one normalized event record (a flat dict) from source-specific fields.

    The ``uid`` is derived from source, source id, the raw title and the ISO
    start time; ``start``/``end``/``ingested_at`` are ISO strings (or None).
    """
    sid = str(source_id or "")
    start_iso = to_iso(start_dt)
    end_iso = to_iso(end_dt)
    # The uid uses the title as given (not stripped), unlike the stored title.
    uid = make_uid([source, sid, title or "", start_iso or ""])
    clean_title = (title or "").strip()
    ingested = to_iso(now_dubai())
    return {
        "uid": uid,
        "source": source,
        "source_id": sid,
        "title": clean_title,
        "start": start_iso,
        "end": end_iso,
        "status": status,
        "url": url,
        "venue": venue,
        "address": address,
        "city": city,
        "country": country,
        "category": category,
        "organizer": organizer,
        "price": price,
        "image": image,
        "ingested_at": ingested,
    }

def keep_future_or_ongoing(row: Dict, assume_future_if_missing: bool = True) -> bool:
    """Decide whether an event row is upcoming or still running.

    A row is kept when its start or end is at/after the current Dubai time, or
    when it starts today. Rows where neither date parses are kept only if
    ``assume_future_if_missing`` is truthy (the relaxed default).
    """
    now = now_dubai()
    start_raw, end_raw = row.get("start"), row.get("end")
    s = aware_dt(start_raw) if start_raw else None
    e = aware_dt(end_raw) if end_raw else None

    starts_later_or_today = bool(s) and (s >= now or s.date() == now.date())
    ends_later = bool(e) and e >= now
    if starts_later_or_today or ends_later:
        return True
    return assume_future_if_missing and s is None and e is None

def dedupe(rows: List[Dict]) -> List[Dict]:
    """Drop rows whose ``uid`` was already seen, keeping the first occurrence and input order."""
    first_by_uid = {}
    for row in rows:
        # setdefault only stores the row the first time a uid appears.
        first_by_uid.setdefault(row["uid"], row)
    return list(first_by_uid.values())


# Eventbrite 

def fetch_eventbrite_dubai() -> List[Dict]:
    """Page through the Eventbrite event search for "Dubai" and return normalized rows.

    Returns an empty list when USE_EVENTBRITE is falsy. Paging stops on the
    first non-200 response, a non-JSON body, an empty ``events`` list, or when
    the response's ``pagination.has_more_items`` flag is explicitly False.
    Rows gathered before a stop condition are always returned.
    """
    rows: List[Dict] = []
    if not USE_EVENTBRITE:
        return rows

    # NOTE(review): this search endpoint may have been retired by Eventbrite —
    # confirm against the current API docs. A non-200 simply ends the loop.
    url = "https://www.eventbriteapi.com/v3/events/search/"
    params = {
        "q": "Dubai",
        "location.address": "Dubai",
        "expand": "venue,category,organizer,logo",
        "sort_by": "date",
        "page": 1,
    }
    headers = {"Authorization": f"Bearer {EVENTBRITE_TOKEN}"}
    address_keys = ("address_1", "address_2", "city", "region", "postal_code")

    while True:
        r = requests.get(url, params=params, headers=headers, timeout=REQ_TIMEOUT)
        if r.status_code != 200:
            print("Eventbrite HTTP", r.status_code, "- stopping.")
            break
        try:
            data = r.json()
        except ValueError:
            # Keep what was collected so far instead of losing it to an exception.
            print("Eventbrite returned a non-JSON body - stopping.")
            break
        if not isinstance(data, dict):
            break

        evs = data.get("events") or []
        if not evs:
            break

        for ev in evs:
            venue_obj = ev.get("venue") or {}
            addr = venue_obj.get("address") or {}
            address = ", ".join(addr[k] for k in address_keys if addr.get(k)) or None
            rows.append(norm_row(
                "eventbrite",
                ev.get("id"),
                (ev.get("name") or {}).get("text"),
                aware_dt((ev.get("start") or {}).get("utc")),
                aware_dt((ev.get("end") or {}).get("utc")),
                ev.get("url"),
                venue_obj.get("name"),
                address,
                category=(ev.get("category") or {}).get("name"),
                organizer=(ev.get("organizer") or {}).get("name"),
                image=(ev.get("logo") or {}).get("url"),
            ))

        # Stop without an extra round-trip when the API says this was the last page.
        pagination = data.get("pagination")
        if isinstance(pagination, dict) and pagination.get("has_more_items") is False:
            break

        params["page"] += 1
        time.sleep(PAUSE)

    print("Eventbrite rows:", len(rows))
    return rows


# VisitDubai (skip gracefully if non-200)

def _parse_visitdubai_dates(date_text: Optional[str]):
    """Convert a card's date label into ``(start, end)`` Dubai-aware datetimes; either may be None."""
    if not date_text:
        return None, None

    parts = re.split(r"\s*[-–—]\s*", date_text)
    if len(parts) != 2:
        # Single date (or something that is not a simple "A - B" range).
        return aware_dt(date_text), None

    start_text, end_text = parts
    end_dt = aware_dt(end_text)
    if end_dt is None:
        return aware_dt(start_text), None

    # Let the start inherit whatever it omits (year, month) from the end date
    # instead of blindly appending the current year, which broke labels whose
    # start already carried a year.
    anchor = end_dt.replace(tzinfo=None, hour=0, minute=0, second=0, microsecond=0)
    try:
        start_dt = dtparse.parse(start_text, default=anchor)
    except (ValueError, OverflowError):
        return None, end_dt
    if start_dt.tzinfo is None:
        start_dt = DUBAI_TZ.localize(start_dt)
    else:
        start_dt = start_dt.astimezone(DUBAI_TZ)

    # A range such as "28 Dec - 3 Jan 2026" crosses New Year: the inherited
    # year puts the start after the end, so move it back one year.
    year_given = re.search(r"\b\d{4}\b", start_text) is not None
    if start_dt.date() > end_dt.date() and not year_given:
        try:
            start_dt = start_dt.replace(year=start_dt.year - 1)
        except ValueError:
            # 29 Feb has no counterpart in the previous year; keep as parsed.
            pass
    return start_dt, end_dt


def parse_visitdubai_card(card) -> Optional[Dict]:
    """Turn one VisitDubai listing element into a normalized row.

    Args:
        card: a BeautifulSoup element for one listing card.

    Returns:
        The ``norm_row`` dict for the card, or None when the element contains
        no descendant ``<a href>``.
    """
    # NOTE(review): select_one only searches descendants, so when `card` is
    # itself an <a> element (fetch_visitdubai's selector includes bare anchors)
    # this returns None and the card is skipped.
    a = card.select_one("a[href]")
    if not a:
        return None
    href = a["href"]
    if href.startswith("//"):
        # Protocol-relative link: only the scheme is missing.
        url = "https:" + href
    elif href.startswith("/"):
        url = "https://www.visitdubai.com" + href
    else:
        url = href

    title_el = card.select_one("h3, h2, .card__title, .c-card__title, [data-test='title']")
    title = title_el.get_text(" ", strip=True) if title_el else "Event"

    date_el = card.select_one(".card__date, .event-card__date, .c-card__date, .date, [data-test='date']")
    date_text = date_el.get_text(" ", strip=True) if date_el else None
    start_dt, end_dt = _parse_visitdubai_dates(date_text)

    venue_el = card.select_one(".event-card__venue, .venue, .c-card__subtitle, [data-test='venue']")
    venue = venue_el.get_text(" ", strip=True) if venue_el else None

    img_el = card.select_one("img[src], img[data-src]")
    image = (img_el.get("src") or img_el.get("data-src")) if img_el else None

    # The page URL doubles as the source id.
    return norm_row("visitdubai", url, title, start_dt, end_dt, url, venue, None, image=image)

def fetch_visitdubai(max_pages: int = 3) -> List[Dict]:
    """Scrape up to ``max_pages`` VisitDubai listing pages into normalized rows.

    Stops at the first page that does not answer with HTTP 200 and returns
    whatever was collected before it.
    """
    listing_url = "https://www.visitdubai.com/en/whats-on/dubai-events"
    card_selector = "article, .event-card, .c-card, li a[href*='/en/whats-on/'], a[href*='/en/whats-on/']"
    collected = []

    for page in range(1, max_pages + 1):
        page_url = f"{listing_url}?page={page}" if page != 1 else listing_url
        response = requests.get(page_url, headers=HEADERS, timeout=REQ_TIMEOUT)
        if response.status_code != 200:
            print("VisitDubai HTTP", response.status_code, "- skipping.")
            break

        document = BeautifulSoup(response.text, "html.parser")
        parsed = [row for row in map(parse_visitdubai_card, document.select(card_selector)) if row]
        collected.extend(parsed)
        print(f"VisitDubai page {page}: parsed {len(parsed)} rows")
        time.sleep(PAUSE)

    print("VisitDubai rows:", len(collected))
    return collected


# Platinumlist
# robots.txt files that _harvest_from_robots() scans for "Sitemap:" lines.
ROBOTS = [
    "https://platinumlist.net/robots.txt",
    "https://dubai.platinumlist.net/robots.txt",
]
# Fallback sitemap URLs that _harvest_from_robots() always appends to the
# sitemaps it discovered in robots.txt.
SITEMAP_SEEDS = [
    "https://platinumlist.net/sitemap.xml",
    "https://platinumlist.net/sitemap_index.xml",
    "https://dubai.platinumlist.net/sitemap.xml",
    "https://dubai.platinumlist.net/sitemap_index.xml",
]

def _fetch_bytes(url: str) -> Optional[bytes]:
    """GET ``url`` and return the raw body, or None on any error, non-200 status or empty body."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQ_TIMEOUT, allow_redirects=True)
        body = response.content if response.status_code == 200 else None
        return body or None
    except Exception:
        # Best-effort fetch: callers treat None as "nothing available".
        return None

def _fetch_text_or_gzip(url: str) -> Optional[str]:
    """Fetch XML or XML.GZ and return text."""
    b = _fetch_bytes(url)
    if not b:
        return None
    # try decompress if .gz, otherwise decode
    if url.lower().split("?")[0].endswith(".gz"):
        try:
            with gzip.GzipFile(fileobj=io.BytesIO(b)) as gz:
                return gz.read().decode("utf-8", errors="replace")
        except Exception:
            try:
                return b.decode("utf-8", errors="replace")
            except Exception:
                return None
    else:
        try:
            return b.decode("utf-8", errors="replace")
        except Exception:
            # sometimes servers send gzipped content with wrong extension
            try:
                with gzip.GzipFile(fileobj=io.BytesIO(b)) as gz:
                    return gz.read().decode("utf-8", errors="replace")
            except Exception:
                return None

def _parse_sitemap_locs(xml_text: str) -> List[str]:
    # Extract <loc>...</loc>
    locs = re.findall(r"<loc>\s*([^<]+)\s*</loc>", xml_text, flags=re.I)
    return [u.strip() for u in locs if u.strip()]

def _harvest_from_robots() -> List[str]:
    """Collect sitemap URLs advertised in the ROBOTS files, followed by SITEMAP_SEEDS.

    The result is de-duplicated while keeping first-seen order.
    """
    discovered = []
    for robots_url in ROBOTS:
        body = _fetch_text_or_gzip(robots_url)
        if not body:
            continue
        for raw_line in body.splitlines():
            stripped = raw_line.strip()
            if not stripped.lower().startswith("sitemap:"):
                continue
            # Everything after the first colon is the sitemap URL.
            _, _, value = stripped.partition(":")
            value = value.strip()
            if value:
                discovered.append(value)

    # Seeds go last so robots.txt entries win the ordering; dict keys keep
    # insertion order, which de-duplicates without reordering.
    return list(dict.fromkeys(discovered + SITEMAP_SEEDS))

def _expand_sitemaps(seed_urls: List[str], max_depth: int = 2) -> List[str]:
    """Walk sitemap files breadth-first and return every ``<loc>`` value found.

    Starting from ``seed_urls`` (depth 0), each fetched document contributes
    all of its ``<loc>`` values to the result. Values that look like further
    sitemaps are queued for fetching while the current depth is below
    ``max_depth``. The result is de-duplicated in first-seen order.
    """
    found = []
    fetched = set()
    pending = [(seed, 0) for seed in seed_urls]

    while pending:
        current, level = pending.pop(0)
        if current in fetched:
            continue
        fetched.add(current)

        text = _fetch_text_or_gzip(current)
        locs = _parse_sitemap_locs(text) if text else []
        if not locs:
            continue
        found.extend(locs)

        if level < max_depth:
            for loc in locs:
                lowered = loc.lower()
                # Heuristic: the URL mentions "sitemap" or ends in an XML extension.
                if "sitemap" in lowered or lowered.endswith((".xml", ".xml.gz")):
                    pending.append((loc, level + 1))

        # Pause only after a document that actually yielded links.
        time.sleep(PAUSE)

    return list(dict.fromkeys(found))

def harvest_platinum_sitemaps(max_links: int = 300) -> List[str]:
    """Return up to ``max_links`` Platinumlist event-detail URLs found via sitemaps.

    Only URLs containing ``/event/`` are kept. URLs mentioning "dubai" are
    listed first, and duplicates are removed before the cap is applied.
    """
    def _absolute(link: str) -> str:
        # Add the scheme to protocol-relative links and the host to root-relative ones.
        if link.startswith("//"):
            link = "https:" + link
        if link.startswith("/"):
            link = "https://platinumlist.net" + link
        return link

    all_locs = _expand_sitemaps(_harvest_from_robots(), max_depth=2)
    event_urls = [_absolute(loc) for loc in all_locs if "/event/" in loc.lower()]

    # Stable sort: "dubai" URLs (key False) come first, original order kept within each group.
    dubai_first = sorted(event_urls, key=lambda link: "dubai" not in link.lower())

    unique = list(dict.fromkeys(dubai_first))
    return unique[:max_links]

def _first_event_jsonld_from_html(html: str) -> Optional[dict]:
    """Return the first schema.org Event object found in the page's JSON-LD, or None.

    Every ``<script type="application/ld+json">`` tag is inspected in document
    order. An object counts as an event when any of its ``@type`` values is
    ``Event`` or an Event subtype (``MusicEvent``, ``Festival``, ...). Top-level
    lists and ``@graph`` containers are searched as well. Script tags with
    invalid JSON are skipped.
    """
    extra_event_types = {"festival", "hackathon"}

    def _is_event(node: dict) -> bool:
        types = node.get("@type", "")
        if not isinstance(types, (list, tuple)):
            types = [types]
        for t in types:
            # Tolerate prefixed/IRI forms such as "schema:Event" or ".../Event".
            name = str(t).rsplit("/", 1)[-1].rsplit(":", 1)[-1].lower()
            if name.endswith("event") or name in extra_event_types:
                return True
        return False

    def _find(node) -> Optional[dict]:
        if isinstance(node, list):
            for item in node:
                hit = _find(item)
                if hit is not None:
                    return hit
        elif isinstance(node, dict):
            if _is_event(node):
                return node
            if "@graph" in node:
                return _find(node["@graph"])
        return None

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.select('script[type="application/ld+json"]'):
        raw = (tag.string or tag.text or "").strip()
        if not raw:
            continue
        try:
            blob = json.loads(raw)
        except ValueError:
            continue
        found = _find(blob)
        if found is not None:
            return found
    return None

def parse_platinum_detail_fast(url: str) -> Optional[Dict]:
    """Fetch one Platinumlist event page and build a row from its JSON-LD Event.

    Returns None when the page is not HTTP 200, carries no Event JSON-LD, or
    anything goes wrong while fetching/parsing (this function never raises, so
    one bad page cannot abort the thread pool that calls it).

    ``location`` may be a string, a Place object or a list of them; its
    ``address`` may be a string or a PostalAddress object. ``image`` may be a
    URL, an ImageObject or a list of either.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQ_TIMEOUT)
        if r.status_code != 200:
            return None
        data = _first_event_jsonld_from_html(r.text)
        if not data:
            return None

        title = data.get("name") or "Event"
        start_raw, end_raw = data.get("startDate"), data.get("endDate")
        start_dt = aware_dt(start_raw) if isinstance(start_raw, str) else None
        end_dt = aware_dt(end_raw) if isinstance(end_raw, str) else None

        loc = data.get("location")
        if isinstance(loc, list):
            # Several locations: use the first usable one.
            loc = next((item for item in loc if isinstance(item, (dict, str))), None)

        venue = address = None
        if isinstance(loc, str):
            venue = loc.strip() or None
        elif isinstance(loc, dict):
            addr = loc.get("address")
            if isinstance(addr, str):
                address = addr.strip() or None
            elif isinstance(addr, dict):
                fields = (addr.get(k) for k in (
                    "streetAddress", "addressLocality", "addressRegion",
                    "postalCode", "addressCountry",
                ))
                address = ", ".join(f.strip() for f in fields if isinstance(f, str) and f.strip()) or None
            venue = loc.get("name") or address

        img = data.get("image")
        if isinstance(img, list):
            img = img[0] if img else None
        if isinstance(img, dict):
            img = img.get("url") or img.get("contentUrl")
        image = img if isinstance(img, str) else None

        # The page URL doubles as the source id.
        return norm_row("platinumlist", url, title, start_dt, end_dt, url, venue, address, image=image)
    except Exception:
        # Deliberately best-effort: a failing page is skipped.
        return None

def fetch_platinumlist_via_sitemaps(max_pages: int = 2, per_page_limit: int = 120) -> List[Dict]:
    """Harvest Platinumlist event URLs from sitemaps and parse them concurrently.

    At most ``per_page_limit * max_pages`` URLs are fetched. Pages that yield
    no row are dropped and the remaining rows are de-duplicated by uid. Row
    order follows completion order of the worker threads.
    """
    links = harvest_platinum_sitemaps(max_links=per_page_limit * max_pages)
    print("Sitemap event links:", len(links))
    if not links:
        return []

    # Between 2 and 12 worker threads, depending on how many links there are.
    pool_size = min(12, max(2, len(links)))
    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = [pool.submit(parse_platinum_detail_fast, link) for link in links]
        parsed = [row for row in (job.result() for job in as_completed(pending)) if row]

    print("Parsed event rows:", len(parsed))
    return dedupe(parsed)

# Runner

def run(csv_path: str = "dubai_events.csv",
        include_eventbrite: bool = USE_EVENTBRITE,
        include_visitdubai: bool = False,      # off by default: the listing returned 404s earlier
        include_platinumlist: bool = True,
        pages: int = 2,
        relaxed_filter: bool = True) -> pd.DataFrame:
    """Collect events from the enabled sources, filter, sort and save them as CSV.

    Args:
        csv_path: where the CSV is written (UTF-8 with BOM, no index column).
        include_eventbrite / include_visitdubai / include_platinumlist: source toggles.
        pages: forwarded as ``max_pages`` to the VisitDubai and Platinumlist fetchers.
        relaxed_filter: when True, rows without any parsable date are kept.

    Returns:
        The DataFrame that was written, sorted by start time and then title.
        A failing source is reported with a printed message and skipped.
    """
    output_columns = [
        "uid", "source", "source_id", "title", "start", "end", "status", "url", "venue",
        "address", "city", "country", "category", "organizer", "price", "image", "ingested_at",
    ]
    # (enabled?, failure message, zero-argument fetcher), in ingestion order.
    sources = (
        (include_eventbrite, "Eventbrite fetch failed:", lambda: fetch_eventbrite_dubai()),
        (include_visitdubai, "VisitDubai fetch failed:", lambda: fetch_visitdubai(max_pages=pages)),
        (include_platinumlist, "Platinumlist fetch failed:", lambda: fetch_platinumlist_via_sitemaps(max_pages=pages)),
    )

    collected: List[Dict] = []
    for enabled, failure_message, fetch in sources:
        if not enabled:
            continue
        try:
            collected.extend(fetch())
        except Exception as exc:
            print(failure_message, repr(exc))

    kept = [
        row for row in dedupe(collected)
        if keep_future_or_ongoing(row, assume_future_if_missing=relaxed_filter)
    ]
    df = pd.DataFrame(kept, columns=output_columns)

    def _safe_dt(value):
        # Parse an ISO string for sorting; missing or unparsable values become None.
        try:
            if pd.notna(value):
                return dtparse.parse(value)
            return None
        except Exception:
            return None

    if not df.empty:
        sort_key = "start_dt_sort"
        df[sort_key] = df["start"].apply(_safe_dt)
        df = df.sort_values(by=[sort_key, "title"], ascending=[True, True])
        df = df.drop(columns=[sort_key], errors="ignore")

    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(df)} events to {csv_path}")
    return df


In [10]:
from IPython.display import display

# Scrape Platinumlist only and keep rows even when their dates are missing.
df = run(
    csv_path="dubai_events.csv",
    pages=2,
    relaxed_filter=True,
    include_eventbrite=False,
    include_visitdubai=False,
    include_platinumlist=True,
)

# Preview the first rows, then summarise the row count and per-source totals.
preview = df.head(20)
display(preview)
print("Rows:", len(df))
source_counts = df["source"].value_counts(dropna=False)
print("By source:\n", source_counts)


Sitemap event links: 2
Parsed event rows: 0
Saved 0 events to dubai_events.csv


Unnamed: 0,uid,source,source_id,title,start,end,status,url,venue,address,city,country,category,organizer,price,image,ingested_at


Rows: 0
By source:
 Series([], Name: count, dtype: int64)
