In [1]:
pip install requests beautifulsoup4 pandas


Note: you may need to restart the kernel to use updated packages.




In [2]:
pip install schedule

Note: you may need to restart the kernel to use updated packages.




In [3]:
pip install streamlit pandas requests beautifulsoup4


Note: you may need to restart the kernel to use updated packages.




In [4]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.




In [5]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.




In [6]:
pip install requests beautifulsoup4 python-dateutil pytz pandas


Note: you may need to restart the kernel to use updated packages.




In [None]:


import os, re, time, json, hashlib
from datetime import datetime
from typing import List, Dict, Optional
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtparse
import pytz

# Config
DUBAI_TZ = pytz.timezone("Asia/Dubai")  # every parsed timestamp is normalised to this zone
HEADERS = {"User-Agent": "DubaiEventsCSV/1.0 (+respectful scraping; non-bulk)"}  # sent with the HTML scrapes
REQ_TIMEOUT = 20  # seconds; passed as `timeout=` to every requests.get call
PAUSE = 1.1  # politeness delay between HTTP requests
# SECURITY: the API token must come from the environment only. A credential was
# previously embedded here as the fallback value; it has been removed and should
# be treated as leaked (revoke/rotate it). With no token set, the Eventbrite
# source is simply skipped by the code that checks EVENTBRITE_TOKEN.
EVENTBRITE_TOKEN = os.getenv("EVENTBRITE_TOKEN", "").strip()

#  Helpers
def now_dubai() -> datetime:
    """Return the current moment as a timezone-aware datetime in DUBAI_TZ."""
    return datetime.now(tz=DUBAI_TZ)

def aware_dt(s: Optional[str]) -> Optional[datetime]:
    """Parse a date/time string into a datetime expressed in DUBAI_TZ.

    A value parsed without timezone information is taken to be Dubai local
    time; a value that carries an offset is converted to Dubai time.

    Returns None for empty input or when parsing/conversion fails for any
    reason (best-effort: errors are deliberately swallowed).
    """
    if not s:
        return None
    try:
        parsed = dtparse.parse(s)
        if parsed.tzinfo is not None:
            return parsed.astimezone(DUBAI_TZ)
        return DUBAI_TZ.localize(parsed)
    except Exception:
        return None

def to_iso(d: Optional[datetime]) -> Optional[str]:
    """Return ``d.isoformat()``, or None when *d* is missing."""
    if d:
        return d.isoformat()
    return None

def make_uid(parts) -> str:
    """Build a stable identifier from *parts*.

    Falsy parts (None, "") are treated as empty strings, the parts are joined
    with "|" and the UTF-8 bytes of the result are hashed with SHA-1.

    Returns the 40-character hexadecimal digest.
    """
    joined = "|".join(piece if piece else "" for piece in parts)
    digest = hashlib.sha1(joined.encode("utf-8"))
    return digest.hexdigest()

def norm_row(source, source_id, title, start_dt, end_dt,
             url=None, venue=None, address=None, city="Dubai", country="UAE",
             status="scheduled", category=None, organizer=None, price=None, image=None) -> Dict:
    """Assemble one event record in the common row schema.

    The ``uid`` is derived (via make_uid) from the source name, the stringified
    source id, the un-stripped title and the ISO start time. ``start``/``end``
    are stored as ISO strings (None when absent) and ``ingested_at`` records
    the current Dubai time at the moment the row is built.
    """
    sid = str(source_id or "")
    raw_title = title or ""
    start_iso = to_iso(start_dt)
    uid = make_uid([source, sid, raw_title, start_iso or ""])

    row: Dict = {}
    row["uid"] = uid
    row["source"] = source
    row["source_id"] = sid
    row["title"] = raw_title.strip()
    row["start"] = start_iso
    row["end"] = to_iso(end_dt)
    row["status"] = status
    row["url"] = url
    row["venue"] = venue
    row["address"] = address
    row["city"] = city
    row["country"] = country
    row["category"] = category
    row["organizer"] = organizer
    row["price"] = price
    row["image"] = image
    row["ingested_at"] = to_iso(now_dubai())
    return row

def keep_future_or_ongoing(row: Dict) -> bool:
    """Tell whether *row* describes an event that has not finished yet.

    A row is kept when its start is now or later, when its start falls on
    today's Dubai calendar date, or when its end is now or later. Rows with
    neither a parseable start nor a parseable end are rejected.
    """
    current = now_dubai()
    # aware_dt already maps missing/empty values to None.
    starts = aware_dt(row.get("start"))
    ends = aware_dt(row.get("end"))

    if starts is not None and (starts >= current or starts.date() == current.date()):
        return True
    return ends is not None and ends >= current

def dedupe(rows: List[Dict]) -> List[Dict]:
    """Drop rows whose ``uid`` was already seen, keeping the first occurrence.

    The relative order of the surviving rows is preserved.
    """
    first_by_uid: Dict = {}
    for record in rows:
        # setdefault leaves an existing entry alone, so the earliest row wins.
        first_by_uid.setdefault(record["uid"], record)
    return list(first_by_uid.values())

# Eventbrite 
def fetch_eventbrite_dubai(max_pages: Optional[int] = None) -> List[Dict]:
    """Fetch Dubai events from the Eventbrite search API as normalised rows.

    Pages are requested one after another (with a PAUSE-second delay between
    requests) until a page comes back empty, the response says there are no
    more items, a non-200 status is received, or *max_pages* is reached.

    Args:
        max_pages: Optional upper bound on the number of pages requested.
            ``None`` (the default) keeps the original unbounded behaviour.

    Returns:
        A list of rows built with norm_row; empty when EVENTBRITE_TOKEN is
        not set.

    Raises:
        requests.RequestException: on network failures or timeouts.
        ValueError: if a 200 response body is not valid JSON.
    """
    rows: List[Dict] = []
    if not EVENTBRITE_TOKEN:
        return rows

    # NOTE(review): Eventbrite has announced the retirement of the public
    # /v3/events/search/ endpoint - confirm it still answers. A non-200 is now
    # reported below instead of silently yielding zero rows.
    url = "https://www.eventbriteapi.com/v3/events/search/"
    base_params = {
        "q": "Dubai",
        "location.address": "Dubai",
        "expand": "venue,category,organizer,logo",
        "sort_by": "date",
    }
    headers = {"Authorization": f"Bearer {EVENTBRITE_TOKEN}"}
    address_keys = ["address_1", "address_2", "city", "region", "postal_code"]

    page = 1
    while max_pages is None or page <= max_pages:
        # Build a fresh params dict per request instead of mutating a shared one.
        r = requests.get(url, params={**base_params, "page": page},
                         headers=headers, timeout=REQ_TIMEOUT)
        if r.status_code != 200:
            print(f"Eventbrite: HTTP {r.status_code} on page {page}; stopping")
            break
        data = r.json()
        evs = data.get("events", [])
        if not evs:
            break

        for ev in evs:
            title = (ev.get("name") or {}).get("text")
            start = aware_dt((ev.get("start") or {}).get("utc"))
            end = aware_dt((ev.get("end") or {}).get("utc"))
            venue_obj = ev.get("venue") or {}
            venue = venue_obj.get("name")
            address = None
            addr = venue_obj.get("address")
            if addr:
                # Join only the address components that are actually present.
                address = ", ".join(addr[k] for k in address_keys if addr.get(k))
            category = (ev.get("category") or {}).get("name")
            organizer = (ev.get("organizer") or {}).get("name")
            image = (ev.get("logo") or {}).get("url")
            rows.append(norm_row("eventbrite", ev.get("id"), title, start, end,
                                 ev.get("url"), venue, address,
                                 category=category, organizer=organizer, image=image))

        # Eventbrite's paginated responses document a `pagination.has_more_items`
        # flag; only an explicit False ends the loop, so a response without the
        # flag behaves exactly as before (keep going until an empty page).
        if (data.get("pagination") or {}).get("has_more_items") is False:
            break
        page += 1
        time.sleep(PAUSE)
    return rows

# Visit Dubai 
def _visitdubai_date_range(date_text: str):
    """Turn card date text such as "28 Dec – 3 Jan 2026" into (start, end).

    Text that does not split into exactly two parts on a dash is parsed as a
    single start date with no end. For a two-part range:

    * a start that already carries a 4-digit year is parsed as written
      (appending a second year would make the parse fail);
    * otherwise the start borrows the end's explicit year, or the current
      Dubai year when the end has none;
    * if the end then precedes the start, the range wraps a year boundary
      and one side is shifted by a year (see the comments below).
    """
    year_pat = r"\b(?:19|20)\d{2}\b"
    parts = re.split(r"\s*[-–—]\s*", date_text)
    if len(parts) != 2:
        return aware_dt(date_text), None

    start_txt, end_txt = parts
    end_dt = aware_dt(end_txt)
    if re.search(year_pat, start_txt):
        return aware_dt(start_txt), end_dt

    today = now_dubai()
    end_has_year = bool(re.search(year_pat, end_txt))
    year = end_dt.year if (end_dt and end_has_year) else today.year
    start_dt = aware_dt(f"{start_txt} {year}")

    if start_dt and end_dt and end_dt < start_dt:
        if end_has_year or end_dt.date() >= today.date():
            # The end is pinned (explicit year) or still ahead of us, so the
            # event must have started in the previous year.
            start_dt = aware_dt(f"{start_txt} {end_dt.year - 1}") or start_dt
        else:
            # Year-less end that already passed this year: it belongs to the
            # next year (e.g. "28 Dec – 3 Jan" viewed in December).
            end_dt = aware_dt(f"{end_txt} {end_dt.year + 1}") or end_dt
    return start_dt, end_dt


def parse_visitdubai_card(card) -> Optional[Dict]:
    """Convert one Visit Dubai listing element into a normalised event row.

    Args:
        card: A BeautifulSoup tag - either a container holding a link, or the
            link (``<a href>``) itself.

    Returns:
        A row from norm_row (the event URL doubles as the source id, and the
        title falls back to "Event"), or None when no usable link is found.
    """
    from urllib.parse import urljoin  # local import keeps this fix self-contained

    # select_one() only searches descendants, so a card that *is* the anchor
    # has to be recognised explicitly or it would always be rejected.
    if card.name == "a" and card.get("href"):
        a = card
    else:
        a = card.select_one("a[href]")
    if a is None:
        return None
    href = (a.get("href") or "").strip()
    if not href:
        return None
    # urljoin leaves absolute URLs alone and also resolves protocol-relative
    # ("//host/...") and slash-less relative links against the site root.
    url = urljoin("https://www.visitdubai.com", href)

    title_el = card.select_one("h3, h2")
    title = title_el.get_text(strip=True) if title_el else None

    # Try the known date selectors in order; the first match wins.
    date_text = None
    for sel in [".card__date", ".event-card__date", ".c-card__date", ".date", "[data-test='date']"]:
        el = card.select_one(sel)
        if el:
            date_text = el.get_text(" ", strip=True)
            break

    start_dt = end_dt = None
    if date_text:
        start_dt, end_dt = _visitdubai_date_range(date_text)

    venue_el = card.select_one(".event-card__venue, .venue, .c-card__subtitle")
    venue = venue_el.get_text(" ", strip=True) if venue_el else None

    img_el = card.select_one("img[src], img[data-src]")
    image = (img_el.get("src") or img_el.get("data-src")) if img_el else None

    return norm_row("visitdubai", url, title or "Event", start_dt, end_dt, url, venue, None, image=image)

def fetch_visitdubai(max_pages: int = 5) -> List[Dict]:
    """Scrape up to *max_pages* pages of the Visit Dubai events listing.

    Pagination stops early when a page answers with a non-200 status (now
    reported instead of being swallowed) or when a page contains no candidate
    cards at all.

    Args:
        max_pages: Maximum number of listing pages to request.

    Returns:
        Rows produced by parse_visitdubai_card for every card it accepts.

    Raises:
        requests.RequestException: on network failures or timeouts.
    """
    rows: List[Dict] = []
    base = "https://www.visitdubai.com/en/whats-on/dubai-events"
    for page in range(1, max_pages + 1):
        url = base if page == 1 else f"{base}?page={page}"
        r = requests.get(url, headers=HEADERS, timeout=REQ_TIMEOUT)
        if r.status_code != 200:
            print(f"VisitDubai: HTTP {r.status_code} for {url}; stopping pagination")
            break
        soup = BeautifulSoup(r.text, "html.parser")
        # Prefer structured card containers; fall back to bare event links.
        cards = (soup.select("article, .event-card, .c-card, li a[href*='/en/whats-on/']")
                 or soup.select("a[href*='/en/whats-on/']"))
        if not cards:
            # An empty page means there is nothing further to paginate through.
            break
        for c in cards:
            row = parse_visitdubai_card(c)
            if row:
                rows.append(row)
        if page < max_pages:
            # Only pause when another request to this host will follow.
            time.sleep(PAUSE)
    return rows

# Platinumlist 
def _platinum_date_range(date_text: str):
    """Turn card date text such as "28 Dec – 3 Jan 2026" into (start, end).

    Text that does not split into exactly two parts on a dash is parsed as a
    single start date with no end. For a two-part range:

    * a start that already carries a 4-digit year is parsed as written
      (appending a second year would make the parse fail);
    * otherwise the start borrows the end's explicit year, or the current
      Dubai year when the end has none;
    * if the end then precedes the start, the range wraps a year boundary
      and one side is shifted by a year (see the comments below).
    """
    year_pat = r"\b(?:19|20)\d{2}\b"
    parts = re.split(r"\s*[-–—]\s*", date_text)
    if len(parts) != 2:
        return aware_dt(date_text), None

    start_txt, end_txt = parts
    end_dt = aware_dt(end_txt)
    if re.search(year_pat, start_txt):
        return aware_dt(start_txt), end_dt

    today = now_dubai()
    end_has_year = bool(re.search(year_pat, end_txt))
    year = end_dt.year if (end_dt and end_has_year) else today.year
    start_dt = aware_dt(f"{start_txt} {year}")

    if start_dt and end_dt and end_dt < start_dt:
        if end_has_year or end_dt.date() >= today.date():
            # The end is pinned (explicit year) or still ahead of us, so the
            # event must have started in the previous year.
            start_dt = aware_dt(f"{start_txt} {end_dt.year - 1}") or start_dt
        else:
            # Year-less end that already passed this year: it belongs to the
            # next year (e.g. "28 Dec – 3 Jan" viewed in December).
            end_dt = aware_dt(f"{end_txt} {end_dt.year + 1}") or end_dt
    return start_dt, end_dt


def parse_platinum_card(card) -> Optional[Dict]:
    """Convert one Platinumlist listing element into a normalised event row.

    Args:
        card: A BeautifulSoup tag - either a container holding a link, or the
            link (``<a href>``) itself.

    Returns:
        A row from norm_row (the event URL doubles as the source id, and the
        title falls back to "Event"), or None when no usable link is found.
    """
    from urllib.parse import urljoin  # local import keeps this fix self-contained

    # select_one() only searches descendants, so a card that *is* the anchor
    # has to be recognised explicitly or it would always be rejected.
    if card.name == "a" and card.get("href"):
        a = card
    else:
        a = card.select_one("a[href]")
    if a is None:
        return None
    href = (a.get("href") or "").strip()
    if not href:
        return None
    # urljoin leaves absolute URLs alone and also resolves protocol-relative
    # ("//host/...") and slash-less relative links against the site root.
    url = urljoin("https://dubai.platinumlist.net", href)

    title_el = card.select_one("h3, h2, .event-card-title, .title")
    title = title_el.get_text(" ", strip=True) if title_el else None

    date_el = card.select_one(".date, .event-card-date, time")
    date_text = date_el.get_text(" ", strip=True) if date_el else None

    start_dt = end_dt = None
    if date_text:
        start_dt, end_dt = _platinum_date_range(date_text)

    venue_el = card.select_one(".venue, .event-card-venue, [data-qa='venue-name']")
    venue = venue_el.get_text(" ", strip=True) if venue_el else None

    img_el = card.select_one("img[src], img[data-src]")
    image = (img_el.get("src") or img_el.get("data-src")) if img_el else None

    return norm_row("platinumlist", url, title or "Event", start_dt, end_dt, url, venue, None, image=image)

def fetch_platinumlist(max_pages: int = 5) -> List[Dict]:
    """Scrape up to *max_pages* pages of the Platinumlist Dubai events listing.

    Pagination stops early when a page answers with a non-200 status (now
    reported instead of being swallowed) or when a page contains no candidate
    cards at all.

    Args:
        max_pages: Maximum number of listing pages to request.

    Returns:
        Rows produced by parse_platinum_card for every card it accepts.

    Raises:
        requests.RequestException: on network failures or timeouts.
    """
    rows: List[Dict] = []
    base = "https://dubai.platinumlist.net/event"
    for page in range(1, max_pages + 1):
        url = base if page == 1 else f"{base}?page={page}"
        r = requests.get(url, headers=HEADERS, timeout=REQ_TIMEOUT)
        if r.status_code != 200:
            print(f"Platinumlist: HTTP {r.status_code} for {url}; stopping pagination")
            break
        soup = BeautifulSoup(r.text, "html.parser")
        # Prefer structured card containers; fall back to bare event links.
        cards = (soup.select("article, .event-card, .event-card--item, li a[href*='/event/']")
                 or soup.select("a[href*='/event/']"))
        if not cards:
            # An empty page means there is nothing further to paginate through.
            break
        for c in cards:
            row = parse_platinum_card(c)
            if row:
                rows.append(row)
        if page < max_pages:
            # Only pause when another request to this host will follow.
            time.sleep(PAUSE)
    return rows

#  Runner 
def run(csv_path: str = "dubai_events.csv", include_eventbrite: bool = True, pages: int = 5) -> pd.DataFrame:
    """Collect Dubai events from every source, write them to CSV and return them.

    Each source is fetched independently; a failure in one is printed and does
    not stop the others. The combined rows are de-duplicated by uid, reduced
    to upcoming/ongoing events, sorted by start time then title, and written
    to *csv_path* (UTF-8 with BOM, no index column).

    Args:
        csv_path: Destination of the CSV file.
        include_eventbrite: Query Eventbrite too (only if a token is set).
        pages: Number of listing pages to scrape per HTML source.

    Returns:
        The sorted DataFrame that was written to disk.
    """
    # (failure message, zero-argument fetcher) pairs, in execution order.
    jobs = []
    if include_eventbrite and EVENTBRITE_TOKEN:
        jobs.append(("Eventbrite fetch failed:", lambda: fetch_eventbrite_dubai()))
    jobs.append(("VisitDubai fetch failed:", lambda: fetch_visitdubai(max_pages=pages)))
    jobs.append(("Platinumlist fetch failed:", lambda: fetch_platinumlist(max_pages=pages)))

    collected: List[Dict] = []
    for failure_msg, fetch in jobs:
        try:
            collected.extend(fetch())
        except Exception as exc:
            print(failure_msg, exc)

    # De-dupe first, then keep only upcoming/ongoing events.
    upcoming = list(filter(keep_future_or_ongoing, dedupe(collected)))

    column_order = [
        "uid", "source", "source_id", "title", "start", "end", "status", "url", "venue",
        "address", "city", "country", "category", "organizer", "price", "image", "ingested_at",
    ]
    df = pd.DataFrame(upcoming, columns=column_order)

    def _sort_key(value):
        # Missing or unparseable start values become None.
        try:
            if pd.notna(value):
                return dtparse.parse(value)
            return None
        except Exception:
            return None

    # The helper column exists only for ordering and is dropped again.
    df = df.assign(start_dt_sort=df["start"].apply(_sort_key))
    df = df.sort_values(by=["start_dt_sort", "title"], ascending=[True, True])
    df = df.drop(columns=["start_dt_sort"])

    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"Saved {len(df)} events to {csv_path}")
    return df
