<a href="https://colab.research.google.com/github/Annettteee/annette-colab-projects/blob/main/Predicting_Architecture_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re, time, csv
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

BASE = "https://buildingsdb.com/"
HEADERS = {
    "User-Agent": "AnnetteCapstoneBot/1.0 (academic research; contact: your_email@example.com)"
}
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

OUT_DIR = Path("data_html"); OUT_DIR.mkdir(exist_ok=True, parents=True)

def get(url, sleep=1.0):
    """GET with basic retry + polite delay."""
    time.sleep(sleep)
    for attempt in range(3):
        r = SESSION.get(url, timeout=20)
        if r.ok: return r
        time.sleep(2*(attempt+1))
    r.raise_for_status()

def parse_state_links(home_html):
    """From homepage, gather state index URLs like /GA/, /NY/ ..."""
    soup = BeautifulSoup(home_html, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # state pages are like https://buildingsdb.com/GA/
        if re.fullmatch(r"https?://buildingsdb\.com/[A-Z]{2}/", urljoin(BASE, href)):
            links.append(urljoin(BASE, href))
    return sorted(set(links))

def extract_building_links(state_url):
    """Collect building detail URLs from a state page by finding city/building paths."""
    res = get(state_url)
    soup = BeautifulSoup(res.text, "html.parser")
    bldg_urls = set()
    for a in soup.find_all("a", href=True):
        url = urljoin(state_url, a["href"])
        parsed = urlparse(url)
        # Expect /STATE/city/slug/ (at least 4 segments)
        segments = [s for s in parsed.path.split("/") if s]
        if len(segments) >= 3 and re.fullmatch(r"[A-Z]{2}", segments[0]):
            bldg_urls.add(url)
    return sorted(bldg_urls)

def scrape_building(burl):
    """Parse a building page and extract structured facts with multiple fallback strategies."""
    res = get(burl, sleep=0.8)
    OUT_DIR.joinpath(f"{re.sub(r'[^a-zA-Z0-9]+','_', burl)}.html").write_text(res.text, encoding="utf-8")
    soup = BeautifulSoup(res.text, "html.parser")

    def textnorm(s):
        return re.sub(r"\s+", " ", s or "").strip()

    data = {
        "source_url": burl,
        "name": None, "state": None, "city": None,
        "architect_firm": None, "year_completed": None,
        "typology": None, "main_use": None, "style": None,
        "height_ft": None, "floors": None, "address": None, "aka_names": None
    }

    # name (usually the main H1)
    h1 = soup.find(["h1","h2"])
    if h1: data["name"] = textnorm(h1.get_text())

    # state & city from URL path
    segs = [s for s in urlparse(burl).path.split("/") if s]
    if len(segs) >= 3:
        data["state"], data["city"] = segs[0], segs[1]

    # quick helpers
    page_text = textnorm(soup.get_text(" | "))

    # heuristic field extractors
    def find_labeled_value(label_patterns):
        for pat in label_patterns:
            m = re.search(pat, page_text, flags=re.IGNORECASE)
            if m:
                return textnorm(m.group(1))
        return None

    data["architect_firm"] = find_labeled_value([
        r"Architect\s*\|\s*([^|]+)",           # "Architect | Rabun Hogan ..."
        r"Architect(?:s)?\s*[:\-]\s*([^|]+)"
    ])

    data["year_completed"] = find_labeled_value([
        r"Year\s*\|\s*([0-9]{4})",
        r"completed(?: in)?\s*([12][0-9]{3})"
    ])

    data["typology"] = find_labeled_value([
        r"Typology\s*\|\s*([^|]+)"
    ])

    data["main_use"] = find_labeled_value([
        r"Main use\s*\|\s*([^|]+)"
    ])

    data["style"] = find_labeled_value([
        r"Style\s*\|\s*([^|]+)",
        r"categorized as an?\s*([A-Za-z\-\s]+)\s*building"
    ])

    # height/floors often appear as "610ft" / "53 floors"
    h = re.search(r"([0-9][0-9,]*)\s*ft", page_text, flags=re.I)
    if h: data["height_ft"] = h.group(1).replace(",", "")
    f = re.search(r"([0-9]{1,3})\s*floors?", page_text, flags=re.I)
    if f: data["floors"] = f.group(1)

    # address / aka names (appear on some detail pages)
    addr = re.search(r"(?:address is|street address is)\s*([^|]+?)(?:\||\.)", page_text, flags=re.I)
    if addr: data["address"] = textnorm(addr.group(1))

    aka = re.search(r"also known.*?as,\s*(.+?)(?:\.\s|$)", page_text, flags=re.I)
    if aka: data["aka_names"] = textnorm(aka.group(1))

    return data

def main(sample_only=True, limit_states=5, limit_buildings_per_state=50, outfile="buildingsdb_sample.csv"):
    home = get(BASE).text
    state_links = parse_state_links(home)
    if sample_only:
        state_links = state_links[:limit_states]

    seen = set()
    rows = []

    for s_url in state_links:
        bldg_links = extract_building_links(s_url)[:limit_buildings_per_state]
        for burl in bldg_links:
            if burl in seen: continue
            seen.add(burl)
            try:
                row = scrape_building(burl)
                rows.append(row)
                print(f"OK: {row.get('name')} | {burl}")
            except Exception as e:
                print(f"ERR: {burl} -> {e}")

    # write CSV
    cols = ["name","state","city","architect_firm","year_completed","typology","main_use",
            "style","height_ft","floors","address","aka_names","source_url"]
    with open(outfile, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for r in rows: w.writerow(r)

if __name__ == "__main__":
    # start small; later set sample_only=False for full crawl
    main(sample_only=True)


OK: Regions Center | https://buildingsdb.com/AL/birmingham/regions-center/
OK: RSA Battle House Tower | https://buildingsdb.com/AL/mobile/rsa-battle-house-tower/
OK: Simmons Tower | https://buildingsdb.com/AR/little-rock/simmons-tower/
OK: Chase Tower | https://buildingsdb.com/AZ/phoenix/chase-tower/
OK: 5900 Wilshire Building | https://buildingsdb.com/CA/los-angeles/5900-wilshire-building/
OK: 611 Place Building | https://buildingsdb.com/CA/los-angeles/611-place-building/
OK: 777 Tower | https://buildingsdb.com/CA/los-angeles/777-tower/
OK: Aon Center | https://buildingsdb.com/CA/los-angeles/aon-center/
OK: AT&T Center | https://buildingsdb.com/CA/los-angeles/att-center/
OK: Bank of America Plaza Building | https://buildingsdb.com/CA/los-angeles/bank-of-america-plaza-building/
OK: Bullocks Wilshire Building | https://buildingsdb.com/CA/los-angeles/bullocks-wilshire-building/
OK: Century Plaza Towers | https://buildingsdb.com/CA/los-angeles/century-plaza-towers/
OK: City National Plaza

In [None]:
"""
Scrape ALL buildings from BuildingsDB into CSV.

What you get per row (best-effort, many fields are optional):
- name, state, city, architect_firm, year_completed, typology, main_use, style,
  height_ft, floors, address, source_url

Usage:
  python scrape_buildingsdb_all.py

Tip:
  Start once, inspect CSV, then rerun (it skips already-seen URLs).
"""

import csv
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE = "https://buildingsdb.com/"
OUT_DIR = Path("data_buildingsdb")
HTML_DIR = OUT_DIR / "html"
OUT_DIR.mkdir(parents=True, exist_ok=True)
HTML_DIR.mkdir(parents=True, exist_ok=True)
CSV_PATH = OUT_DIR / "buildingsdb_all.csv"

HEADERS = {
    "User-Agent": "AnnetteCapstone/1.0 (academic research; contact: youremail@school.edu)"
}

def make_session():
    s = requests.Session()
    s.headers.update(HEADERS)
    retry = Retry(
        total=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"]
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://", HTTPAdapter(max_retries=retry))
    return s

SESSION = make_session()

def get(url, sleep=0.6):
    # Polite crawling: small delay between requests
    time.sleep(sleep)
    r = SESSION.get(url, timeout=25)
    r.raise_for_status()
    return r

def save_html(url, text):
    safe = re.sub(r"[^a-zA-Z0-9]+", "_", url.strip("/"))
    p = HTML_DIR / f"{safe}.html"
    p.write_text(text, encoding="utf-8")
    return p

def parse_state_links(home_html):
    """Collect state pages like https://buildingsdb.com/NY/ from the homepage."""
    soup = BeautifulSoup(home_html, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        href = urljoin(BASE, a["href"])
        # Match /XX/ where XX are uppercase letters (states) — we exclude /US/
        if re.fullmatch(r"https?://buildingsdb\.com/[A-Z]{2}/", href):
            links.append(href)
    return sorted(set(links))

def is_building_url(url):
    """True for /STATE/city/building-slug/ and False for state/city indexes."""
    try:
        parsed = urlparse(url)
        if parsed.netloc != "buildingsdb.com":
            return False
        segs = [s for s in parsed.path.split("/") if s]
        # Expect exactly: STATE / city / slug (3+ segments OK; first must be a 2-letter state)
        return len(segs) >= 3 and re.fullmatch(r"[A-Z]{2}", segs[0])
    except Exception:
        return False

def collect_building_links_from_state(state_url):
    """Find every building link on a given state page."""
    res = get(state_url)
    soup = BeautifulSoup(res.text, "html.parser")
    bldg_urls = set()

    for a in soup.find_all("a", href=True):
        url = urljoin(state_url, a["href"])
        if is_building_url(url):
            bldg_urls.add(url)

    # Save HTML for reproducibility
    save_html(state_url, res.text)
    print(f"[state] {state_url} -> {len(bldg_urls)} building links")
    return sorted(bldg_urls)

# ---------- Field extraction helpers ----------

def _norm(s):
    return re.sub(r"\s+", " ", (s or "").strip())

def extract_from_state_card_text(text):
    """
    A state page's building 'card' text often looks like:
    'GLC Grand Building Architect Rabun Hogan Ota Rasche Architects Year 1992
     Typology Skyscraper Main use Hotel Style Art-deco'

    We try to pull these fields when available.
    """
    t = _norm(text)
    data = {}
    m = re.search(r"Architect\s+(.+?)\s+Year\s+(\d{4})\s+Typology\s+(.+?)\s+Main use\s+(.+?)\s+Style\s+(.+)$", t, flags=re.I)
    if m:
        data["architect_firm"] = _norm(m.group(1))
        data["year_completed"] = _norm(m.group(2))
        data["typology"] = _norm(m.group(3))
        data["main_use"] = _norm(m.group(4))
        data["style"] = _norm(m.group(5))
    else:
        # Try partials
        y = re.search(r"\bYear\s+(\d{4})\b", t, flags=re.I)
        if y: data["year_completed"] = y.group(1)
        arch = re.search(r"\bArchitect\s+(.+?)(?:\s+Year|\s*$)", t, flags=re.I)
        if arch: data["architect_firm"] = _norm(arch.group(1))
        typ = re.search(r"\bTypology\s+([A-Za-z\-\s]+)", t, flags=re.I)
        if typ: data["typology"] = _norm(typ.group(1))
        use = re.search(r"\bMain use\s+([A-Za-z\-\s]+)", t, flags=re.I)
        if use: data["main_use"] = _norm(use.group(1))
        sty = re.search(r"\bStyle\s+([A-Za-z\-\s]+)", t, flags=re.I)
        if sty: data["style"] = _norm(sty.group(1))
    return data

def extract_from_building_page(html, burl):
    """
    Parse a building detail page for name, city/state, architect, year(s), style tags, height, floors, address.
    """
    soup = BeautifulSoup(html, "html.parser")
    data = {
        "source_url": burl,
        "name": None, "state": None, "city": None,
        "architect_firm": None, "year_completed": None,
        "typology": None, "main_use": None, "style": None,
        "height_ft": None, "floors": None, "address": None
    }

    # Name
    h1 = soup.find(["h1", "h2"])
    if h1:
        data["name"] = _norm(h1.get_text())

    # State/City from URL
    segs = [s for s in urlparse(burl).path.split("/") if s]
    if len(segs) >= 3:
        data["state"], data["city"] = segs[0], segs[1]

    page_text = _norm(soup.get_text(" | "))

    # Architect patterns (common on many pages)
    m = re.search(r"designed by\s+([^,|]+)", page_text, flags=re.I)
    if m:
        data["architect_firm"] = _norm(m.group(1))

    # Year from narrative or timeline
    m = re.search(r"built between\s+(\d{4})\s+and\s+(\d{4})", page_text, flags=re.I)
    if m:
        data["year_completed"] = m.group(2)
    else:
        m = re.search(r"built in\s+(\d{4})", page_text, flags=re.I)
        if m:
            data["year_completed"] = m.group(1)
        else:
            m = re.search(r"Construction completed\s+(\d{4})", page_text, flags=re.I)
            if m:
                data["year_completed"] = m.group(1)

    # Style & typology hints from narrative (“Art-deco skyscraper” etc.)
    m = re.search(r"\b([A-Za-z\- ]+)\s+skyscraper\b", page_text, flags=re.I)
    if m:
        data["style"] = _norm(m.group(1))
        data["typology"] = "Skyscraper"

    # Height & floors (best-effort)
    h = re.search(r"([0-9][0-9,]*)\s*ft", page_text, flags=re.I)
    if h: data["height_ft"] = h.group(1).replace(",", "")
    f = re.search(r"([0-9]{1,3})\s+floors?", page_text, flags=re.I)
    if f: data["floors"] = f.group(1)

    # Address (some pages state “Its precise street address is …”)
    addr = re.search(r"(?:precise street address is|address is)\s+([^|\.]+)", page_text, flags=re.I)
    if addr: data["address"] = _norm(addr.group(1))

    return data

def scrape_building(burl):
    """Download and parse a single building page; merge with any info from its state card (if available later)."""
    res = get(burl, sleep=0.5)
    save_html(burl, res.text)
    return extract_from_building_page(res.text, burl)

def merge_dict_priority(a, b):
    """Fill missing fields in 'a' with values from 'b'."""
    out = dict(a)
    for k, v in (b or {}).items():
        if (out.get(k) is None or out.get(k) == "") and v:
            out[k] = v
    return out

def crawl_all():
    # Discover all states
    home = get(BASE).text
    save_html(BASE, home)
    state_links = parse_state_links(home)
    if not state_links:
        raise RuntimeError("No state links found on homepage. Site structure may have changed.")

    seen_urls = set()
    rows = []

    # If CSV exists, load existing URLs to avoid re-scraping on resume
    if CSV_PATH.exists():
        with open(CSV_PATH, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                if row.get("source_url"):
                    seen_urls.add(row["source_url"])

    for s_url in state_links:
        # Collect building URLs from the state page
        bldg_urls = collect_building_links_from_state(s_url)
        print(f" [discover] {s_url} -> {len(bldg_urls)} buildings")

        # Build a map of state-card facts (name + metadata) keyed by URL
        state_card_facts = {}

        # Build a quick index by scanning anchor text on the state page
        res = get(s_url)
        soup = BeautifulSoup(res.text, "html.parser")
        for a in soup.find_all("a", href=True):
            url = urljoin(s_url, a["href"])
            if url in bldg_urls:
                facts = extract_from_state_card_text(a.get_text(" "))
                if facts:
                    state_card_facts[url] = facts

        # Scrape each building page
        for burl in bldg_urls:
            if burl in seen_urls:
                continue
            try:
                detail = scrape_building(burl)
                merged = merge_dict_priority(detail, state_card_facts.get(burl))
                rows.append(merged)
                seen_urls.add(burl)
                print(f"   ✓ {merged.get('name') or burl}")
            except Exception as e:
                print(f"   ✗ ERROR {burl}: {e}")

        # Write incrementally after each state
        write_csv(rows, append=True)
        rows.clear()

    print("Done. CSV at:", CSV_PATH)

def write_csv(new_rows, append=False):
    cols = ["name","state","city","architect_firm","year_completed","typology","main_use",
            "style","height_ft","floors","address","source_url"]
    mode = "a" if append and CSV_PATH.exists() else "w"
    with open(CSV_PATH, mode, newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        if mode == "w":
            w.writeheader()
        for r in new_rows:
            w.writerow({k: r.get(k, "") for k in cols})

if __name__ == "__main__":
    crawl_all()


[state] https://buildingsdb.com/AL/ -> 2 building links
 [discover] https://buildingsdb.com/AL/ -> 2 buildings
   ✓ Regions Center
   ✓ RSA Battle House Tower
[state] https://buildingsdb.com/AR/ -> 1 building links
 [discover] https://buildingsdb.com/AR/ -> 1 buildings
   ✓ Simmons Tower
[state] https://buildingsdb.com/AZ/ -> 1 building links
 [discover] https://buildingsdb.com/AZ/ -> 1 buildings
   ✓ Chase Tower
[state] https://buildingsdb.com/CA/ -> 72 building links
 [discover] https://buildingsdb.com/CA/ -> 72 buildings
   ✓ 5900 Wilshire Building
   ✓ 611 Place Building
   ✓ 777 Tower
   ✓ Aon Center
   ✓ AT&T Center
   ✓ Bank of America Plaza Building
   ✓ Bullocks Wilshire Building
   ✓ Century Plaza Towers
   ✓ City National Plaza
   ✓ DominguezâWilshire Building
   ✓ E. Clem Wilson Building
   ✓ Eastern Columbia Building
   ✓ Equitable Life Building
   ✓ Figueroa at Wilshire Building
   ✓ FourFortyFour South Flower Building
   ✓ Garfield Building
   ✓ Gas Company Tower
   ✓ 

new website

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Scrape building pages from The Skyscraper Center (CTBUH) starting from user-provided seed URLs.

Usage:
  python scrape_ctbuh_us.py --seeds seeds.txt --out ctbuh_us.csv

seeds.txt can contain lines like:
  https://www.skyscrapercenter.com/company/3393            # company page (Weber Thompson)
  https://www.skyscrapercenter.com/city/new-york-city      # example city page (adjust)
  https://www.skyscrapercenter.com/country/united-states   # country hub (if available)
  https://www.skyscrapercenter.com/buildings?country=US    # explore results (if the site uses this pattern)
  https://www.skyscrapercenter.com/search?q=chicago        # search results (if present)

Notes:
- This script does NOT guess URL patterns; it just follows whatever links are present on seed pages.
- It only fetches pages on the domain `www.skyscrapercenter.com`.
- It extracts fields best-effort with multiple fallbacks and keeps the raw HTML for auditability.
"""

import argparse
import csv
import re
import sys
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_HOST = "www.skyscrapercenter.com"
UA = {
    "User-Agent": "Annette-Capstone/1.0 (academic research; contact: your_email@example.edu)"
}

def make_session():
    s = requests.Session()
    s.headers.update(UA)
    retry = Retry(
        total=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"],
        raise_on_status=False,
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://", HTTPAdapter(max_retries=retry))
    return s

SESSION = make_session()

def polite_get(url, sleep=0.8, timeout=25):
    time.sleep(sleep)  # be polite
    r = SESSION.get(url, timeout=timeout)
    r.raise_for_status()
    return r

# ----------------------- Utilities -----------------------

def safe_path(s: str) -> str:
    return re.sub(r"[^a-zA-Z0-9_.-]+", "_", s).strip("_")

def is_same_host(url: str) -> bool:
    try:
        return urlparse(url).netloc.lower() == BASE_HOST
    except Exception:
        return False

def is_building_url(url: str) -> bool:
    # Building pages typically contain "/building/" in the path.
    try:
        u = urlparse(url)
        return u.netloc.lower() == BASE_HOST and "/building/" in u.path
    except Exception:
        return False

def normalize_ws(s: str | None) -> str | None:
    if not s: return None
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

def to_int(s: str | None) -> str | None:
    if not s: return None
    m = re.search(r"-?\d+", s.replace(",", ""))
    return m.group(0) if m else None

def to_float(s: str | None) -> str | None:
    if not s: return None
    m = re.search(r"-?\d+(?:\.\d+)?", s.replace(",", ""))
    return m.group(0) if m else None

# ----------------------- Link discovery -----------------------

def discover_building_links_from_seed(seed_url: str, html: str) -> set[str]:
    """Find building links on a generic seed page (company/city/results/etc.)."""
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        abs_url = urljoin(seed_url, href)
        if is_building_url(abs_url):
            links.add(abs_url)
    return links

# ----------------------- Building parsing -----------------------

def parse_building_page(url: str, html: str) -> dict:
    """
    Extracts fields best-effort from a building page.
    We try multiple strategies because CTBUH pages vary by template.
    """
    soup = BeautifulSoup(html, "html.parser")
    text_all = normalize_ws(soup.get_text(" | "))

    data = {
        "source": "ctbuh",
        "source_url": url,
        "name": None,
        "country": None,
        "state": None,
        "city": None,
        "year_completed": None,
        "height_m": None,
        "height_ft": None,
        "floors": None,
        "function": None,
        "material": None,
        "architect_firm": None,
        "developer": None,
        "contractor": None,
        "structural_engineer": None,
        "mep_engineer": None,
        "latitude": None,
        "longitude": None,
    }

    # --- Name (H1/H2) ---
    h = soup.find(["h1", "h2"])
    if h:
        data["name"] = normalize_ws(h.get_text())

    # --- Location block (breadcrumbs or labeled fields) ---
    # Many pages have breadcrumbs like Country > City, sometimes state is shown in parentheses.
    # Try some label-based extraction too.
    # Country / City:
    # Heuristic: look for a table or info block with "City", "Country", sometimes "Location"
    labels = {
        "city": r"(?:\bCity\b|\blocation\b).*?:\s*([^|]+?)(?:\||$)",
        "country": r"(?:\bCountry\b).*?:\s*([^|]+?)(?:\||$)",
    }
    for key, pat in labels.items():
        m = re.search(pat, text_all, flags=re.I)
        if m:
            data[key] = normalize_ws(m.group(1))

    # Try to parse "City, State, Country" in a single line if present
    if not data["country"] or not data["city"]:
        m = re.search(r"\b([A-Za-z .'-]+),\s*([A-Za-z .'-]+),\s*([A-Za-z .'-]+)\b", text_all)
        if m:
            data["city"] = data["city"] or normalize_ws(m.group(1))
            data["state"] = data["state"] or normalize_ws(m.group(2))
            data["country"] = data["country"] or normalize_ws(m.group(3))

    # --- Year completed (various phrasings) ---
    for pat in [
        r"\bCompletion(?: year)?\s*[:|]\s*(\d{4})\b",
        r"\bCompleted(?: in)?\s*(\d{4})\b",
        r"\bYear\s*(\d{4})\b",
    ]:
        m = re.search(pat, text_all, flags=re.I)
        if m:
            data["year_completed"] = m.group(1)
            break

    # --- Height (m/ft) ---
    # Look for patterns like "xxx m / yyy ft"
    m = re.search(r"\b(\d{2,4}(?:\.\d+)?)\s*m\s*/\s*(\d{2,4}(?:\.\d+)?)\s*ft\b", text_all, flags=re.I)
    if m:
        data["height_m"] = m.group(1)
        data["height_ft"] = m.group(2)
    else:
        # Fallback: find "Height: xxx m" or "xxx ft"
        m1 = re.search(r"\bHeight\b[^0-9]*(\d{2,4}(?:\.\d+)?)\s*m\b", text_all, flags=re.I)
        m2 = re.search(r"\bHeight\b[^0-9]*(\d{2,4}(?:\.\d+)?)\s*ft\b", text_all, flags=re.I)
        if m1: data["height_m"] = m1.group(1)
        if m2: data["height_ft"] = m2.group(1)

    # --- Floors ---
    m = re.search(r"\b(\d{1,3})\s+floors?\b", text_all, flags=re.I)
    if m: data["floors"] = m.group(1)

    # --- Function / material (label-based heuristics) ---
    m = re.search(r"\bFunction\b\s*[:|]\s*([^|]+)", text_all, flags=re.I)
    if m: data["function"] = normalize_ws(m.group(1))
    m = re.search(r"\bMaterial\b\s*[:|]\s*([^|]+)", text_all, flags=re.I)
    if m: data["material"] = normalize_ws(m.group(1))

    # --- Companies / firms (if listed) ---
    for key, label in [
        ("architect_firm", r"\bArchitect\b\s*[:|]\s*([^|]+)"),
        ("developer", r"\bDeveloper\b\s*[:|]\s*([^|]+)"),
        ("contractor", r"\bContractor\b\s*[:|]\s*([^|]+)"),
        ("structural_engineer", r"\bStructural Engineer\b\s*[:|]\s*([^|]+)"),
        ("mep_engineer", r"\bMEP Engineer\b\s*[:|]\s*([^|]+)"),
    ]:
        m = re.search(label, text_all, flags=re.I)
        if m:
            data[key] = normalize_ws(m.group(1))

    # --- Coordinates (if present anywhere) ---
    # Look for patterns like "Latitude: xx.xxxx | Longitude: yy.yyyy" or "(xx.x, yy.y)"
    m = re.search(r"Latitude\s*[:]\s*([+-]?\d+(?:\.\d+)?)\b.*?Longitude\s*[:]\s*([+-]?\d+(?:\.\d+)?)\b", text_all, flags=re.I)
    if m:
        data["latitude"], data["longitude"] = m.group(1), m.group(2)
    else:
        m = re.search(r"\(([+-]?\d+(?:\.\d+)?),\s*([+-]?\d+(?:\.\d+)?)\)", text_all)
        if m:
            data["latitude"], data["longitude"] = m.group(1), m.group(2)

    # Normalize numerics to clean strings
    data["year_completed"] = to_int(data["year_completed"])
    data["floors"] = to_int(data["floors"])
    data["height_m"] = to_float(data["height_m"])
    data["height_ft"] = to_float(data["height_ft"])

    return data

# ----------------------- Main crawling logic -----------------------

def read_seeds(path: Path) -> list[str]:
    seeds = []
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        seeds.append(line)
    return seeds

def crawl(seeds: list[str], out_csv: Path, out_dir: Path, resume: bool = True):
    out_dir.mkdir(parents=True, exist_ok=True)
    html_dir = out_dir / "html"
    html_dir.mkdir(exist_ok=True)

    # For resuming, collect already-scraped building URLs
    seen_buildings = set()
    if resume and out_csv.exists():
        with out_csv.open(newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                if row.get("source_url"):
                    seen_buildings.add(row["source_url"])

    # CSV setup
    fieldnames = [
        "source","source_url","name","country","state","city",
        "year_completed","height_m","height_ft","floors","function","material",
        "architect_firm","developer","contractor","structural_engineer","mep_engineer",
        "latitude","longitude"
    ]
    mode = "a" if out_csv.exists() else "w"
    out_f = out_csv.open(mode, newline="", encoding="utf-8")
    writer = csv.DictWriter(out_f, fieldnames=fieldnames)
    if mode == "w":
        writer.writeheader()

    try:
        # 1) Gather building links from all seeds
        discovered = set()
        for seed in seeds:
            if not is_same_host(seed):
                print(f"[skip external] {seed}")
                continue
            try:
                r = polite_get(seed)
            except Exception as e:
                print(f"[seed error] {seed}: {e}")
                continue

            seed_html = r.text
            (html_dir / f"seed_{safe_path(seed)}.html").write_text(seed_html, encoding="utf-8")
            links = discover_building_links_from_seed(seed, seed_html)
            print(f"[seed] {seed} -> {len(links)} building links")
            discovered.update(links)

        # 2) Visit each building page
        total = len(discovered)
        print(f"[discover] total unique building links: {total}")

        count = 0
        for burl in sorted(discovered):
            count += 1
            if burl in seen_buildings:
                # already scraped
                continue
            try:
                r = polite_get(burl)
            except Exception as e:
                print(f"[building error] {burl}: {e}")
                continue

            html = r.text
            (html_dir / f"b_{safe_path(burl)}.html").write_text(html, encoding="utf-8")
            data = parse_building_page(burl, html)
            writer.writerow(data)
            out_f.flush()
            print(f"[{count}/{total}] ✓ {data.get('name') or burl}")

    finally:
        out_f.close()

# ----------------------- CLI -----------------------

# Modified main function to run within Colab
def main(seeds_file="seeds.txt", out_csv_file="ctbuh_us.csv", out_dir_name="ctbuh_data", no_resume=False):
    seeds_path = Path(seeds_file)
    out_csv = Path(out_csv_file)
    out_dir = Path(out_dir_name)

    # Create a dummy seeds file if it doesn't exist
    if not seeds_path.exists():
        seeds_path.write_text("https://www.skyscrapercenter.com/city/new-york-city")
        print(f"Created dummy seeds file: {seeds_path}")


    seeds = read_seeds(seeds_path)
    if not seeds:
        print("No seeds provided.", file=sys.stderr)
        sys.exit(1)

    crawl(seeds, out_csv, out_dir, resume=(not no_resume))

if __name__ == "__main__":
    # Start the crawl with default or specified parameters
    main()

Created dummy seeds file: seeds.txt
[seed] https://www.skyscrapercenter.com/city/new-york-city -> 11 building links
[discover] total unique building links: 11
[1/11] ✓ 111 West 57th Street
[2/11] ✓ 3 World Trade Center
[3/11] ✓ 30 Hudson Yards
[4/11] ✓ 432 Park Avenue
[5/11] ✓ Bank of America Tower
[6/11] ✓ Central Park Tower
[7/11] ✓ Empire State Building
[8/11] ✓ JPMorgan Chase World Headquarters
[9/11] ✓ Metropolitan Life Tower
[10/11] ✓ One Vanderbilt Avenue
[11/11] ✓ One World Trade Center


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv, re, time, sys
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE = "https://www.skyscrapercenter.com"
COUNTRY_URL = f"{BASE}/country/united-states"  # entry point for U.S. cities
UA = {"User-Agent": "Annette-Capstone/1.0 (academic research; contact: your_email@school.edu)"}

def session():
    s = requests.Session()
    s.headers.update(UA)
    retry = Retry(total=5, backoff_factor=0.8, status_forcelist=[429,500,502,503,504],
                  allowed_methods=["GET","HEAD","OPTIONS"], raise_on_status=False)
    s.mount("https://", HTTPAdapter(max_retries=retry))
    return s

S = session()

def get(url, sleep=1.0, timeout=25):
    time.sleep(sleep)  # politeness
    r = S.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

def norm(s):
    if not s: return None
    return re.sub(r"\s+", " ", s).strip()

def to_num(s, kind=int):
    if not s: return None
    m = re.search(r"-?\d+(?:\.\d+)?", s.replace(",", ""))
    if not m: return None
    return m.group(0)

def is_building_url(u):
    try:
        p = urlparse(u)
        return p.netloc.lower() == "www.skyscrapercenter.com" and "/building/" in p.path
    except:
        return False

def safe_name(u):
    return re.sub(r"[^a-zA-Z0-9_.-]+","_",u)[:180]

# ---------------- city discovery ----------------

def get_us_city_urls():
    """Collect all /city/... links from the US country hub."""
    html = get(COUNTRY_URL)
    soup = BeautifulSoup(html, "html.parser")
    cities = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        absu = urljoin(COUNTRY_URL, href)
        if urlparse(absu).netloc == "www.skyscrapercenter.com" and "/city/" in urlparse(absu).path:
            cities.add(absu)
    # Fallback: also grab from the global Cities index and filter by “United States” in-row text
    try:
        cities_html = get(f"{BASE}/cities")
        sp = BeautifulSoup(cities_html, "html.parser")
        for row in sp.find_all(["tr","li","div"]):
            txt = row.get_text(" ", strip=True)
            if "United States" in txt:
                for a in row.find_all("a", href=True):
                    cu = urljoin(f"{BASE}/cities", a["href"])
                    if "/city/" in cu:
                        cities.add(cu)
    except Exception:
        pass
    return sorted(cities)

# ------------- building discovery on a city page -------------

def discover_building_links_from_city(city_url):
    html = get(city_url)
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        absu = urljoin(city_url, a["href"])
        if is_building_url(absu):
            links.add(absu)
    return links, html  # return html so we can cache

# ------------- building detail parsing -------------

def parse_building(url, html):
    soup = BeautifulSoup(html, "html.parser")
    whole = norm(soup.get_text(" | "))

    out = {
        "source":"ctbuh","source_url":url,
        "name":None,"country":None,"state":None,"city":None,
        "year_completed":None,"height_m":None,"height_ft":None,"floors":None,
        "function":None,"material":None,
        "architect_firm":None,"developer":None,"contractor":None,
        "structural_engineer":None,"mep_engineer":None,
        "latitude":None,"longitude":None
    }

    # name
    h = soup.find(["h1","h2"])
    if h: out["name"] = norm(h.get_text())

    # location (label heuristics + generic pattern)
    for key, pat in {
        "city": r"\bCity\b\s*[:|]\s*([^|]+)",
        "country": r"\bCountry\b\s*[:|]\s*([^|]+)",
        "state": r"\bState\b\s*[:|]\s*([^|]+)",
    }.items():
        m = re.search(pat, whole, flags=re.I)
        if m: out[key] = norm(m.group(1))

    if not out["city"] or not out["country"]:
        m = re.search(r"\b([A-Za-z .'-]+),\s*([A-Za-z .'-]+),\s*([A-Za-z .'-]+)\b", whole)
        if m:
            out["city"] = out["city"] or norm(m.group(1))
            out["state"] = out["state"] or norm(m.group(2))
            out["country"] = out["country"] or norm(m.group(3))

    # year
    for pat in [r"\bCompletion(?: year)?\s*[:|]\s*(\d{4})\b", r"\bCompleted(?: in)?\s*(\d{4})\b", r"\bYear\s*(\d{4})\b"]:
        m = re.search(pat, whole, flags=re.I)
        if m: out["year_completed"] = to_num(m.group(1)); break

    # height (m / ft)
    m = re.search(r"\b(\d{2,4}(?:\.\d+)?)\s*m\s*/\s*(\d{2,4}(?:\.\d+)?)\s*ft\b", whole, flags=re.I)
    if m:
        out["height_m"], out["height_ft"] = m.group(1), m.group(2)
    else:
        m1 = re.search(r"\bHeight\b[^0-9]*(\d{2,4}(?:\.\d+)?)\s*m\b", whole, flags=re.I)
        m2 = re.search(r"\bHeight\b[^0-9]*(\d{2,4}(?:\.\d+)?)\s*ft\b", whole, flags=re.I)
        if m1: out["height_m"] = m1.group(1)
        if m2: out["height_ft"] = m2.group(1)

    # floors
    m = re.search(r"\b(\d{1,3})\s+floors?\b", whole, flags=re.I)
    if m: out["floors"] = to_num(m.group(0))

    # function/material
    m = re.search(r"\bFunction\b\s*[:|]\s*([^|]+)", whole, flags=re.I)
    if m: out["function"] = norm(m.group(1))
    m = re.search(r"\bMaterial\b\s*[:|]\s*([^|]+)", whole, flags=re.I)
    if m: out["material"] = norm(m.group(1))

    # firms
    for key, lab in [
        ("architect_firm", r"\bArchitect\b\s*[:|]\s*([^|]+)"),
        ("developer", r"\bDeveloper\b\s*[:|]\s*([^|]+)"),
        ("contractor", r"\bContractor\b\s*[:|]\s*([^|]+)"),
        ("structural_engineer", r"\bStructural Engineer\b\s*[:|]\s*([^|]+)"),
        ("mep_engineer", r"\bMEP Engineer\b\s*[:|]\s*([^|]+)"),
    ]:
        m = re.search(lab, whole, flags=re.I)
        if m: out[key] = norm(m.group(1))

    # coords
    m = re.search(r"Latitude\s*:\s*([+-]?\d+(?:\.\d+)?)\b.*?Longitude\s*:\s*([+-]?\d+(?:\.\d+)?)\b", whole, flags=re.I)
    if m:
        out["latitude"], out["longitude"] = m.group(1), m.group(2)

    return out

def write_row(writer, row): writer.writerow(row)

def main():
    out_csv = Path("ctbuh_us_all.csv")
    html_dir = Path("ctbuh_us_html"); html_dir.mkdir(exist_ok=True)

    # resume support
    done = set()
    if out_csv.exists():
        with out_csv.open() as f:
            for r in csv.DictReader(f):
                if r.get("source_url"):
                    done.add(r["source_url"])

    fieldnames = ["source","source_url","name","country","state","city",
                  "year_completed","height_m","height_ft","floors","function","material",
                  "architect_firm","developer","contractor","structural_engineer","mep_engineer",
                  "latitude","longitude"]
    mode = "a" if out_csv.exists() else "w"
    f = out_csv.open(mode, newline="", encoding="utf-8")
    w = csv.DictWriter(f, fieldnames=fieldnames)
    if mode == "w": w.writeheader()

    try:
        # 1) discover U.S. cities
        cities = get_us_city_urls()
        print(f"[cities] discovered {len(cities)} U.S. city URLs")

        # 2) collect building links from every city page
        bset = set()
        for i, c in enumerate(cities, 1):
            try:
                links, city_html = discover_building_links_from_city(c)
                (html_dir / f"city_{safe_name(c)}.html").write_text(city_html, encoding="utf-8")
                bset.update(links)
                print(f"[{i}/{len(cities)}] {c} -> {len(links)} building links (total {len(bset)})")
            except Exception as e:
                print(f"[city error] {c}: {e}")

        # 3) visit each building page
        for j, burl in enumerate(sorted(bset), 1):
            if burl in done: continue
            try:
                bh = get(burl)
                (html_dir / f"b_{safe_name(burl)}.html").write_text(bh, encoding="utf-8")
                row = parse_building(burl, bh)
                # keep U.S.-only rows (in case cities list included non-US)
                if (row.get("country") or "").lower() not in ("united states","usa","u.s.","us"):
                    # try to infer from city URL parent
                    if "/country/united-states" not in COUNTRY_URL:
                        continue
                write_row(w, row); f.flush()
                if j % 50 == 0: print(f"[buildings] wrote {j}/{len(bset)}")
            except Exception as e:
                print(f"[building error] {burl}: {e}")

    finally:
        f.close()

if __name__ == "__main__":
    main()


[cities] discovered 396 U.S. city URLs
[1/396] https://www.skyscrapercenter.com/city/abu-dhabi -> 11 building links (total 11)
[2/396] https://www.skyscrapercenter.com/city/addis-ababa -> 6 building links (total 17)
[3/396] https://www.skyscrapercenter.com/city/ajman -> 10 building links (total 27)
[4/396] https://www.skyscrapercenter.com/city/al-khobar -> 5 building links (total 32)
[5/396] https://www.skyscrapercenter.com/city/albany -> 10 building links (total 42)
[6/396] https://www.skyscrapercenter.com/city/algiers -> 2 building links (total 44)
[7/396] https://www.skyscrapercenter.com/city/almaty -> 10 building links (total 54)
[8/396] https://www.skyscrapercenter.com/city/amman -> 8 building links (total 62)
[9/396] https://www.skyscrapercenter.com/city/ampang-jaya -> 10 building links (total 72)
[10/396] https://www.skyscrapercenter.com/city/amsterdam -> 10 building links (total 82)
[11/396] https://www.skyscrapercenter.com/city/ankara -> 10 building links (total 92)
[12/396] h

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
World-scale scraper for The Skyscraper Center (CTBUH).

- Start from any mix of seed pages (country, city, company, explore/search).
- Discover all building links (/building/...) on those pages.
- Visit each building page and extract normalized labeled fields from facts boxes/tables (DOM-first).
- Save raw HTML for reproducibility.
- Write a wide CSV: stable core columns + any extra labeled fields we find.

USAGE:
  python scrape_skyscrapercenter_world.py --seeds seeds.txt --out ctbuh_world.csv --outdir ctbuh_world_html

seeds.txt example (one per line, add many to scale up):
  https://www.skyscrapercenter.com/countries        # global countries index (if accessible)
  https://www.skyscrapercenter.com/city/new-york-city
  https://www.skyscrapercenter.com/city/chicago
  https://www.skyscrapercenter.com/country/united-states
  https://www.skyscrapercenter.com/company/3393
  https://www.skyscrapercenter.com/buildings        # explore list (if present)
  https://www.skyscrapercenter.com/search?q=Tower   # search results (optional)

Politeness:
- Keep request delays (>= 0.8–1.2s).
- Use a clear academic UA string.
- Do NOT download or redistribute images.
"""

import argparse, csv, re, sys, time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_HOST = "www.skyscrapercenter.com"
BASE = f"https://{BASE_HOST}"
UA = {"User-Agent": "Annette-Capstone/1.0 (academic research; contact: your_email@example.edu)"}

# --------- HTTP session with retry ----------
def make_session():
    s = requests.Session()
    s.headers.update(UA)
    retry = Retry(
        total=5,
        backoff_factor=0.8,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD", "OPTIONS"],
        raise_on_status=False,
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://", HTTPAdapter(max_retries=retry))
    return s

S = make_session()

def polite_get(url, sleep=1.0, timeout=25):
    time.sleep(sleep)
    r = S.get(url, timeout=timeout)
    r.raise_for_status()
    # best-effort encoding normalization
    if not r.encoding or r.encoding.lower() in ("iso-8859-1","latin-1"):
        r.encoding = "utf-8"
    return r

# --------- Helpers ----------
def norm(s):
    if not s: return None
    s = re.sub(r"\s+", " ", s).strip()
    return s or None

def safe_name(url):
    return re.sub(r"[^a-zA-Z0-9_.-]+", "_", url)[:180]

def is_same_host(url):
    try:
        return urlparse(url).netloc.lower() == BASE_HOST
    except:
        return False

def is_building_url(url):
    try:
        u = urlparse(url)
        return u.netloc.lower() == BASE_HOST and "/building/" in u.path
    except:
        return False

# --------- Discovery ----------
def discover_building_links(seed_url, html):
    soup = BeautifulSoup(html, "lxml")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        absu = urljoin(seed_url, a["href"])
        if is_building_url(absu):
            links.add(absu)
    return links

# --------- Parsing labeled facts ----------
# We’ll build a generic (label -> value) extractor that walks common HTML patterns used for fact panels:
# - <table> with <th>/<td> or <td class="label">/<td class="value">
# - definition lists <dl><dt>Label</dt><dd>Value</dd>
# - info blocks with "label: value"

CANON_MAP = {
    # canonical_name: [regex patterns for labels we expect]
    "name":          [r"^name$"],
    "status":        [r"^status$"],
    "city":          [r"^city$"],
    "state":         [r"^state|province|region$"],
    "country":       [r"^country$"],
    "location":      [r"^location$"],
    "completion_year":[r"^completion(?: year)?$","^completed$","^year$"],
    "start_year":    [r"^start(?:ed)?(?: year)?$","^start of construction$"],
    "height_arch_m": [r"^height.*architectural.*m","^height.*arch.*m"],
    "height_arch_ft":[r"^height.*architectural.*ft","^height.*arch.*ft"],
    "height_tip_m":  [r"^height.*tip.*m"],
    "height_tip_ft": [r"^height.*tip.*ft"],
    "height_roof_m": [r"^height.*roof.*m"],
    "height_roof_ft":[r"^height.*roof.*ft"],
    "floors_above":  [r"^floors? above ground$","^floors? \(above ground\)$"],
    "floors_below":  [r"^floors? below ground$","^floors? \(below ground\)$"],
    "floors_total":  [r"^floors?$","^storeys?$"],
    "function":      [r"^function$","^primary function$","^building function$"],
    "functions":     [r"^functions$","^mixed use breakdown$"],
    "material":      [r"^material$","^materials?$"],
    "structural_system":[r"^structural system$","^structure$"],
    "architect_firm":[r"^architects?$","^architecture firm$","^design architect$"],
    "owner":         [r"^owner$","^building owner$"],
    "developer":     [r"^developer$"],
    "contractor":    [r"^contractor$","^main contractor$","^general contractor$"],
    "structural_engineer":[r"^structural engineer$","^structural engineering$"],
    "mep_engineer":  [r"^mep engineer$","^services engineer$","^mechanical engineer$"],
    "elevators":     [r"^elevators?$","^lifts?$"],
    "cost":          [r"^cost$","^construction cost$"],
    "aka_names":     [r"^other names$","^also known as$","^aka$"],
    "latitude":      [r"^latitude$"],
    "longitude":     [r"^longitude$"],
}

# compile pattern map
CANON_MAP_COMPILED = {k: [re.compile(p, re.I) for p in v] for k,v in CANON_MAP.items()}

def label_to_canon(label):
    if not label: return None
    lab = norm(label).lower()
    for canon, patterns in CANON_MAP_COMPILED.items():
        for pat in patterns:
            if pat.search(lab):
                return canon
    return None

def clean_value(v):
    v = norm(v)
    if not v: return None
    # remove common tooltip sentences that caused pollution:
    v = re.sub(r"Height is measured from the level of the lowest.*?$", "", v, flags=re.I)
    v = re.sub(r"Other names the building has commonly been known as.*?$", "", v, flags=re.I)
    v = v.strip(" :|")
    return v or None

def extract_kv_from_tables(soup):
    kv = {}
    # table-based
    for tbl in soup.find_all("table"):
        for tr in tbl.find_all("tr"):
            cells = tr.find_all(["th","td"])
            if len(cells) >= 2:
                label = cells[0].get_text(" ", strip=True)
                val   = cells[1].get_text(" ", strip=True)
                canon = label_to_canon(label)
                if canon:
                    kv.setdefault(canon, clean_value(val))
                else:
                    # keep raw label as fallback
                    key = f"extra::{norm(label).lower()}"
                    kv.setdefault(key, clean_value(val))
    return kv

def extract_kv_from_deflists(soup):
    kv = {}
    for dl in soup.find_all("dl"):
        dts = dl.find_all("dt")
        dds = dl.find_all("dd")
        for dt,dd in zip(dts, dds):
            label = dt.get_text(" ", strip=True)
            val   = dd.get_text(" ", strip=True)
            canon = label_to_canon(label)
            if canon:
                kv.setdefault(canon, clean_value(val))
            else:
                key = f"extra::{norm(label).lower()}"
                kv.setdefault(key, clean_value(val))
    return kv

def extract_pairs_by_colon(soup):
    # very light fallback for 'Label: Value' blocks
    kv = {}
    for tag in soup.find_all(["p","li","div","span"]):
        txt = tag.get_text(" ", strip=True)
        if ":" in txt and len(txt) < 200:
            parts = txt.split(":",1)
            label, val = norm(parts[0]), norm(parts[1])
            canon = label_to_canon(label)
            if canon and val:
                kv.setdefault(canon, clean_value(val))
    return kv

def parse_building(url, html):
    soup = BeautifulSoup(html, "lxml")

    # NAME (safe)
    name = None
    h = soup.find(["h1","h2"])
    if h:
        name = norm(h.get_text(" ", strip=True))

    # Merge multiple extraction passes (tables, deflists, colon blocks)
    kv = {}
    for fn in (extract_kv_from_tables, extract_kv_from_deflists, extract_pairs_by_colon):
        try:
            kv.update({k:v for k,v in fn(soup).items() if v})
        except Exception:
            pass

    # Build final record
    rec = {
        "source": "ctbuh",
        "source_url": url,
        "name": name,
        # stable core:
        "country": kv.pop("country", None),
        "state": kv.pop("state", None),
        "city": kv.pop("city", None),
        "status": kv.pop("status", None),
        "year_completed": kv.pop("completion_year", None),
        "start_year": kv.pop("start_year", None),
        "height_arch_m": kv.pop("height_arch_m", None),
        "height_arch_ft": kv.pop("height_arch_ft", None),
        "height_tip_m": kv.pop("height_tip_m", None),
        "height_tip_ft": kv.pop("height_tip_ft", None),
        "height_roof_m": kv.pop("height_roof_m", None),
        "height_roof_ft": kv.pop("height_roof_ft", None),
        "floors_total": kv.pop("floors_total", None),
        "floors_above": kv.pop("floors_above", None),
        "floors_below": kv.pop("floors_below", None),
        "function": kv.pop("function", None),
        "functions": kv.pop("functions", None),
        "material": kv.pop("material", None),
        "structural_system": kv.pop("structural_system", None),
        "architect_firm": kv.pop("architect_firm", None),
        "owner": kv.pop("owner", None),
        "developer": kv.pop("developer", None),
        "contractor": kv.pop("contractor", None),
        "structural_engineer": kv.pop("structural_engineer", None),
        "mep_engineer": kv.pop("mep_engineer", None),
        "elevators": kv.pop("elevators", None),
        "cost": kv.pop("cost", None),
        "aka_names": kv.pop("aka_names", None),
        "latitude": kv.pop("latitude", None),
        "longitude": kv.pop("longitude", None),
    }

    # any other labeled fields we don’t map yet -> keep as extras
    # (this future-proofs your dataset; you can explore columns later)
    for k,v in list(kv.items()):
        if v:
            rec[k] = v

    return rec

# --------- Crawl ----------
def read_seeds(path: Path) -> list[str]:
    seeds = []
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        seeds.append(line)
    return seeds

def crawl(seeds, out_csv: Path, out_dir: Path, sleep=1.0):
    out_dir.mkdir(parents=True, exist_ok=True)
    html_dir = out_dir / "html"; html_dir.mkdir(exist_ok=True)

    # Resume support (don’t re-scrape finished buildings)
    done = set()
    if out_csv.exists():
        with out_csv.open(newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                if row.get("source_url"):
                    done.add(row["source_url"])

    # First pass: collect all building links
    building_links = set()
    for seed in seeds:
        if not is_same_host(seed):
            print(f"[skip external] {seed}")
            continue
        try:
            r = polite_get(seed, sleep=sleep)
        except Exception as e:
            print(f"[seed error] {seed}: {e}")
            continue
        html = r.text
        (html_dir / f"seed_{safe_name(seed)}.html").write_text(html, encoding="utf-8")
        found = discover_building_links(seed, html)
        building_links |= found
        print(f"[seed] {seed} -> {len(found)} building links (total {len(building_links)})")

    # Second pass: fetch each building
    # We’ll build the CSV header dynamically from union of keys as we go.
    fieldnames = set(["source","source_url","name","country","state","city","status",
                      "year_completed","start_year",
                      "height_arch_m","height_arch_ft","height_tip_m","height_tip_ft",
                      "height_roof_m","height_roof_ft",
                      "floors_total","floors_above","floors_below",
                      "function","functions","material","structural_system",
                      "architect_firm","owner","developer","contractor","structural_engineer",
                      "mep_engineer","elevators","cost","aka_names","latitude","longitude"])

    rows_buffer = []
    def flush(rows):
        nonlocal fieldnames
        # update fieldnames to include any extras
        for r in rows:
            fieldnames |= set(r.keys())
        # write in a stable order: core first, then sorted extras
        core = ["source","source_url","name","country","state","city","status",
                "year_completed","start_year",
                "height_arch_m","height_arch_ft","height_tip_m","height_tip_ft",
                "height_roof_m","height_roof_ft",
                "floors_total","floors_above","floors_below",
                "function","functions","material","structural_system",
                "architect_firm","owner","developer","contractor","structural_engineer",
                "mep_engineer","elevators","cost","aka_names","latitude","longitude"]
        extras = sorted([c for c in fieldnames if c not in core])
        ordered = core + extras

        mode = "a" if out_csv.exists() else "w"
        with out_csv.open(mode, newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=ordered)
            if mode == "w":
                w.writeheader()
            for r in rows:
                w.writerow({k: r.get(k, "") for k in ordered})

    count = 0
    total = len(building_links)
    for b in sorted(building_links):
        if b in done: continue
        try:
            rb = polite_get(b, sleep=sleep)
            html = rb.text
            (html_dir / f"b_{safe_name(b)}.html").write_text(html, encoding="utf-8")
            rec = parse_building(b, html)
            rows_buffer.append(rec)
            count += 1
            if count % 25 == 0:
                flush(rows_buffer); rows_buffer.clear()
                print(f"[{count}/{total}] wrote 25 rows (running total).")
        except Exception as e:
            print(f"[building error] {b}: {e}")

    if rows_buffer:
        flush(rows_buffer); rows_buffer.clear()
        print(f"[done] wrote remaining rows. Total new: {count}")

# --------- CLI ----------
# Modified main function to run within Colab
def main_colab(seeds_file="seeds.txt", out_csv_file="ctbuh_world.csv", out_dir_name="ctbuh_world_html", sleep_time=1.0):
    seeds_path = Path(seeds_file)
    out_csv = Path(out_csv_file)
    out_dir = Path(out_dir_name)

    # Create a dummy seeds file if it doesn't exist
    if not seeds_path.exists():
        seeds_path.write_text("https://www.skyscrapercenter.com/city/new-york-city")
        print(f"Created dummy seeds file: {seeds_path}")

    seeds = read_seeds(seeds_path)
    if not seeds:
        print("No seeds provided.", file=sys.stderr)
        sys.exit(1)

    crawl(seeds, out_csv, out_dir, sleep=sleep_time)


if __name__ == "__main__":
    # Start the crawl with default or specified parameters
    main_colab()

  "floors_above":  [r"^floors? above ground$","^floors? \(above ground\)$"],
  "floors_below":  [r"^floors? below ground$","^floors? \(below ground\)$"],


[seed] https://www.skyscrapercenter.com/city/new-york-city -> 11 building links (total 11)
[done] wrote remaining rows. Total new: 11


In [None]:
if __name__ == "__main__":
    # Start the crawl with default or specified parameters
    main_colab()

[seed] https://www.skyscrapercenter.com/city/new-york-city -> 11 building links (total 11)


In [None]:
!pip install playwright bs4 requests lxml
!playwright install

Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright, bs4
Successfully installed bs4-0.0.2 playwright-1.55.0 pyee-13.0.0
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 0.0s[0K[1G173.7 MiB [] 0% 35.9s[0K[1G173.7 MiB [] 0% 23.8s[0K[1G173.7 MiB [] 0% 19.4s[0K[1G173.7 MiB []

In [None]:
import asyncio

# Run the main function that uses asyncio
await main()

[cities] discovered 396 U.S. city URLs
[1/396] https://www.skyscrapercenter.com/city/abu-dhabi -> 11 building links (total 11)
[2/396] https://www.skyscrapercenter.com/city/addis-ababa -> 6 building links (total 17)
[3/396] https://www.skyscrapercenter.com/city/ajman -> 10 building links (total 27)
[4/396] https://www.skyscrapercenter.com/city/al-khobar -> 5 building links (total 32)
[5/396] https://www.skyscrapercenter.com/city/albany -> 10 building links (total 42)
[6/396] https://www.skyscrapercenter.com/city/algiers -> 2 building links (total 44)
[7/396] https://www.skyscrapercenter.com/city/almaty -> 10 building links (total 54)
[8/396] https://www.skyscrapercenter.com/city/amman -> 8 building links (total 62)
[9/396] https://www.skyscrapercenter.com/city/ampang-jaya -> 10 building links (total 72)
[10/396] https://www.skyscrapercenter.com/city/amsterdam -> 10 building links (total 82)
[11/396] https://www.skyscrapercenter.com/city/ankara -> 10 building links (total 92)
[12/396] h

TypeError: object NoneType can't be used in 'await' expression

In [None]:
# Create a dummy seeds.txt file if it doesn't exist
seeds_file_path = "seeds.txt"
if not Path(seeds_file_path).exists():
    with open(seeds_file_path, "w") as f:
        f.write("https://www.skyscrapercenter.com/city/new-york-city\n")
        f.write("https://www.skyscrapercenter.com/city/chicago\n")
    print(f"Created dummy seeds file: {seeds_file_path}")
else:
    print(f"Seeds file already exists: {seeds_file_path}")

Seeds file already exists: seeds.txt


In [None]:
import asyncio

# Run the main function that uses asyncio
await main()

[seed] https://www.skyscrapercenter.com/city/new-york-city -> 0 cumulative links


In [None]:
# scrape_skyscrapercenter_playwright.py
import asyncio, csv, re, time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_HOST = "www.skyscrapercenter.com"
BASE      = f"https://{BASE_HOST}"
UA_STR    = "Annette-Capstone/1.0 (academic research; contact: darpo24a@mtholyoke.edu)"

# ---------- HTTP (for building pages) ----------
def make_session():
    s = requests.Session()
    s.headers.update({"User-Agent": UA_STR})
    retry = Retry(total=5, backoff_factor=0.8, status_forcelist=[429,500,502,503,504],
                  allowed_methods=["GET","HEAD","OPTIONS"], raise_on_status=False)
    s.mount("https://", HTTPAdapter(max_retries=retry))
    s.mount("http://",  HTTPAdapter(max_retries=retry))
    return s
S = make_session()

def polite_get(url, sleep=0.8, timeout=25):
    time.sleep(sleep)
    r = S.get(url, timeout=timeout)
    r.raise_for_status()
    if not r.encoding or r.encoding.lower() in ("iso-8859-1","latin-1"):
        r.encoding = "utf-8"
    return r

# ---------- helpers ----------
def norm(s):
    if not s: return None
    return re.sub(r"\s+", " ", s).strip()

def is_building_url(u):
    try:
        p = urlparse(u)
        return p.netloc.lower() == BASE_HOST and "/building/" in p.path
    except: return False

def safe(u): return re.sub(r"[^a-zA-Z0-9_.-]+","_",u)[:180]

# label canonicalization (raw strings to avoid warnings)
CANON_MAP = {
    "status":              [r"^status$"],
    "city":                [r"^city$"],
    "state":               [r"^(state|province|region)$"],
    "country":             [r"^country$"],
    "completion_year":     [r"^completion( year)?$", r"^completed$", r"^year$"],
    "start_year":          [r"^start(ed)?( year)?$", r"^start of construction$"],
    "height_arch_m":       [r"^height.*architectural.*m", r"^height.*arch.*m"],
    "height_arch_ft":      [r"^height.*architectural.*ft", r"^height.*arch.*ft"],
    "height_tip_m":        [r"^height.*tip.*m"],
    "height_tip_ft":       [r"^height.*tip.*ft"],
    "height_roof_m":       [r"^height.*roof.*m"],
    "height_roof_ft":      [r"^height.*roof.*ft"],
    "floors_total":        [r"^(floors?|storeys?)$"],
    "floors_above":        [r"^floors? above ground$", r"^floors? \(above ground\)$"],
    "floors_below":        [r"^floors? below ground$", r"^floors? \(below ground\)$"],
    "function":            [r"^(function|primary function|building function)$"],
    "functions":           [r"^(functions|mixed use breakdown)$"],
    "material":            [r"^materials?$"],
    "structural_system":   [r"^(structural system|structure)$"],
    "architect_firm":      [r"^(architects?|architecture firm|design architect)$"],
    "owner":               [r"^owner$"],
    "developer":           [r"^developer$"],
    "contractor":          [r"^(contractor|main contractor|general contractor)$"],
    "structural_engineer": [r"^structural engineer(ing)?$"],
    "mep_engineer":        [r"^(mep engineer|services engineer|mechanical engineer)$"],
    "elevators":           [r"^(elevators?|lifts?)$"],
    "cost":                [r"^(cost|construction cost)$"],
    "aka_names":           [r"^(other names|also known as|aka)$"],
    "latitude":            [r"^latitude$"],
    "longitude":           [r"^longitude$"],
}
CANON_MAP_COMPILED = {k: [re.compile(p, re.I) for p in v] for k,v in CANON_MAP.items()}
def label_to_canon(label):
    lab = norm(label or "").lower()
    for canon, pats in CANON_MAP_COMPILED.items():
        for pat in pats:
            if pat.search(lab): return canon
    return None

TOOLTIP_GARBAGE = [
    r"Height is measured from the level of the lowest.*$",
    r"Other names the building has commonly been known as.*$",
]
def clean_val(v):
    v = norm(v)
    if not v: return None
    for pat in TOOLTIP_GARBAGE:
        v = re.sub(pat, "", v, flags=re.I)
    return v.strip(" :|") or clean_val

def kv_from_tables(soup):
    kv = {}
    for tbl in soup.find_all("table"):
        for tr in tbl.find_all("tr"):
            cells = tr.find_all(["th","td"])
            if len(cells) >= 2:
                label = cells[0].get_text(" ", strip=True)
                val   = cells[1].get_text(" ", strip=True)
                canon = label_to_canon(label)
                if canon and val:
                    kv.setdefault(canon, clean_val(val))
    return kv

def kv_from_deflists(soup):
    kv = {}
    for dl in soup.find_all("dl"):
        dts = dl.find_all("dt")
        dds = dl.find_all("dd")
        for dt,dd in zip(dts, dds):
            canon = label_to_canon(dt.get_text(" ", strip=True))
            if canon:
                kv.setdefault(canon, clean_val(dd.get_text(" ", strip=True)))
    return kv

def parse_building(url, html):
    soup = BeautifulSoup(html, "lxml")
    name = None
    h = soup.find(["h1","h2"])
    if h: name = norm(h.get_text(" ", strip=True))
    kv = {}
    kv.update(kv_from_tables(soup))
    kv.update(kv_from_deflists(soup))
    rec = {
        "source":"ctbuh","source_url":url,"name":name,
        "country":kv.get("country"),"state":kv.get("state"),"city":kv.get("city"),
        "status":kv.get("status"),
        "year_completed":kv.get("completion_year"),"start_year":kv.get("start_year"),
        "height_arch_m":kv.get("height_arch_m"),"height_arch_ft":kv.get("height_arch_ft"),
        "height_tip_m":kv.get("height_tip_m"),"height_tip_ft":kv.get("height_tip_ft"),
        "height_roof_m":kv.get("height_roof_m"),"height_roof_ft":kv.get("height_roof_ft"),
        "floors_total":kv.get("floors_total"),"floors_above":kv.get("floors_above"),"floors_below":kv.get("floors_below"),
        "function":kv.get("function"),"functions":kv.get("functions"),
        "material":kv.get("material"),"structural_system":kv.get("structural_system"),
        "architect_firm":kv.get("architect_firm"),"owner":kv.get("owner"),"developer":kv.get("developer"),
        "contractor":kv.get("contractor"),"structural_engineer":kv.get("structural_engineer"),"mep_engineer":kv.get("mep_engineer"),
        "elevators":kv.get("elevators"),"cost":kv.get("cost"),"aka_names":kv.get("aka_names"),
        "latitude":kv.get("latitude"),"longitude":kv.get("longitude"),
    }
    return rec

# ---------- Playwright discovery (renders + clicks + loads all) ----------
async def discover_building_links_with_browser(seeds, outdir: Path, delay_click=0.4):
    outdir.mkdir(parents=True, exist_ok=True)
    html_dir = outdir / "html"; html_dir.mkdir(exist_ok=True)

    links = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(user_agent=UA_STR)
        page = await context.new_page()

        for seed in seeds:
            if urlparse(seed).netloc.lower() != BASE_HOST:
                print(f"[skip external] {seed}"); continue
            try:
                await page.goto(seed, wait_until="domcontentloaded")
                await page.wait_for_timeout(600)  # small pause
                # Try clicking a tab named "Buildings" if visible
                try:
                    el = await page.get_by_role("link", name=re.compile(r"^Buildings$", re.I)).first
                    if await el.is_visible():
                        await el.click()
                        await page.wait_for_timeout(600)
                except Exception:
                    pass

                # Keep clicking "Load more" / next pagination buttons if present
                while True:
                    clicked = False
                    for text in ["Load more","Show more","More","Next","›"]:
                        try:
                            btn = await page.get_by_role("button", name=re.compile(text, re.I)).first
                            if await btn.is_visible():
                                await btn.click()
                                await page.wait_for_timeout(int(delay_click*1000))
                                clicked = True
                                break
                        except Exception:
                            pass
                    if not clicked:
                        # also try anchor pagination
                        try:
                            a_next = await page.get_by_role("link", name=re.compile(r"Next|›", re.I)).first
                            if await a_next.is_visible():
                                await a_next.click()
                                await page.wait_for_timeout(int(delay_click*1000))
                                clicked = True
                        except Exception:
                            pass
                    if not clicked:
                        break

                # collect all /building/ links from the rendered DOM
                anchors = await page.locator("a[href]").all()
                for a in anchors:
                    href = await a.get_attribute("href")
                    if not href: continue
                    absu = urljoin(seed, href)
                    if is_building_url(absu):
                        links.add(absu)

                # cache the final DOM
                html = await page.content()
                (html_dir / f"seed_{safe(seed)}.html").write_text(html, encoding="utf-8")
                print(f"[seed] {seed} -> {len([u for u in links if seed in u])} cumulative links")

            except Exception as e:
                print(f"[seed error] {seed}: {e}")

        await browser.close()
    return links

# ---------- main crawl ----------
async def main(seeds_file="seeds.txt", out_csv="ctbuh_world.csv", outdir="ctbuh_world_cache", sleep=0.9):
    seeds = [ln.strip() for ln in Path(seeds_file).read_text(encoding="utf-8").splitlines()
             if ln.strip() and not ln.startswith("#")]

    outdir = Path(outdir); outdir.mkdir(exist_ok=True)
    html_dir = outdir / "buildings"; html_dir.mkdir(exist_ok=True)

    # 1) Discover all building URLs with a browser (JS rendering)
    discovered = await discover_building_links_with_browser(seeds, outdir)

    # 2) Resume: skip already scraped URLs
    done = set()
    out_csv_p = Path(out_csv)
    if out_csv_p.exists():
        with out_csv_p.open(newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                if row.get("source_url"): done.add(row["source_url"])

    # 3) Write header (wide but stable)
    core = ["source","source_url","name","country","state","city","status",
            "year_completed","start_year",
            "height_arch_m","height_arch_ft","height_tip_m","height_tip_ft",
            "height_roof_m","height_roof_ft",
            "floors_total","floors_above","floors_below",
            "function","functions","material","structural_system",
            "architect_firm","owner","developer","contractor","structural_engineer",
            "mep_engineer","elevators","cost","aka_names","latitude","longitude"]
    wmode = "a" if out_csv_p.exists() else "w"
    f = out_csv_p.open(wmode, newline="", encoding="utf-8")
    w = csv.DictWriter(f, fieldnames=core)
    if wmode == "w": w.writeheader()

    # 4) Fetch + parse each building page (requests = faster)
    try:
        total = len(discovered)
        for i, url in enumerate(sorted(discovered), 1):
            if url in done: continue
            try:
                r = polite_get(url, sleep=sleep)
                html = r.text
                (html_dir / f"b_{safe(url)}.html").write_text(html, encoding="utf-8")
                row = parse_building(url, html)
                w.writerow(row); f.flush()
                if i % 50 == 0:
                    print(f"[{i}/{total}] wrote 50…")
            except Exception as e:
                print(f"[building error] {url}: {e}")
    finally:
        f.close()

if __name__ == "__main__":
    # Use await instead of asyncio.run() in Colab
    await main()

[seed] https://www.skyscrapercenter.com/city/new-york-city -> 0 cumulative links


In [None]:
import asyncio

# Run the main function that uses asyncio
await main()

In [None]:
# Create a dummy seeds.txt file if it doesn't exist
seeds_file_path = "seeds.txt"
if not Path(seeds_file_path).exists():
    with open(seeds_file_path, "w") as f:
        f.write("https://www.skyscrapercenter.com/city/new-york-city\n")
        f.write("https://www.skyscrapercenter.com/city/chicago\n")
    print(f"Created dummy seeds file: {seeds_file_path}")
else:
    print(f"Seeds file already exists: {seeds_file_path}")

ANALYSIS