In [1]:
import pandas as pd
import aiohttp, asyncio, json, re, random, time
from tqdm import tqdm
import nest_asyncio
nest_asyncio.apply()

# ---------- USER SETTINGS ----------
CSV_PATH        = "Downloads/AgeDataset-V1.csv"
TARGET_POLYS    = 100_000
BATCH_SIZE      = 50
MAX_CONCURRENT  = 5
CANDIDATE_CAP   = 1_200_000
MIN_BIRTH_YEAR  = -1000
MAX_BIRTH_YEAR  = 2022
POLYMATH_THRESH = 3
JSON_OUT        = "polymaths_enriched.json"
RANDOM_SEED     = 42
# ------------------------------------

random.seed(RANDOM_SEED)

bucket_map = {
    # --- STEM ---
    "Q901": "STEM",              # scientist
    "Q170790": "STEM",           # mathematician
    "Q169470": "STEM",           # physicist
    "Q593644": "STEM",           # chemist
    "Q81096": "STEM",            # engineer
    "Q39631": "STEM",            # physician
    "Q11063": "STEM",            # astronomer
    "Q19350898": "STEM",         # theoretical physicist
    "Q82594": "STEM",            # computer scientist
    "Q2732142": "STEM",          # statistician
    "Q2919046": "STEM",          # biochemist
    "Q6337803": "STEM",          # neuroscientist
    "Q29169143": "STEM",         # data scientist
    "Q520549": "STEM",           # geologist
    "Q350979": "STEM",           # zoologist
    "Q2374149": "STEM",          # botanist
    "Q2310145": "STEM",          # meteorologist
    "Q205375": "STEM",           # inventor
    "Q2055046": "STEM",          # physiologist
    "Q18805": "STEM",            # naturalist

    # --- Humanities ---
    "Q4964182": "Humanities",    # philosopher
    "Q201788": "Humanities",     # historian
    "Q1622272": "Humanities",    # university teacher
    "Q16533": "Humanities",      # judge
    "Q188094": "Humanities",     # economist
    "Q82955": "Humanities",      # politician (intellectuals)
    "Q37226": "Humanities",      # teacher
    "Q2306091": "Humanities",    # sociologist
    "Q15632482": "Humanities",   # epistemologist
    "Q1234713": "Humanities",    # theologian
    "Q4773904": "Humanities",    # anthropologist
    "Q14467526": "Humanities",   # linguist
    "Q14565331": "Humanities",   # logician
    "Q2468727": "Humanities",    # classicist

    # --- Arts ---
    "Q1028181": "Arts",          # painter
    "Q36834": "Arts",            # composer
    "Q36180": "Arts",            # writer
    "Q49757": "Arts",            # poet
    "Q42973": "Arts",            # architect
    "Q214917": "Arts",           # playwright
    "Q639669": "Arts",           # musician
    "Q3303330": "Arts",          # calligrapher
    "Q6625963": "Arts",          # novelist
    "Q11774202": "Arts",         # essayist
    "Q8178443": "Arts",          # librettist
    "Q1281618": "Arts",          # sculptor
    "Q11569986": "Arts",         # printmaker
    "Q3658608": "Arts",          # caricaturist
    "Q3391743": "Arts",          # visual artist
    "Q5322166": "Arts",          # designer
    "Q10774753": "Arts",         # performance artist

    # --- Filtering Domains ---
    "Q4991371": "Military",      # soldier
    "Q11545923": "Military",     # military commander
    "Q116": "Royalty",           # monarch
    "Q14828907": "Politics",     # dictator
    "Q121998": "Politics",       # ambassador (tentative)
}

good_domains = {"STEM", "Arts", "Humanities"}
power_domains = {"Politics", "Military", "Royalty"}

def domains_for(occ_ids):
    return {bucket_map.get(qid, "Other") for qid in occ_ids}

def is_real_polymath(occ_ids):
    doms = domains_for(occ_ids)
    good_doms = doms & good_domains

    # Must span at least 3 distinct domains
    if len(good_doms) < 3:
        return False
    # Must have at least one occupation per domain
    if len(set(bucket_map.get(q, "Other") for q in occ_ids if bucket_map.get(q) in good_domains)) < 3:
        return False
    # Must have at least POLYMATH_THRESH occupations total
    if len(occ_ids) < POLYMATH_THRESH:
        return False
    return True

print("Loading CSV & pre-filtering locally …")
df = pd.read_csv(CSV_PATH, usecols=["Id", "Name", "Birth year"], dtype=str)
df["Birth year"] = pd.to_numeric(df["Birth year"], errors="coerce")
df = df.dropna(subset=["Id", "Birth year"])
df = df[(df["Birth year"] >= MIN_BIRTH_YEAR) & (df["Birth year"] <= MAX_BIRTH_YEAR)]
title_re = re.compile(r"\\b(Dr|Prof|Sir|al-)\\b", re.I)
df["priority"] = df["Name"].str.contains(title_re, na=False)
df = df.sort_values("priority", ascending=False).sample(frac=1.0, random_state=RANDOM_SEED)
candidates = df.head(CANDIDATE_CAP)["Id"].tolist()
print(f"→ Candidate QIDs to check: {len(candidates):,}")

API = ("https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=claims&ids=")

sem = asyncio.Semaphore(MAX_CONCURRENT)
polymaths = []
seen = set()

def extract_date(claims, prop):
    try:
        raw = claims[prop][0]['mainsnak']['datavalue']['value']['time']
        year_str = raw.strip('+').split('-')[0]
        return int(year_str) if year_str.isdigit() or (year_str.startswith('-') and year_str[1:].isdigit()) else None
    except (KeyError, IndexError, TypeError, ValueError):
        return None

def get_all_ids(claims, prop):
    try:
        return [c['mainsnak']['datavalue']['value']['id']
                for c in claims[prop] if 'datavalue' in c['mainsnak']]
    except Exception:
        return []

async def fetch(session, qids, retries=3):
    url = API + "|".join(qids)
    for attempt in range(retries):
        try:
            async with sem:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as r:
                    return await r.json()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            print(f"[Warning] Batch failed (attempt {attempt+1}/{retries}): {e}")
            await asyncio.sleep(1)
    print(f"[Error] Failed after {retries} retries: {qids}")
    return {"entities": {}}


async def pipeline(qids):
    async with aiohttp.ClientSession() as session:
        for i in tqdm(range(0, len(qids), BATCH_SIZE), desc="Fetching", unit="batch"):
            if len(polymaths) >= TARGET_POLYS:
                break
            batch = qids[i:i+BATCH_SIZE]
            data = await fetch(session, batch)
            for qid, ent in data["entities"].items():
                if qid in seen:
                    continue
                seen.add(qid)
                claims = ent.get("claims", {})
                occ_ids = get_all_ids(claims, "P106")
                if is_real_polymath(occ_ids):
                    polymaths.append({
                        "qid": qid,
                        "occ_ids": sorted(occ_ids),
                        "polymath_score": len(occ_ids),
                        "birth_year": extract_date(claims, "P569"),
                        "death_year": extract_date(claims, "P570"),
                        "place_of_birth": get_all_ids(claims, "P19"),
                        "citizenship": get_all_ids(claims, "P27"),
                        "gender": get_all_ids(claims, "P21"),
                        "educated_at": get_all_ids(claims, "P69"),
                        "notable_works": get_all_ids(claims, "P800"),
                        "influenced_by": get_all_ids(claims, "P737"),
                        "influenced": get_all_ids(claims, "P737"),
                        "fields_of_work": get_all_ids(claims, "P101"),
                        "awards": get_all_ids(claims, "P166"),
                        "member_of": get_all_ids(claims, "P463"),
                        "languages": get_all_ids(claims, "P407"),
                        "cause_of_death": get_all_ids(claims, "P509"),
                    })
            await asyncio.sleep(0.2)

async def label_lookup(qids):
    url = ("https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels&languages=en&ids=" + "|".join(qids))
    async with aiohttp.ClientSession() as s:
        async with s.get(url, timeout=40) as r:
            data = await r.json()
            return {qid: d.get("labels", {}).get("en", {}).get("value", qid)
                    for qid, d in data["entities"].items()}

def chunked(seq, n):
    for k in range(0, len(seq), n):
        yield seq[k:k+n]

async def enrich_labels(rows):
    qid_pool = set()
    for row in rows:
        qid_pool.add(row["qid"])
        for k, v in row.items():
            if isinstance(v, list):
                qid_pool.update(v)

    label_map = {}
    for ids in tqdm(list(chunked(sorted(qid_pool), 50)), desc="Label batches"):
        label_map.update(await label_lookup(ids))
        await asyncio.sleep(0.2)

    for row in rows:
        row["name"] = label_map.get(row["qid"], row["qid"])
        row["occupations"] = [label_map.get(o, o) for o in row.pop("occ_ids")]
        for field in ["place_of_birth", "citizenship", "gender", "educated_at",
                      "notable_works", "influenced_by", "influenced",
                      "fields_of_work", "awards", "member_of",
                      "languages", "cause_of_death"]:
            row[field] = [label_map.get(q, q) for q in row.get(field, [])]

# Run everything
asyncio.run(pipeline(candidates))
print(f"✓ Collected {len(polymaths):,} high-quality polymaths")

asyncio.run(enrich_labels(polymaths))

with open(JSON_OUT, "w", encoding="utf-8") as f:
    json.dump(polymaths, f, ensure_ascii=False, indent=2)

print(f"Saved {len(polymaths):,} polymaths → {JSON_OUT}")

Loading CSV & pre-filtering locally …


  df["priority"] = df["Name"].str.contains(title_re, na=False)


→ Candidate QIDs to check: 1,200,000


Fetching:  19%|███████████▏                                                | 4481/24000 [1:55:55<9:49:55,  1.81s/batch]



Fetching: 100%|████████████████████████████████████████████████████████████| 24000/24000 [10:36:01<00:00,  1.59s/batch]


✓ Collected 2,182 high-quality polymaths


Label batches: 100%|█████████████████████████████████████████████████████████████████| 201/201 [03:27<00:00,  1.03s/it]

🚀  Saved 2,182 polymaths → polymaths_enriched.json





In [None]:
# Load enriched JSON
with open("polymaths_enriched.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Label-based bucket map (lowercased for matching)
label_map = {
    # STEM
    "scientist": "STEM", "mathematician": "STEM", "physicist": "STEM", "chemist": "STEM",
    "engineer": "STEM", "physician": "STEM", "astronomer": "STEM", "theoretical physicist": "STEM",
    "computer scientist": "STEM", "statistician": "STEM", "biochemist": "STEM", "neuroscientist": "STEM",
    "data scientist": "STEM", "geologist": "STEM", "zoologist": "STEM", "botanist": "STEM",
    "meteorologist": "STEM", "inventor": "STEM", "physiologist": "STEM", "naturalist": "STEM",

    # Humanities
    "philosopher": "Humanities", "historian": "Humanities", "university teacher": "Humanities",
    "judge": "Humanities", "economist": "Humanities", "politician": "Humanities", "teacher": "Humanities",
    "sociologist": "Humanities", "epistemologist": "Humanities", "theologian": "Humanities",
    "cultural theorist": "Humanities", "anthropologist": "Humanities", "linguist": "Humanities",
    "logician": "Humanities", "classicist": "Humanities",

    # Arts
    "painter": "Arts", "composer": "Arts", "writer": "Arts", "poet": "Arts", "architect": "Arts",
    "playwright": "Arts", "musician": "Arts", "calligrapher": "Arts", "novelist": "Arts", "essayist": "Arts",
    "librettist": "Arts", "sculptor": "Arts", "printmaker": "Arts", "caricaturist": "Arts",
    "visual artist": "Arts", "designer": "Arts", "performance artist": "Arts",
}

good_domains = {"STEM", "Arts", "Humanities"}

def is_valid_polymath_by_label(occupations):
    cleaned = [label.lower() for label in occupations]
    valid_labels = [label for label in cleaned if label in label_map]
    domains = {label_map[label] for label in valid_labels}
    return len(valid_labels) >= 3 and len(domains & good_domains) >= 3

# Apply to dataset
for row in data:
    occ_labels = row.get("occupations", [])
    cleaned = [label.lower() for label in occ_labels]
    valid_labels = [label for label in cleaned if label in label_map]
    row["polymath_score_mapped"] = len(valid_labels)
    row["valid_polymath"] = is_valid_polymath_by_label(occ_labels)

# Save
with open("polymaths_enriched_updated.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Label-based scoring done and saved.")