In [1]:
!pip install mwparserfromhell

Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading mwparserfromhell-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.3/196.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mwparserfromhell
Successfully installed mwparserfromhell-0.6.6


In [2]:
import os
import json
import time
import logging
from urllib.parse import quote

import pandas as pd
import requests
import mwparserfromhell
from tqdm import tqdm
import google.generativeai as genai

In [None]:
# ——— Logging setup ———
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# ——— Configure LLM ———
# The API key is read from the environment instead of being written into the
# notebook, so it cannot leak when the notebook is shared. When the variable is
# unset this falls back to the empty string, exactly like the previous literal.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
model = genai.GenerativeModel("gemma-3-27b-it")

# ——— Configuration ———
CSV_PATH    = '/kaggle/input/part1fin/output_part1_fin.csv'  # input table, one figure per row
URL_COL     = 'link'                                          # column holding the Wikipedia page URL
OUTPUT_FILE = f'all_figures_{int(time.time())}.json'          # timestamped: every run writes a new file

# Shape of one output record. Every field starts empty and is filled from the
# LLM, Wikidata and the page infobox, in that order of precedence.
SCHEMA_TEMPLATE = {
    "name": "", "sex": "", "nick-names": [],
    "birth": {"date": "", "location": {"province": "", "city": "", "coordinates": {"latitude": "", "longitude": ""}}},
    "death": {"date": "", "location": {"province": "", "city": "", "coordinates": {"latitude": "", "longitude": ""}},
              "tomb_location": {"province": "", "city": "", "coordinates": {"latitude": "", "longitude": ""}}},
    "era": "", "occupation": [], "works": [], "events": [], "image": []
}

# Result key -> Wikidata property id read by parse_wikidata().
WIKIDATA_PROPS = {
    'birth_date': 'P569', 'death_date': 'P570',
    'birth_place': 'P19', 'death_place': 'P20',
    'gender': 'P21', 'occupation': 'P106',
    'notable_works': 'P800', 'nickname': 'P1449',
    'resting_place': 'P119', 'era': 'P2348'
}

# ——— Shared session with headers ———
# One Session so connections are reused across the many requests made per figure.
# NOTE(review): the User-Agent still carries placeholder contact details —
# replace them with a real URL / e-mail before running against Wikimedia.
session = requests.Session()
session.headers.update({
    'User-Agent': 'MyDataExtractorBot/1.0 (https://yourdomain.example; mailto:you@example.com)'
})


In [4]:
def retry_request(fn, *args, max_retries=5, backoff_factor=1.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying transient failures with exponential backoff.

    Retried failures:
      * ``requests.HTTPError`` with status 429 or a transient 5xx (500/502/503/504);
      * any other ``requests.RequestException`` (connection errors, timeouts, ...);
      * any other exception whose message looks like a rate-limit / quota error
        (this covers the LLM client, whose errors are not ``requests`` exceptions).

    Args:
        fn: Callable to invoke.
        *args, **kwargs: Forwarded to ``fn`` unchanged.
        max_retries: Maximum number of calls to ``fn`` (not *additional* retries).
        backoff_factor: Delay in seconds before the second attempt; the delay
            doubles after every failed attempt.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        The exception raised by ``fn`` when it is not retryable, or when the
        last allowed attempt also fails.
    """
    retryable_statuses = {429, 500, 502, 503, 504}
    # Quota errors are worded differently by different clients (e.g.
    # "429 Resource has been exhausted (e.g. check quota)"), so match several markers.
    rate_limit_markers = ('rate limit', 'too many requests', 'quota',
                          'resource exhausted', 'resourceexhausted', '429')

    backoff = backoff_factor
    for attempt in range(1, max_retries + 1):
        last_attempt = attempt >= max_retries
        try:
            return fn(*args, **kwargs)
        except requests.HTTPError as e:
            # ``response`` can be None on a hand-built HTTPError; never let that
            # mask the real error with an AttributeError.
            status = getattr(e.response, 'status_code', None)
            if last_attempt or status not in retryable_statuses:
                raise
            logging.warning(f"HTTP {status} – retrying in {backoff}s...")
        except requests.RequestException as e:
            if last_attempt:
                raise
            logging.warning(f"Network error ({e}) – retrying in {backoff}s...")
        except Exception as e:
            # LLM or other transient errors
            message = str(e).lower()
            if last_attempt or not any(marker in message for marker in rate_limit_markers):
                raise
            logging.warning(f"LLM rate limit – retrying in {backoff}s...")
        # Only reached after a retryable failure (success returns, fatal errors raise).
        time.sleep(backoff)
        backoff *= 2

# Fetch summary + metadata

def fetch_wiki_summary(title):
    """Fetch the REST summary of a fa.wikipedia page.

    Args:
        title: Page title, either raw (e.g. Persian text) or already
            percent-encoded as it appears in a page URL.

    Returns:
        ``(extract, data)`` where ``data`` is the decoded JSON response and
        ``extract`` is its ``'extract'`` field (``''`` when absent).

    Raises:
        requests.RequestException: if the request still fails after the
            retries performed by ``retry_request``.
    """
    from urllib.parse import unquote  # local import: only ``quote`` is imported at file level

    # The title is a single URL path segment. Decode first, then encode, so the
    # result carries exactly one level of percent-encoding whether the caller
    # passed a raw or an already-encoded title; characters such as '?', '#' and
    # '/' can then no longer be mistaken for URL syntax.
    encoded_title = quote(unquote(title), safe='')
    url = f'https://fa.wikipedia.org/api/rest_v1/page/summary/{encoded_title}'

    def _get():
        r = session.get(url, timeout=10)
        r.raise_for_status()
        return r.json()

    data = retry_request(_get)
    return data.get('extract', ''), data

# Fetch full Wikidata entity JSON
def fetch_wikidata_entity(qid):
    """Download one Wikidata entity and return its JSON record.

    Args:
        qid: Entity id such as ``'Q42'``.

    Returns:
        The entity dict (with ``labels``, ``claims``, ...), or ``{}`` when the
        response contains no entity at all.

    Raises:
        requests.RequestException: if the request still fails after the
            retries performed by ``retry_request``.
    """
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{qid}.json'

    def _get():
        r = session.get(url, timeout=10)
        r.raise_for_status()
        return r.json()

    entities = retry_request(_get).get('entities', {})
    if qid in entities:
        return entities[qid]
    # When ``qid`` is a redirect (merged item) the payload is keyed by the
    # *target* id, so an exact lookup would silently yield an empty entity.
    return next(iter(entities.values()), {})

# Fetch raw wikitext infobox

def fetch_infobox(title):
    """Return the parameters of a fa.wikipedia page's infobox template.

    The page's current wikitext is fetched through the MediaWiki Action API and
    the first template whose name contains 'جعبه اطلاعات' or starts with
    'infobox' is returned.

    Args:
        title: Page title, raw or percent-encoded as it appears in a page URL.

    Returns:
        ``{param_name: raw_wikitext_value}`` for that template, or ``{}`` when
        the page is missing, has no wikitext, or has no infobox.

    Raises:
        requests.RequestException: if the request still fails after the
            retries performed by ``retry_request``.
    """
    from urllib.parse import unquote  # local import: only ``quote`` is imported at file level

    api_url = 'https://fa.wikipedia.org/w/api.php'
    params = {
        # ``requests`` percent-encodes parameter values itself, so the title must
        # be passed decoded or it would reach the API double-encoded.
        'action': 'query', 'titles': unquote(title),
        'prop': 'revisions', 'rvslots': 'main', 'rvprop': 'content',
        # Follow redirects; otherwise a redirect title yields only the redirect
        # stub, which never contains an infobox.
        'redirects': 1,
        'format': 'json'
    }

    def _get():
        r = session.get(api_url, params=params, timeout=10)
        r.raise_for_status()
        return r.json()

    pages = retry_request(_get).get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})
    # ``or [{}]`` also covers a present-but-empty revisions list.
    revisions = page.get('revisions') or [{}]
    wikitext = revisions[0].get('slots', {}).get('main', {}).get('*', '')
    if not wikitext:
        return {}

    wikicode = mwparserfromhell.parse(wikitext)
    for tmpl in wikicode.filter_templates():
        nm = tmpl.name.strip().lower()
        if 'جعبه اطلاعات' in nm or nm.startswith('infobox'):
            return {str(p.name).strip(): str(p.value).strip() for p in tmpl.params}
    return {}

# Parse Wikidata claims

# qid -> resolved label. Linked entities (occupations, cities, ...) repeat across
# figures; caching the small label string avoids re-downloading whole entities.
_WIKIDATA_LABEL_CACHE = {}


def _truthy_claims(claims):
    """Keep the best-ranked statements: preferred ones if any, else all non-deprecated."""
    usable = [c for c in claims if c.get('rank') != 'deprecated']
    preferred = [c for c in usable if c.get('rank') == 'preferred']
    return preferred or usable


def _format_wikidata_time(value):
    """Render a Wikidata time value as ``YYYY[-MM[-DD]]``, honouring its precision."""
    raw = value.get('time', '')
    if not raw:
        return ''
    sign = '-' if raw.startswith('-') else ''        # keep BCE years negative
    parts = raw.lstrip('+-').split('T')[0].split('-')
    precision = value.get('precision', 11)
    if not isinstance(precision, int):
        precision = 11
    # 11 = day, 10 = month, 9 or coarser = year. Coarser values store the unknown
    # month/day as "00", which must not leak into the output as a real date.
    keep = 3 if precision >= 11 else 2 if precision == 10 else 1
    return sign + '-'.join(parts[:keep])


def _wikidata_label(qid):
    """Return the Persian (fallback: English) label of an entity, cached per qid."""
    if qid not in _WIKIDATA_LABEL_CACHE:
        labels = fetch_wikidata_entity(qid).get('labels', {})
        _WIKIDATA_LABEL_CACHE[qid] = (labels.get('fa', {}).get('value')
                                      or labels.get('en', {}).get('value', ''))
    return _WIKIDATA_LABEL_CACHE[qid]


def parse_wikidata(entity):
    """Extract the properties listed in ``WIKIDATA_PROPS`` from a Wikidata entity.

    Args:
        entity: Entity dict as returned by ``fetch_wikidata_entity``.

    Returns:
        A dict with one entry per key of ``WIKIDATA_PROPS``. ``birth_date`` and
        ``death_date`` are single strings (``''`` when unknown); every other
        key — including ``era`` — is a list of strings (``[]`` when unknown).
        An ``'image'`` key holding Commons file URLs is added only when the
        entity has P18 statements.

    Raises:
        requests.RequestException: if resolving the label of a linked entity
            fails after the retries performed by ``retry_request``.
    """
    result = {}
    claims = entity.get('claims', {})
    for key, pid in WIKIDATA_PROPS.items():
        vals = []
        for claim in _truthy_claims(claims.get(pid, [])):
            dv = claim.get('mainsnak', {}).get('datavalue', {})
            if not dv:
                continue  # "no value" / "unknown value" snaks carry no datavalue
            dtype = dv.get('type')
            value = dv.get('value')
            if dtype == 'string':
                v = value
            elif dtype == 'monolingualtext':
                # e.g. nickname (P1449): {'text': ..., 'language': ...}
                v = value.get('text', '')
            elif dtype == 'time':
                v = _format_wikidata_time(value)
            elif dtype == 'wikibase-entityid':
                v = _wikidata_label(value['id'])
            else:
                continue
            if v:
                vals.append(v)
        # single vs multi
        if key in ('birth_date', 'death_date'):
            result[key] = vals[0] if vals else ''
        else:
            result[key] = vals
    # images (P18)
    images = []
    for claim in _truthy_claims(claims.get('P18', [])):
        pic = claim.get('mainsnak', {}).get('datavalue', {}).get('value', '')
        if pic:
            fn = quote(pic.replace(' ', '_'), safe='')
            images.append(f'https://commons.wikimedia.org/wiki/Special:FilePath/{fn}')
    if images:
        result['image'] = images
    return result

# LLM-driven JSON fill

def _extract_json_object(raw):
    """Return the outermost ``{...}`` span of ``raw`` (or ``raw`` itself if none)."""
    start, end = raw.find('{'), raw.rfind('}')
    if start == -1 or end < start:
        return raw
    return raw[start:end + 1]


def _conform_to_schema(template, candidate):
    """Build a ``template``-shaped copy, taking values from ``candidate`` where they fit."""
    if isinstance(template, dict):
        source = candidate if isinstance(candidate, dict) else {}
        return {key: _conform_to_schema(default, source.get(key))
                for key, default in template.items()}
    if isinstance(template, list):
        if isinstance(candidate, list):
            return candidate
        if candidate is None or candidate == '' or isinstance(candidate, dict):
            return list(template)
        return [candidate]  # a lone scalar where a list was expected
    # scalar field
    if candidate is None or isinstance(candidate, (dict, list)):
        return template
    return candidate


def llm_fill(schema, summary):
    """Ask the LLM to fill ``schema`` from a Persian Wikipedia summary.

    Args:
        schema: Nested dict describing the expected record (empty values).
        summary: Plain-text page summary shown to the model.

    Returns:
        A new dict with exactly the keys and nesting of ``schema``. Fields the
        model left out — or answered with the wrong shape, e.g. ``null`` where
        an object is expected — keep the schema's value, so callers can index
        nested fields safely. If the reply is unusable, the result equals
        ``schema``.

    Raises:
        Whatever ``model.generate_content`` raises once ``retry_request`` gives up.
    """
    prompt = f"""
You are a data extractor. Given the following Wikipedia summary in Farsi,
fill in this JSON schema. Leave fields empty if unknown.

Summary:
{summary}

JSON schema:
{json.dumps(schema, ensure_ascii=False, indent=2)}

Respond with only the completed JSON object.
"""
    def _gen():
        return model.generate_content(prompt)

    resp = retry_request(_gen)
    try:
        raw = resp.text.strip()
    except ValueError as e:
        # Handled here so that a reply whose ``.text`` accessor raises (no usable
        # text part) degrades to the empty schema instead of aborting the record.
        logging.warning(f"LLM returned no usable text ({e}), using empty schema fallback.")
        return _conform_to_schema(schema, None)

    try:
        # Take the outermost {...} so code fences (any tag/case) and stray prose
        # around the object do not break parsing.
        parsed = json.loads(_extract_json_object(raw))
    except json.JSONDecodeError:
        logging.warning("LLM output not valid JSON, using empty schema fallback.")
        parsed = None
    return _conform_to_schema(schema, parsed)

In [5]:
# Main processing loop

def _first_value(value):
    """Collapse a value that may be a list into one item (``''`` when empty)."""
    if isinstance(value, (list, tuple)):
        return value[0] if value else ''
    return value or ''


def _split_infobox_list(raw):
    """Split a comma-separated infobox value (ASCII ',' or Persian '،') into items."""
    if not raw:
        return []
    return [part.strip() for part in raw.replace('،', ',').split(',') if part.strip()]


def _overlay_on_template(template, candidate):
    """Copy ``template``, taking ``candidate`` values only where their shape matches."""
    if isinstance(template, dict):
        source = candidate if isinstance(candidate, dict) else {}
        return {key: _overlay_on_template(default, source.get(key))
                for key, default in template.items()}
    if isinstance(template, list):
        return list(candidate) if isinstance(candidate, list) else list(template)
    if isinstance(candidate, (str, int, float)) and not isinstance(candidate, bool):
        return candidate
    return template


def _fill_missing_fields(person, wd, infobox, meta):
    """Fill fields the LLM left empty, preferring Wikidata over the infobox (in place)."""
    # Image: Wikidata P18, then the summary's original image, then its thumbnail.
    if not person['image']:
        if wd.get('image'):
            person['image'] = wd['image']
        elif meta.get('originalimage', {}).get('source'):
            person['image'] = [meta['originalimage']['source']]
        elif meta.get('thumbnail', {}).get('source'):
            person['image'] = [meta['thumbnail']['source']]

    birth, death = person['birth'], person['death']
    # Birth date & place
    if not birth['date']:
        birth['date'] = wd.get('birth_date', '') or infobox.get('زادروز', '')
    if not birth['location']['city']:
        birth['location']['city'] = _first_value(wd.get('birth_place')) or infobox.get('زادگاه', '')
    # Death date & place
    if not death['date']:
        death['date'] = wd.get('death_date', '') or infobox.get('درگذشت', '')
    if not death['location']['city']:
        death['location']['city'] = _first_value(wd.get('death_place')) or infobox.get('محل درگذشت', '')
    # Tomb location
    if not death['tomb_location']['city']:
        death['tomb_location']['city'] = _first_value(wd.get('resting_place')) or infobox.get('محل دفن', '')

    # List-valued fields
    if not person['occupation']:
        person['occupation'] = wd.get('occupation', []) or _split_infobox_list(infobox.get('حرفه', ''))
    if not person['works']:
        person['works'] = wd.get('notable_works', []) or _split_infobox_list(infobox.get('آثار', ''))
    if not person['nick-names']:
        person['nick-names'] = wd.get('nickname', []) or _split_infobox_list(infobox.get('لقب', ''))

    # Era is a single string in the schema, while Wikidata values arrive as a list.
    if not person['era']:
        person['era'] = _first_value(wd.get('era')) or infobox.get('دوران', '')


def main():
    """Build one JSON record per CSV row and stream them to ``OUTPUT_FILE``.

    For every link in ``CSV_PATH`` the page summary, an LLM extraction, the
    Wikidata entity and the page infobox are combined (LLM → Wikidata →
    infobox). Rows that fail are logged and skipped. The output is always
    closed as a valid JSON array, even when the run is interrupted.
    """
    from urllib.parse import unquote  # local import: only ``quote`` is imported at file level

    df = pd.read_csv(CSV_PATH)
    written = 0
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as fp:
        fp.write('[\n')
        try:
            for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing figures"):
                link = row[URL_COL]
                # Empty CSV cells are read as NaN (a float); skip them instead of
                # crashing the whole run on ``.rsplit``.
                if not isinstance(link, str) or not link.strip():
                    logging.warning(f"Skipping row without a usable '{URL_COL}' value: {link!r}")
                    continue

                # Last path segment exactly as it appears in the URL (URL-safe) ...
                url_title = link.strip().rstrip('/').rsplit('/', 1)[-1]
                # ... and its decoded form for APIs that take the title as a parameter.
                page_title = unquote(url_title)
                try:
                    # Fetch data
                    summary, meta = fetch_wiki_summary(url_title)
                    qid = meta.get('wikibase_item', '')

                    # 1) LLM-driven fill; the overlay guarantees the record keeps the
                    #    schema's nesting whatever shape the model answered with.
                    llm_data = llm_fill(json.loads(json.dumps(SCHEMA_TEMPLATE)), summary)
                    person = _overlay_on_template(SCHEMA_TEMPLATE, llm_data)

                    # 2) Wikidata parsing
                    wd = parse_wikidata(fetch_wikidata_entity(qid)) if qid else {}
                    # 3) Infobox fallback
                    infobox = fetch_infobox(page_title)

                    # Merge: LLM → Wikidata → Infobox
                    _fill_missing_fields(person, wd, infobox, meta)

                    # Serialise first, then write separator + record in one call so a
                    # serialisation error cannot leave a dangling comma in the file.
                    record = json.dumps(person, ensure_ascii=False)
                    fp.write((',\n' if written else '') + record)
                    written += 1

                    # Throttle
                    time.sleep(0.2)

                except Exception as e:
                    logging.error(f"Failed on {page_title}: {e}", exc_info=True)
                    continue
        finally:
            # Close the array even on interruption so partial output stays valid JSON.
            fp.write('\n]')
    logging.info(f"Completed: wrote {written} of {len(df)} entries -> {OUTPUT_FILE}")

# Entry point: start the full crawl only when this code is executed directly.
# In a notebook kernel ``__name__`` is '__main__', so running this cell runs main().
if __name__ == '__main__':
    main()

Processing figures: 100%|██████████| 1048/1048 [2:06:11<00:00,  7.22s/it] 
