In [1]:
import json
import logging
import os
from tqdm import tqdm

# Part I: Cleaning data
---

In [2]:
# ——— Logging setup ———
# Configure the root logger exactly once. A second basicConfig() call is a
# no-op once the root logger has handlers, so the duplicated setup that used
# to follow was dead code.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# ——— Configuration ———
# Paths to the two JSON files generated earlier
INPUT_FILES = [
    '/kaggle/input/all-aft/all_figures_part1.json',
    '/kaggle/input/all-aft/all_figures_part2.json'
]
# Output file for concatenated and cleaned data
OUTPUT_FILE = 'all_figures_combined_clean.json'

In [3]:
def load_and_concatenate(files):
    """
    Load JSON records from each file in 'files' and concatenate into a single list.

    Args:
        files: Iterable of paths to JSON files; each file is expected to hold
            a top-level JSON array of records.

    Returns:
        A list with the records of every usable file, in input order.
        Files that are missing, are not valid JSON, or whose top-level value
        is not a list are logged and skipped.
    """
    all_records = []
    for filepath in files:
        if not os.path.exists(filepath):
            logging.warning(f"File not found, skipping: {filepath}")
            continue
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                records = json.load(f)
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse JSON in {filepath}: {e}")
            continue
        # list.extend() on a dict would silently append its *keys* as records,
        # so refuse anything that is not a JSON array.
        if not isinstance(records, list):
            logging.error(
                f"Expected a JSON array in {filepath}, "
                f"got {type(records).__name__}; skipping"
            )
            continue
        logging.info(f"Loaded {len(records)} records from {filepath}")
        all_records.extend(records)
    logging.info(f"Total concatenated records: {len(all_records)}")
    return all_records


def clean_data(records):
    """
    Filter records, keeping only those where:
      - 'occupation' is a non-empty list
      - At least one of 'birth.location.province' or 'birth.location.city' is non-empty
      - 'image' is a non-empty list
      - 'birth.date' is non-empty

    Records that are not dicts, or whose 'birth' / 'birth.location' is
    missing or not a dict (e.g. JSON null), are dropped instead of raising.

    Args:
        records: List of person records (dicts parsed from JSON).

    Returns:
        The filtered list; the kept records are the same objects as in the input.
    """
    clean_records = []
    total = len(records)
    for person in tqdm(records, desc="Cleaning records", unit="record"):
        if not isinstance(person, dict):
            continue

        # 1) occupation must be a non-empty list
        occ = person.get('occupation')
        if not isinstance(occ, list) or not occ:
            continue

        # 2) birth place: either province or city must be non-empty.
        #    A null 'birth' or 'location' cannot satisfy this, so skip early
        #    rather than calling .get() on None.
        birth = person.get('birth')
        if not isinstance(birth, dict):
            continue
        loc = birth.get('location')
        if not isinstance(loc, dict):
            continue
        if not (loc.get('province') or loc.get('city')):
            continue

        # 3) image must be a non-empty list
        imgs = person.get('image')
        if not isinstance(imgs, list) or not imgs:
            continue

        # 4) must have a birth date
        if not birth.get('date'):
            continue

        clean_records.append(person)

    logging.info(f"Records kept: {len(clean_records)} / {total}")
    return clean_records


def save_records(records, output_file):
    """
    Write 'records' to 'output_file' as UTF-8, 2-space-indented JSON.

    Non-ASCII characters are written as-is (not \\u-escaped). An info message
    is logged once the file has been written.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, indent=2)
    with open(output_file, 'w', encoding='utf-8') as sink:
        # Stream the encoder's chunks straight to disk.
        for chunk in encoder.iterencode(records):
            sink.write(chunk)
    logging.info(f"Saved cleaned data to '{output_file}'")

In [4]:
# Part I driver: load -> filter -> persist.
# `combined` and `cleaned` are left at module level; the summary cell below
# reads `cleaned`.
if __name__ == '__main__':
    # Step 1: Load and concatenate every file listed in INPUT_FILES
    combined = load_and_concatenate(INPUT_FILES)

    # Step 2: Clean combined records (drops records failing the filters in clean_data)
    cleaned = clean_data(combined)

    # Step 3: Save final output to OUTPUT_FILE as pretty-printed JSON
    save_records(cleaned, OUTPUT_FILE)

Cleaning records: 100%|██████████| 2097/2097 [00:00<00:00, 849474.16record/s]


In [5]:
# Report how many records survived cleaning; `cleaned` is produced by the driver cell above.
print(f'Succesfully clean the data down to {len(cleaned)} figures!')

Succesfully clean the data down to 983 figures!


# Part II: Completion of unfilled values
---

In [1]:
import os
import json
import logging
import time
import re
from tqdm import tqdm
import requests
import google.generativeai as genai
from dateutil import parser as date_parser

# ——— Logging setup ———
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# ——— Configure LLM ———
# Never commit API keys to a notebook: read the key from the environment
# (e.g. load a notebook secret into GOOGLE_API_KEY before running this cell).
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked and rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before configuring the LLM client."
    )
genai.configure(api_key=_api_key)
model = genai.GenerativeModel("gemma-3-27b-it")

# ——— Retry logic for LLM calls ———
def retry_request(fn, *args, max_retries=5, backoff_factor=1.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying on any exception.

    Args:
        fn: Callable to invoke.
        *args, **kwargs: Forwarded to ``fn`` unchanged.
        max_retries: Maximum number of attempts.
        backoff_factor: Seconds to sleep after the first failure; the delay
            doubles after every further failure.

    Returns:
        The first successful return value of ``fn``, or ``None`` when every
        attempt raised (or when ``max_retries`` allows no attempt at all).
    """
    delay = backoff_factor
    last_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            return fn(*args, **kwargs)
        except Exception as err:
            if attempt >= last_attempt:
                # Out of attempts: report and give up.
                logging.error(f"Final attempt failed: {err}")
                return None
            logging.warning(f"Error '{err}', retrying in {delay}s...")
            time.sleep(delay)
            delay *= 2
        attempt += 1
    return None

# ——— Date normalization helpers ———
def normalize_date(ds):
    """Try to parse into YYYY-MM-DD, else return original.

    Only strings that spell out a complete year, month and day are converted.
    The parser fills any field missing from the input from a default date
    (today's date when none is given), which would turn a bare year such as
    "1350" into a fabricated, run-date-dependent full date. To detect that,
    the string is parsed against two different defaults; if the resulting
    dates disagree, some field came from the default and the original string
    is returned untouched.

    Args:
        ds: Candidate date value; non-strings and blank strings are returned as-is.

    Returns:
        The date formatted as 'YYYY-MM-DD' when fully parseable, otherwise ``ds``.
    """
    if not isinstance(ds, str) or not ds.strip():
        return ds
    from datetime import datetime  # local import: only this helper needs it
    try:
        first = date_parser.parse(ds, dayfirst=True, fuzzy=True,
                                  default=datetime(2000, 1, 1))
        second = date_parser.parse(ds, dayfirst=True, fuzzy=True,
                                   default=datetime(2001, 2, 2))
    except Exception:
        return ds
    if first.date() != second.date():
        # Year, month or day was taken from the default -> incomplete date.
        return ds
    return first.strftime('%Y-%m-%d')

def needs_llm_date(ds):
    """Check if date string is not in ISO format YYYY-MM-DD.

    Only ASCII digits count: in str patterns ``\\d`` also matches other
    Unicode decimal digits (e.g. Persian), which would make a non-converted
    date look finished. The whole string must match, so a trailing newline
    is rejected as well.

    Args:
        ds: Value to check.

    Returns:
        ``False`` when ``ds`` is exactly 'YYYY-MM-DD' in ASCII digits,
        ``True`` otherwise (including when ``ds`` is not a string).
    """
    if not isinstance(ds, str):
        return True
    return re.fullmatch(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', ds) is None

# ——— LLM date conversion ———
def generate_date_prompt(date_str):
    """Build the LLM prompt asking for `date_str` as a Gregorian YYYY-MM-DD date.

    Returns the prompt text only: the instruction line followed by the date
    wrapped in <START OF DATA>/<END OF DATA> tags.
    """
    instruction = (
        "Convert the following date (which may be in Persian solar calendar or any format) "
        "to Gregorian date in YYYY-MM-DD format. Reply only with the date between tags."
    )
    payload = "<START OF DATA>{}<END OF DATA>".format(date_str)
    return "\n".join((instruction, payload))

# ——— LLM helper ———
def ask_llm(prompt, is_json=True):
    """Send `prompt` to the module-level `model` and return the cleaned reply.

    The call goes through `retry_request`. From the reply text, the content
    between <START OF DATA> and <END OF DATA> is extracted when both tags are
    present (otherwise the whole reply is used), and a surrounding Markdown
    code fence is removed.

    Args:
        prompt: Prompt text to send.
        is_json: When true, parse the cleaned content as JSON.

    Returns:
        The parsed JSON value (``is_json=True``) or the cleaned string
        (``is_json=False``). ``None`` when the request failed, the response
        carries no usable text, or JSON parsing failed.
    """
    def _gen(): return model.generate_content(prompt)
    resp = retry_request(_gen)
    if not resp:
        return None
    try:
        # Accessing `.text` can raise ValueError (not just AttributeError) when
        # the response has no text part, e.g. a blocked or empty candidate.
        # hasattr() would not catch that and one bad reply would abort the run.
        raw = resp.text.strip()
    except (AttributeError, ValueError) as e:
        logging.warning(f"LLM response has no usable text: {e}")
        return None

    start_tag, end_tag = '<START OF DATA>', '<END OF DATA>'
    start = raw.find(start_tag)
    # Look for the closing tag only after the opening one, so a stray end tag
    # earlier in the reply does not defeat extraction.
    end = raw.find(end_tag, start + len(start_tag)) if start != -1 else -1
    if start != -1 and end != -1:
        content = raw[start + len(start_tag):end].strip()
    else:
        content = raw

    # Strip a leading ``` / ```json fence and a trailing ``` fence.
    content = re.sub(r'^```(?:json)?\s*', '', content)
    content = re.sub(r'\s*```$', '', content).strip()

    if not is_json:
        return content
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        logging.warning("JSON parse error, returning None.")
        return None

# ——— English detection ———
def is_english_sentence(s):
    """Return True when `s` is a string made only of ASCII characters.

    This is an ASCII check used as a proxy for "English": the empty string
    counts as True, and any non-string value yields False.
    """
    return isinstance(s, str) and s.isascii()

# ——— Prompt generators ———
def generate_sex_prompt(datum):
    """Build the prompt asking the LLM to classify `datum['name']` as MALE or FEMALE.

    A missing 'name' is sent as an empty string.

    Returns:
        ``(prompt, False)`` - the flag tells the caller the reply is plain text, not JSON.
    """
    name = datum.get('name', '')
    instruction = (
        "Predict the sex (MALE or FEMALE) for the following name. "
        "Reply only with MALE or FEMALE between tags."
    )
    payload = "<START OF DATA>{}<END OF DATA>".format(name)
    return "\n".join((instruction, payload)), False

def generate_translation_prompt(datum, field):
    """Build the prompt asking the LLM to translate `datum[field]` into Persian.

    The value is embedded as JSON (non-ASCII kept unescaped) between the data tags.

    Returns:
        ``(prompt, True)`` - the flag tells the caller to parse the reply as JSON.
    """
    instruction = (
        "Translate the following JSON value into Persian, preserving the JSON structure exactly. "
        "Reply only with valid JSON between tags."
    )
    encoded = json.dumps(datum[field], ensure_ascii=False)
    payload = "<START OF DATA>{}<END OF DATA>".format(encoded)
    return "\n".join((instruction, payload)), True

def generate_geo_prompt(datum, field):
    """Build the prompt asking the LLM to complete `datum[field]['location']`.

    The location object is embedded as JSON (non-ASCII kept unescaped) between
    the data tags; the model is asked for 'province' and 'coordinates'.

    Returns:
        ``(prompt, True)`` - the flag tells the caller to parse the reply as JSON.
    """
    location = datum[field]['location']
    encoded = json.dumps(location, ensure_ascii=False)
    instruction = (
        "Given this JSON location with 'city', fill in missing 'province', 'latitude', and 'longitude'. "
        "Output a JSON object with keys 'province' and 'coordinates' (with 'latitude' and 'longitude') between tags."
    )
    payload = "<START OF DATA>{}<END OF DATA>".format(encoded)
    return "\n".join((instruction, payload)), True

def generate_text_translation_prompt(text):
    """Build the prompt asking the LLM to translate plain `text` into Persian.

    Returns:
        ``(prompt, False)`` - the flag tells the caller the reply is plain text, not JSON.
    """
    instruction = (
        "Translate the following text into Persian. Reply only with the translated text between tags."
    )
    payload = "<START OF DATA>{}<END OF DATA>".format(text)
    return "\n".join((instruction, payload)), False

# ——— Main augmentation & date normalization ———
def _fill_sex(person):
    """Ask the LLM for the person's sex when it is missing; stored lowercase."""
    if person.get('sex'):
        return
    prompt, is_json = generate_sex_prompt(person)
    result = ask_llm(prompt, is_json)
    if isinstance(result, str):
        person['sex'] = result.strip().lower()


def _translate_fields(person, fields=('era', 'occupation', 'works', 'events')):
    """Translate ASCII-only string / list-of-string fields into Persian in place."""
    for field in fields:
        val = person.get(field)
        # Empty strings and empty lists are trivially "all ASCII"; skip them
        # instead of spending an LLM call on nothing.
        if isinstance(val, str):
            wanted = bool(val.strip()) and is_english_sentence(val)
        elif isinstance(val, list):
            wanted = bool(val) and all(is_english_sentence(x) for x in val)
        else:
            wanted = False
        if not wanted:
            continue
        prompt, is_json = generate_translation_prompt(person, field)
        translated = ask_llm(prompt, is_json)
        # Only accept a reply of the same JSON type (str stays str, list stays
        # list) so a malformed answer cannot change the record's schema.
        if translated is not None and isinstance(translated, type(val)):
            person[field] = translated


def _fill_geo(person, field, location_key):
    """Fill province/coordinates of person[field][location_key] from its city via the LLM."""
    section = person.get(field)
    if not isinstance(section, dict):
        return
    loc = section.get(location_key)
    if not isinstance(loc, dict) or not loc.get('city'):
        return
    coords = loc.get('coordinates')
    if not isinstance(coords, dict):
        coords = {}
    if loc.get('province') and coords.get('latitude') and coords.get('longitude'):
        return  # nothing missing

    # generate_geo_prompt() always reads datum[field]['location'], so hand it a
    # thin wrapper that exposes the location currently being processed.
    prompt, is_json = generate_geo_prompt({field: {'location': loc}}, field)
    filled = ask_llm(prompt, is_json)
    if not isinstance(filled, dict):
        return

    province = filled.get('province')
    if isinstance(province, str) and province:
        clean_prov = province.strip().strip('<>').strip()
        # translate province if in English
        if clean_prov and is_english_sentence(clean_prov):
            t_prompt, t_json = generate_text_translation_prompt(clean_prov)
            tprov = ask_llm(t_prompt, t_json)
            if isinstance(tprov, str):
                clean_prov = tprov.strip().strip('<>').strip()
        if clean_prov:
            loc['province'] = clean_prov

    cf = filled.get('coordinates', {})
    if isinstance(cf, dict):
        # The record may have no 'coordinates' object yet; create it rather
        # than raising KeyError on assignment.
        target = loc.get('coordinates')
        if not isinstance(target, dict):
            target = loc['coordinates'] = {}
        for k, v in cf.items():
            if v:
                target[k] = v


def _normalize_date_field(person, date_field):
    """Rewrite person[date_field]['date'] as YYYY-MM-DD, falling back to the LLM."""
    section = person.get(date_field)
    if not isinstance(section, dict):
        return
    ds = section.get('date', '')
    if not (isinstance(ds, str) and ds.strip()):
        return
    iso = normalize_date(ds)
    if not needs_llm_date(iso):
        section['date'] = iso
        return
    conv = ask_llm(generate_date_prompt(ds), is_json=False)
    if isinstance(conv, str):
        conv_clean = conv.strip().strip('<>').strip()
        if re.fullmatch(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', conv_clean):
            section['date'] = conv_clean
            return
    # LLM gave nothing usable: keep the best local result.
    section['date'] = iso if iso else ds


# Part II driver: read the records, augment each one in place, write the result.
if __name__ == '__main__':
    INPUT_FILE = '/kaggle/input/dataaaa/part1.json'
    if not os.path.exists(INPUT_FILE):
        logging.error(f"File not found: {INPUT_FILE}")
        # Same exit status as before, without relying on the site-provided exit().
        raise SystemExit(1)

    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        records = json.load(f)

    subset = records
    logging.info(f"Processing {len(subset)} records for augmentation and date normalization")

    for person in tqdm(subset, desc="Augmenting records", unit="person"):
        if not isinstance(person, dict):
            continue

        # 1) Fill sex as lowercase
        _fill_sex(person)

        # 2) Translate fields
        _translate_fields(person)

        # 3) Geo for birth and death. The inner loop variable is now actually
        #    used, so 'tomb_location' is processed instead of handling
        #    'location' twice.
        #    NOTE(review): assumes 'tomb_location' has the same
        #    city/province/coordinates shape as 'location' - confirm.
        for field in ['birth', 'death']:
            for location in ['location', 'tomb_location']:
                _fill_geo(person, field, location)

        # 4) Normalize & LLM-convert dates
        for date_field in ['birth', 'death']:
            _normalize_date_field(person, date_field)

    # Save augmented JSON for all persons
    OUTPUT_FILE = '/kaggle/working/all_figures_augmented_part1.json'
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(subset, f, ensure_ascii=False, indent=2)
    logging.info(f"Augmented data saved to {OUTPUT_FILE} ({len(subset)} records)")

Augmenting records: 100%|██████████| 492/492 [59:10<00:00,  7.22s/person]  
