In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
export_term_pairs.py
--------------------
Flatten the `term_pairs` in dataset.json into a tidy Excel (.xlsx) sheet.

Usage:
    python3 export_term_pairs.py
"""

import json
import pandas as pd
from pathlib import Path

# --------------------------------------------------------------------------- #
# Configuration – change these paths if your files live elsewhere
# --------------------------------------------------------------------------- #
# Input: JSON file containing the sentence records.
DATA_PATH = Path("dataset.json")
# Output: despite the constant's name, this is an Excel workbook (.xlsx).
OUT_CSV = Path("dataset.xlsx")

# --------------------------------------------------------------------------- #
# Helper functions
# --------------------------------------------------------------------------- #
def load_json(path: Path):
    """Parse the UTF-8 encoded JSON file at *path* and return the resulting object."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)

def flatten_term_pairs(records) -> pd.DataFrame:
    """
    Turn each (sentence, term_pair) combination into a single flat row.

    Parameters
    ----------
    records
        Iterable of dict-like sentence records. Sentence-level fields are
        read with ``.get`` (missing ones become ``None``); the optional
        ``"term_pairs"`` entry is expected to be a list of dicts.

    Returns
    -------
    pandas.DataFrame
        One row per term pair. When a pair dict shares a key with the
        sentence-level fields, the pair's value wins. The ``en`` / ``tr``
        columns are renamed to ``english`` / ``turkish``. Only the columns
        listed in ``preferred`` are kept (in that order); any other key is
        dropped. If there are no term pairs at all, an empty frame carrying
        the full ``preferred`` column set is returned.
    """
    # sentence-level fields to carry over onto every pair row
    sentence_fields = (
        "id", "global_id", "sentence_id", "data_source", "category",
        "wikipedia_id", "yok_id", "source_sentence", "target_sentence",
    )
    # columns that together identify one term pair
    dedupe_keys = ["global_id", "sentence_id", "pair_id"]
    # output column order; columns not listed here are not exported
    preferred = [
        "global_id", "sentence_id", "pair_id",
        "english", "turkish", "correction", "link",
        "en_start", "en_end", "tr_start", "tr_end",
        "category", "data_source", "id", "wikipedia_id",
        "yok_id", "source_sentence", "target_sentence",
    ]

    rows = []
    for sent in records:
        base = {field: sent.get(field) for field in sentence_fields}
        # `or []` also covers an explicit `"term_pairs": null` in the JSON,
        # for which `.get("term_pairs", [])` would return None.
        for pair in sent.get("term_pairs") or []:
            # merge the two dicts (Python 3.9+ "|" operator); pair keys win
            rows.append(base | pair)

    if not rows:
        # Nothing to flatten: a column-less frame would make the
        # drop_duplicates(subset=...) call below raise KeyError.
        return pd.DataFrame(columns=preferred)

    df = pd.DataFrame(rows)

    # Remove repeated pairs. Skipped when a key column is absent (e.g. no
    # pair carries a `pair_id`), since duplicates cannot be identified then.
    if all(key in df.columns for key in dedupe_keys):
        df = df.drop_duplicates(subset=dedupe_keys)

    # nicer column names
    df = df.rename(columns={"en": "english", "tr": "turkish"})

    return df[[c for c in preferred if c in df.columns]]

# --------------------------------------------------------------------------- #
# Main script logic
# --------------------------------------------------------------------------- #
def main():
    """
    Load DATA_PATH, flatten its term pairs, and export them to OUT_CSV.

    The table is written with ``DataFrame.to_excel`` (no index column), a
    short preview is printed, and the flattened DataFrame is returned.

    Raises
    ------
    FileNotFoundError
        If DATA_PATH does not exist.
    """
    # Guard clause: fail early with the absolute path in the message.
    if not DATA_PATH.exists():
        raise FileNotFoundError(f"⚠️  {DATA_PATH.resolve()} not found.")

    df = flatten_term_pairs(load_json(DATA_PATH))

    # Despite the constant's name, the export format is Excel, not CSV.
    df.to_excel(OUT_CSV, index=False)

    # Temporarily widen pandas' display settings for a quick sanity check.
    preview_options = (
        "display.max_columns", None,
        "display.width", 120,
        "display.max_rows", 5,
    )
    with pd.option_context(*preview_options):
        print("\nPreview:")
        print(df.head())

    return df

# --------------------------------------------------------------------------- #

In [None]:
# Run the export and keep the returned DataFrame for later inspection.
df_m = main()

In [None]:
# Bare expression: presumably a notebook cell, where it renders the
# DataFrame as the cell's output (it has no effect in a plain script).
df_m