# ChineseIsEasy - Generation of Categories, Examples and Explanations for Anki Cards

## Packages

In [None]:
import pandas as pd
from pathlib import Path
from openai import OpenAI
from pycccedict.cccedict import CcCedict
import json

## General Configuration

In [None]:
# Upper bound on the number of new words handled per run. Original rationale:
# with batches, the number of tokens per request is quite low, so the number
# of words per batch has to be limited.
MAX_WORDS = 2500

# Inputs (paths are relative to the working directory).
path_api_key = Path("secrets/api_key.txt")
path_excel = Path("data/SUBTLEX-CH-WF.xlsx")
path_prompt_cat = Path("prompts/prompt_categorie.txt")
path_prompt_ex = Path("prompts/prompt_exemples.txt")
path_prompt_exp = Path("prompts/prompt_explications.txt")

# Outputs: the accumulated parquet dataset and the JSONL request file.
path_return = Path("generated_data/words_with_categories.parquet")
batch_input_file = Path("batch_input.jsonl")

# Shared service objects: the OpenAI client (key read from disk, surrounding
# whitespace removed) and the CC-CEDICT dictionary wrapper.
api_key = path_api_key.read_text().strip()
client = OpenAI(api_key=api_key)
cccedict = CcCedict()

## Utilities

In [None]:
# Lazily-built lookup table: written form -> CC-CEDICT entries, kept in the
# order returned by `cccedict.get_entries()`.
_INDEX_CCCEDICT = {}


def _index_cccedict() -> dict:
    """Return the form -> entries index, building it on first use."""
    if not _INDEX_CCCEDICT:
        for entree in cccedict.get_entries():
            # An entry is reachable through either written form; the set avoids
            # registering it twice when both forms are identical.
            for forme in {entree["simplified"], entree["traditional"]}:
                _INDEX_CCCEDICT.setdefault(forme, []).append(entree)
    return _INDEX_CCCEDICT


def analyse_mot(mot: str) -> dict:
    """Return a dict describing a Chinese word, looked up in CC-CEDICT.

    Args:
        mot: The word to look up; matched against both the simplified and the
            traditional form of every dictionary entry.

    Returns:
        A dict with the keys ``"simplifie"``, ``"traditionnel"``, ``"pinyin"``
        (list of distinct readings) and ``"sens"`` (list of distinct
        definitions). Both lists keep first-seen order, so the result is
        reproducible from one run to the next. When no entry matches,
        ``"simplifie"`` is ``mot`` itself, ``"traditionnel"`` is empty,
        ``"pinyin"`` is ``[]`` and ``"sens"`` is ``["[introuvable]"]``.
    """
    # Dictionary lookup instead of rescanning every entry for every word.
    resultats = _index_cccedict().get(mot, [])
    if not resultats:
        return {"simplifie": mot, "traditionnel": "", "pinyin": [], "sens": ["[introuvable]"]}

    # dict.fromkeys deduplicates while preserving order (a set would not).
    pinyins = list(dict.fromkeys(e["pinyin"] for e in resultats))
    sens = list(dict.fromkeys(d for e in resultats for d in e["definitions"]))
    return {
        "simplifie": resultats[0]["simplified"],
        "traditionnel": resultats[0]["traditional"],
        "pinyin": pinyins,
        "sens": sens,
    }

def numero_vers_accent(pinyin_num: str) -> str:
    """Convert numbered pinyin (``"ni3 hao3"``) to accented pinyin (``"nǐ hǎo"``).

    Syllables are separated by whitespace and processed independently:

    * a syllable without a trailing digit is returned unchanged;
    * the ``u:`` notation for ``ü`` is normalised (``"nu:3"`` -> ``"nǚ"``);
    * tones outside 1-4 (e.g. the neutral tone ``5``) only lose their digit;
    * the tone mark follows the standard placement rule: on ``a`` or ``e`` if
      present, on the ``o`` of ``ou``, otherwise on the last vowel
      (``"liu4"`` -> ``"liù"``, ``"dui4"`` -> ``"duì"``);
    * the case of the marked vowel is preserved (``"An1"`` -> ``"Ān"``).

    Args:
        pinyin_num: Whitespace-separated pinyin syllables with tone digits.

    Returns:
        The converted syllables joined by single spaces.
    """
    accents = {
        'a': ['ā', 'á', 'ǎ', 'à'],
        'e': ['ē', 'é', 'ě', 'è'],
        'i': ['ī', 'í', 'ǐ', 'ì'],
        'o': ['ō', 'ó', 'ǒ', 'ò'],
        'u': ['ū', 'ú', 'ǔ', 'ù'],
        'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ']
    }

    def convertir_syllabe(s):
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if not s or not s[-1].isdecimal():
            return s
        ton = int(s[-1])
        base = s[:-1].replace("u:", "ü").replace("U:", "Ü")
        if ton < 1 or ton > 4:
            return base

        # Positions of the vowels, compared case-insensitively.
        voyelles = [(i, c.lower()) for i, c in enumerate(base) if c.lower() in accents]
        if not voyelles:
            return base

        # Rule 1: 'a' or 'e' always carries the mark.
        pos = next((i for i, v in voyelles if v in ("a", "e")), None)
        # Rule 2: in "ou", the 'o' carries the mark.
        if pos is None:
            pos = next(
                (i for i, v in voyelles if v == "o" and base[i + 1:i + 2].lower() == "u"),
                None,
            )
        # Rule 3: otherwise the last vowel carries the mark.
        if pos is None:
            pos = voyelles[-1][0]

        marque = accents[base[pos].lower()][ton - 1]
        if base[pos].isupper():
            marque = marque.upper()
        return base[:pos] + marque + base[pos + 1:]

    return " ".join(convertir_syllabe(s) for s in pinyin_num.split())

## Dataset Loading

In [None]:
# Load the frequency list and, if present, the words already processed.
xls = pd.read_excel(path_excel)
df_existing = pd.read_parquet(path_return) if path_return.exists() else pd.DataFrame(columns=["Word"])

# Keep only words not processed yet, capped at MAX_WORDS. The explicit copy
# makes `xls` an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
xls = xls[~xls['Word'].isin(df_existing['Word'])].head(MAX_WORDS).copy()

# Add the linguistic information (traditional form, pinyin, meanings)
xls['infos'] = xls['Word'].apply(analyse_mot)
xls['Traditionnel'] = xls['infos'].apply(lambda x: x["traditionnel"])
xls['Pinyin'] = xls['infos'].apply(lambda x: "; ".join(numero_vers_accent(p) for p in x["pinyin"]))
xls['Signification'] = xls['infos'].apply(lambda x: "; ".join(x["sens"]))

## Prompts Loading

In [None]:
# Read the three prompt templates. The encoding is given explicitly so that
# any non-ASCII text in the prompts is decoded identically on every platform,
# matching the explicit UTF-8 used for the JSONL files in this notebook.
instructions_cat = path_prompt_cat.read_text(encoding="utf-8").strip()
instructions_ex = path_prompt_ex.read_text(encoding="utf-8").strip()
instructions_exp = path_prompt_exp.read_text(encoding="utf-8").strip()

## Generation of the batch

In [None]:
# Write one JSONL request per (word, prompt type) pair. Each request is
# identified by "<word>_<type>" so that answers can be matched back later.
with batch_input_file.open("w", encoding="utf-8") as sortie:
    for mot in xls["Word"]:
        # Only the explanation prompt embeds the word in its template; all
        # three prompts also get the word appended to the user message.
        consignes = (
            ("cat", instructions_cat),
            ("ex", instructions_ex),
            ("exp", instructions_exp.replace("AREMPLACER", mot)),
        )
        for suffixe, consigne in consignes:
            requete = {
                "custom_id": f"{mot}_{suffixe}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "messages": [
                        {"role": "system", "content": "Tu es un assistant spécialisé en chinois."},
                        {"role": "user", "content": f"{consigne}\n{mot}\n\n"},
                    ],
                    "temperature": 0.7,
                },
            }
            ligne = json.dumps(requete, ensure_ascii=False)
            sortie.write(ligne + "\n")

print(f"Fichier batch généré : {batch_input_file}")

In [None]:
# Upload the request file. The context manager closes the handle once the
# upload call returns, and the path comes from the configuration cell rather
# than a second hard-coded copy of the file name.
with batch_input_file.open("rb") as fichier_batch:
    file_obj = client.files.create(
        file=fichier_batch,
        purpose="batch"
    )
print("File ID:", file_obj.id)

# Create the batch job that will process the uploaded requests.
batch = client.batches.create(
    input_file_id=file_obj.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)
print("Batch ID:", batch.id)
print("Statut initial :", batch.status)

## To run later (once the batch is processed by OpenAI)

In [None]:
# Check the batch status and download the results once it is completed.
batch_status = client.batches.retrieve(batch.id)
if batch_status.status != "completed":
    # Say so explicitly instead of silently doing nothing.
    print(f"Batch pas encore terminé (statut : {batch_status.status})")
elif batch_status.output_file_id is None:
    # NOTE(review): presumably happens when no request succeeded — confirm
    # against the Batch API documentation.
    print("Batch terminé sans fichier de sortie")
else:
    output = client.files.content(batch_status.output_file_id).text
    with open("batch_output.jsonl", "w", encoding="utf-8") as f:
        f.write(output)
    print("Résultats enregistrés dans batch_output.jsonl")

## Results Processing

In [None]:
# Map each request's custom_id to the text of the model's answer
# ("" when the answer cannot be extracted from the response).
results = {}
chemin_sortie = Path("batch_output.jsonl")
if chemin_sortie.exists():
    with chemin_sortie.open(encoding="utf-8") as f:
        for line in f:
            # Skip blank lines, which json.loads would reject.
            if not line.strip():
                continue
            obj = json.loads(line)
            cid = obj["custom_id"]
            try:
                content = obj["response"]["body"]["choices"][0]["message"]["content"]
                results[cid] = content.strip()
            # Missing key / empty choices / null response or null content.
            except (KeyError, IndexError, TypeError, AttributeError) as e:
                print(f"⚠️ Erreur pour {cid}: {e}")
                results[cid] = ""
else:
    print(f"⚠️ Fichier {chemin_sortie} introuvable : aucun résultat chargé")

In [None]:
# Fill one column per prompt type with the GPT answers; a word without an
# answer for a given type gets an empty string.
for colonne, suffixe in (("Catégorie", "cat"), ("Exemples", "ex"), ("Explication", "exp")):
    xls[colonne] = xls["Word"].map(lambda w, s=suffixe: results.get(f"{w}_{s}", ""))

In [None]:
# Merge with the existing data.
# Guard: if there are new words but no batch result was loaded, merging would
# persist rows with empty generated fields, and the dataset-loading step would
# then skip those words in every later run.
if not xls.empty and not results:
    raise RuntimeError(
        "Aucun résultat de batch chargé : fusion annulée pour ne pas "
        "enregistrer des mots sans contenu généré."
    )
df_final = pd.concat([df_existing, xls], ignore_index=True)

In [None]:
# Save. Create the output directory first so the first run does not fail
# when it is missing.
path_return.parent.mkdir(parents=True, exist_ok=True)
df_final.to_parquet(path_return, index=False)
print("Résultats fusionnés dans", path_return)