# Génération d'un paquet Anki word-examples

## Packages

In [1]:
import math
import hashlib
from pathlib import Path
import pandas as pd
import genanki
import re

In [2]:
# pypinyin is an optional dependency: USE_PYPINYIN records whether it could be
# imported. Only a failed import disables the feature; any other error raised
# while importing is allowed to surface instead of being silently ignored.
try:
    from pypinyin import lazy_pinyin, Style
except ImportError:
    USE_PYPINYIN = False
else:
    USE_PYPINYIN = True

## Global variables

In [3]:
# --- Inputs ---
HSK_FILE = [Path("data/hsk3.csv"), Path("data/hsk5.csv")]        
WORDS_FILE = Path("../AnkiWords/generated_data/words_with_categories.parquet")

# --- Media ---
MEDIA_DIR = Path("../AnkiWords/data/media/data")                             # stroke JSON files (one file per character)
FONT_FILE = Path("../AnkiWords/data/media/FZKai.ttf")
HANZI_JS_FILE = Path("../AnkiWords/data/media/hanzi-writer.min.js")

# --- Output ---
OUTPUT_APKG = "DictHSK.apkg"

# --- Misc ---
CHUNK_SIZE = 100          # number of entries per sub-deck
DECK_ROOT_C2P = "ChineseIsEasy-HSK::Caractere→PinyinSignification"
DECK_ROOT_P2C = "ChineseIsEasy-HSK::PinyinSignification→Caractere"

## Utils

In [4]:
def stable_id_from_key(key: str) -> int:
    """Derive a deterministic integer ID from *key*.

    The ID is the first 5 bytes (10 hex digits, 40 bits) of the SHA-1 digest
    of the UTF-8 encoded key, read as a big-endian unsigned integer, so the
    same key always yields the same ID across runs.
    """
    digest = hashlib.sha1(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:5], "big")

def stable_guid(key: str) -> str:
    """Return the full 40-character SHA-1 hex digest of the UTF-8 encoded *key*."""
    hasher = hashlib.sha1()
    hasher.update(key.encode("utf-8"))
    return hasher.hexdigest()


In [5]:
def read_any_table(path: Path) -> pd.DataFrame:
    """Load a table from *path*, picking the pandas reader from the extension.

    Supported (case-insensitive) extensions:
      * ``.csv``           -- semicolon-separated text
      * ``.tsv`` / ``.tab`` -- tab-separated text
      * ``.parquet``
      * ``.xlsx`` / ``.xls``

    Raises:
        ValueError: if the extension is not one of the above.
    """
    suffix = path.suffix.lower()

    # Extension -> reader dispatch table.
    readers = {
        ".csv": lambda p: pd.read_csv(p, sep=";"),
        ".tsv": lambda p: pd.read_csv(p, sep="\t"),
        ".tab": lambda p: pd.read_csv(p, sep="\t"),
        ".parquet": pd.read_parquet,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
    }

    reader = readers.get(suffix)
    if reader is None:
        raise ValueError(f"Extension non supportée: {suffix} pour {path}")
    return reader(path)

def safe_str(x, default=""):
    """Convert *x* to ``str``, mapping missing values to *default*.

    *default* is returned when ``pd.isna(x)`` is true, when the text form of
    *x* is "nan" (any letter case), or when the check/conversion raises
    (for example when ``pd.isna`` yields an array for a multi-element list).
    """
    try:
        missing = pd.isna(x)
        if missing:
            return default
        text = str(x)
    except Exception:
        # Best-effort helper: any failure is treated as "no value".
        return default
    return default if text.lower() == "nan" else text

def load_hanzi_js(js_path: Path) -> str:
    """Return the full text of the JavaScript file at *js_path*, decoded as UTF-8."""
    with js_path.open("r", encoding="utf-8") as handle:
        return handle.read()


## Chargement des données

In [6]:
# Rich dataset. Expected columns: Word, Traditionnel, Pinyin, Signification,
# Exemples; any of them that is absent is created as an empty-string column.
df_rich = pd.read_parquet(WORDS_FILE).copy()
for col in ("Word", "Traditionnel", "Pinyin", "Signification", "Exemples"):
    if col in df_rich.columns:
        continue
    df_rich[col] = ""

# HSK dataset: every file listed in HSK_FILE, stacked with a fresh 0..n-1 index.
df_hsk_raw = pd.concat(list(map(read_any_table, HSK_FILE)), ignore_index=True)

In [7]:
# Column-name normalisation: maps source column headers to the names used by
# the rest of the pipeline.
rename_map = {
    "Mot": "Word",
    "Exemple": "ExemplesBase",
    "例句": "ExemplesBase",  
    # The entries below map a name to itself, so renaming leaves them unchanged.
    "HSK": "HSK",
    "Chapitre": "Chapitre",
    "Pinyin": "Pinyin",
    "Signification": "Signification",
}

In [8]:
# Apply the normalised column names.
df_hsk = df_hsk_raw.rename(rename_map, axis="columns")

# Minimal column set: create any missing column as empty strings.
for col in ("Word", "Pinyin", "Signification", "ExemplesBase", "HSK", "Chapitre"):
    if col in df_hsk.columns:
        continue
    df_hsk[col] = ""


In [9]:
# Cast helper for the HSK / Chapitre columns.
def to_int_or_str(v):
    """Normalise an HSK level or chapter value.

    Returns:
        * ``int`` for numeric input (``3``, ``3.0``, ``"3"``, ``"3.0"``),
          truncating any fractional part;
        * integer ``0`` for missing input (NaN / None / empty string);
        * the ``safe_str`` text of *v* (``"0"`` if that is empty) when *v*
          cannot be interpreted as a number.

    Missing values yield integer ``0`` rather than the string ``"0"`` so that
    a column of numeric and missing values holds a single type and can be
    sorted and grouped. Both spellings format identically in f-strings.
    """
    try:
        if pd.isna(v) or v == "":
            return 0
        return int(float(v))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, non-scalar input or infinities: keep a string form.
        return safe_str(v, "0")

# Normalise both grouping columns with the same cast.
for col in ("HSK", "Chapitre"):
    df_hsk[col] = df_hsk[col].apply(to_int_or_str)

In [10]:
# If the pinyin is missing in the HSK data and pypinyin is available, compute it.
if USE_PYPINYIN:
    def fill_pinyin_if_missing(row):
        """Return the row's pinyin, generating it from ``Word`` when ``Pinyin`` is empty."""
        if safe_str(row["Pinyin"]) == "":
            chars = safe_str(row["Word"])
            if chars:
                # Syllables are joined with single spaces; any "u:" is rewritten as "ü".
                # NOTE(review): Style.TONE3 is presumably pypinyin's numeric-tone style
                # (e.g. "ni3 hao3") rather than tone marks — confirm against the
                # pypinyin documentation.
                return " ".join(lazy_pinyin(chars, style=Style.TONE3)).replace("u:", "ü")
        # Existing pinyin (or an empty Word) is returned untouched.
        return row["Pinyin"]
    df_hsk["Pinyin"] = df_hsk.apply(fill_pinyin_if_missing, axis=1)


In [11]:
# Enrich the HSK rows with the rich dataset, joining on Word.
# Only the first rich row per Word is kept, so the left join cannot multiply
# HSK rows. Rich columns that overlap HSK ones get a "_rich" suffix.
df_rich_unique = df_rich.drop_duplicates(subset="Word", keep="first")
df_enriched = pd.merge(
    df_hsk,
    df_rich_unique[["Word", "Traditionnel", "Pinyin", "Signification", "Exemples"]].rename(
        columns={name: f"{name}_rich" for name in ("Pinyin", "Signification", "Exemples")}
    ),
    how="left",
    on="Word",
)

In [12]:
# Field preference: HSK value first, otherwise the rich dataset's value.
def choose(base, rich):
    """Return the cleaned *base* value if non-empty, else the cleaned *rich* value."""
    primary, fallback = safe_str(base), safe_str(rich)
    return primary or fallback

# Resolve the final pinyin / meaning columns row by row (HSK value preferred).
for _target, _base_col, _rich_col in (
    ("Pinyin_final", "Pinyin", "Pinyin_rich"),
    ("Signif_final", "Signification", "Signification_rich"),
):
    df_enriched[_target] = df_enriched.apply(
        lambda row, b=_base_col, r=_rich_col: choose(row[b], row[r]),
        axis=1,
    )
# The traditional form only exists in the rich dataset; just clean it.
df_enriched["Traditionnel_final"] = df_enriched["Traditionnel"].apply(safe_str)


In [13]:
# Concatenate examples: base first, then rich, separated by a blank line.
def join_examples(ex1, ex2):
    """Join the non-empty cleaned examples with a blank line; return "" if both are empty."""
    parts = [text for text in (safe_str(ex1), safe_str(ex2)) if text]
    return "\n\n".join(parts)

df_enriched["Exemples_final"] = df_enriched.apply(
    lambda row: join_examples(row["ExemplesBase"], row["Exemples_rich"]),
    axis=1,
)

# Minimal clean-up: normalise Word, then keep only rows that have both a word
# and a meaning.
df_enriched["Word"] = df_enriched["Word"].apply(safe_str)
df_enriched = df_enriched[
    (df_enriched["Word"] != "") & (df_enriched["Signif_final"] != "")
]

## Préparation médias (strokes + font)

In [14]:
# Media list: the font first, then every stroke JSON found in MEDIA_DIR
# (the original note says bundling them makes strokes available without
# on-the-fly generation).
media_files = [str(FONT_FILE)]
if MEDIA_DIR.exists():
    media_files.extend(str(p) for p in MEDIA_DIR.glob("*.json"))

# HanziWriter library source, inlined into the card templates.
hanzi_js = load_hanzi_js(HANZI_JS_FILE)


In [15]:
# Matches one character from the CJK Unified Ideographs block (U+4E00–U+9FFF).
CHINESE_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')

def stroke_json_for_word(word: str) -> str:
    """Build a JSON array of HanziWriter stroke data for *word*.

    One array entry is produced per character of *word* that matches
    ``CHINESE_CHAR_RE``; other characters (letters, digits, punctuation...)
    are skipped. Each entry is the raw content of ``MEDIA_DIR/<char>.json``.
    When that file is missing, unreadable, not valid UTF-8 or not valid JSON,
    the placeholder ``{}`` is used so a single bad file cannot make the whole
    array unparseable.

    Returns:
        The array as a string. If *word* contains no matching character the
        result is ``"[{}]"`` (a single placeholder entry, not an empty list).
    """
    import json  # local import: used only to validate the stroke files

    items = []

    for ch in word:
        # Skip anything that is not a CJK ideograph.
        if not CHINESE_CHAR_RE.match(ch):
            continue

        try:
            # Read directly instead of exists()+read: one filesystem access,
            # no window between the check and the read.
            raw = (MEDIA_DIR / f"{ch}.json").read_text(encoding="utf-8")
            json.loads(raw)  # validation only; the original text is kept
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError covers both
            # UnicodeDecodeError and json.JSONDecodeError.
            raw = "{}"
        items.append(raw)

    # No usable character: keep a single placeholder entry.
    if not items:
        items.append("{}")

    return "[" + ",".join(items) + "]"


# ===================== Models =====================
def build_model_char_to_pinyin():
    """Build the "character -> pinyin/meaning" note model.

    The model has six fields (Word, Traditionnel, Pinyin, Signification,
    Exemples, StrokeJSON) and one card template:

    * question side: inlines the module-level ``hanzi_js`` source, parses the
      StrokeJSON field and creates one 120x120 HanziWriter animation per
      array entry, plus a "Rejouer" control that replays them;
    * answer side: the question side followed by the word (with the
      traditional form in parentheses when that field is non-empty), the
      pinyin, the meaning and the examples.

    The model id is derived from a fixed key via ``stable_id_from_key``, so
    it is the same on every run.
    """
    return genanki.Model(
        model_id=stable_id_from_key("model_hsk_char_to_pinyin"),
        name="HSK - Caractère→Pinyin/Signification",
        fields=[
            {"name": "Word"},
            {"name": "Traditionnel"},
            {"name": "Pinyin"},
            {"name": "Signification"},
            {"name": "Exemples"},
            {"name": "StrokeJSON"},
        ],
        templates=[{
            "name": "Caractère→Pinyin/Signification",
            # Question template: raw HTML + the HanziWriter library + driver script.
            "qfmt": r"""
<div id="writer-container" style="display:flex; justify-content:center; gap:10px;"></div>

<span id="replay-btn" style="color: gray; cursor: pointer; font-size:14px; margin-top:10px; display:inline-block;">
  Rejouer
</span>

<script>
""" + hanzi_js + r"""

var strokes = JSON.parse(`{{StrokeJSON}}`);
var container = document.getElementById("writer-container");
var writers = [];

strokes.forEach(function(data, idx) {
  var div = document.createElement("div");
  div.id = "writer_"+idx;
  div.style.width = "120px";
  div.style.height = "120px";
  container.appendChild(div);

  var writer = HanziWriter.create(div.id, '', {
    width: 120, height: 120, padding: 5,
    strokeAnimationSpeed: 1, delayBetweenStrokes: 300,
    charDataLoader: function(c, onComplete) { onComplete(data); }
  });
  writer.animateCharacter();
  writers.push(writer);
});

document.getElementById("replay-btn").addEventListener("click", function() {
  writers.forEach(function(writer) {
    writer.hideCharacter();
    writer.showCharacter();
    writer.animateCharacter();
  });
});
</script>
""",
            # Answer template: question side, then the textual fields.
            "afmt": r"""
{{FrontSide}}<hr id="answer">

<div style="font-size: 36px; margin-top: 10px;">
  <b>{{Word}} {{#Traditionnel}}({{Traditionnel}}){{/Traditionnel}}</b>
</div>
<div style="font-size: 34px; margin-top: 10px;"><b>{{Pinyin}}</b></div>
<div style="font-size: 30px; margin-top: 10px;">{{Signification}}</div>
<div style="font-size: 26px; margin-top: 15px; color: gray; white-space: pre-line;">{{Exemples}}</div>
"""
        }]
    )

def build_model_pinyin_to_char():
    """Build the "pinyin/meaning -> character" note model.

    The model has six fields (Word, Traditionnel, Pinyin, Signification,
    Exemples, StrokeJSON) and one card template:

    * question side: the pinyin and the meaning only;
    * answer side: the question side, then one 120x120 HanziWriter animation
      per entry of the StrokeJSON array (using the inlined module-level
      ``hanzi_js`` source) with a "Rejouer" replay control, followed by the
      word (with the traditional form in parentheses when that field is
      non-empty) and the examples.

    The model id is derived from a fixed key via ``stable_id_from_key``, so
    it is the same on every run.
    """
    return genanki.Model(
        model_id=stable_id_from_key("model_hsk_pinyin_to_char"),
        name="HSK - Pinyin/Signification→Caractère",
        fields=[
            {"name": "Word"},
            {"name": "Traditionnel"},
            {"name": "Pinyin"},
            {"name": "Signification"},
            {"name": "Exemples"},
            {"name": "StrokeJSON"},
        ],
        templates=[{
            "name": "Pinyin/Signification→Caractère",
            # Question template: text prompt only, no stroke animation.
            "qfmt": r"""
<div style="font-size: 24px;"><b>{{Pinyin}}</b></div>
<div style="font-size: 20px; margin-top: 10px;">{{Signification}}</div>
""",
            # Answer template: prompt, stroke animations, then word and examples.
            "afmt": r"""
{{FrontSide}}<hr id="answer">

<div id="writer-container" style="display:flex; justify-content:center; gap:10px; margin-top:15px;"></div>

<span id="replay-btn" style="color: gray; cursor: pointer; font-size:14px; margin-top:10px; display:inline-block;">
  Rejouer
</span>

<script>
""" + hanzi_js + r"""

var strokes = JSON.parse(`{{StrokeJSON}}`);
var container = document.getElementById("writer-container");
var writers = [];

strokes.forEach(function(data, idx) {
  var div = document.createElement("div");
  div.id = "writer_"+idx;
  div.style.width = "120px";
  div.style.height = "120px";
  container.appendChild(div);

  var writer = HanziWriter.create(div.id, '', {
    width: 120, height: 120, padding: 5,
    strokeAnimationSpeed: 1, delayBetweenStrokes: 300,
    charDataLoader: function(c, onComplete) { onComplete(data); }
  });
  writer.animateCharacter();
  writers.push(writer);
});

document.getElementById("replay-btn").addEventListener("click", function() {
  writers.forEach(function(writer) {
    writer.hideCharacter();
    writer.showCharacter();
    writer.animateCharacter();
  });
});
</script>

<div style="font-size: 36px; margin-top: 10px;">
  <b>{{Word}} {{#Traditionnel}}({{Traditionnel}}){{/Traditionnel}}</b>
</div>
<div style="font-size: 26px; margin-top: 15px; color: gray; white-space: pre-line;">{{Exemples}}</div>
"""
        }]
    )

## Construction des decks

In [16]:
def build_decks_hsk(df: pd.DataFrame,
                    model_c2p: genanki.Model,
                    model_p2c: genanki.Model) -> list[genanki.Deck]:
    """Build the sub-decks, organised by HSK level, chapter and fixed-size slice.

    Rows are sorted by (HSK, Chapitre, Word) and grouped by (HSK, Chapitre).
    Each group is cut into slices of ``CHUNK_SIZE`` rows, and every slice
    produces two decks: one under ``DECK_ROOT_C2P`` filled with ``model_c2p``
    notes and one under ``DECK_ROOT_P2C`` filled with ``model_p2c`` notes.
    Deck ids and note GUIDs are derived from the level, chapter, slice index
    (and word) so that they are identical from one run to the next.

    Args:
        df: frame with columns HSK, Chapitre, Word, Traditionnel_final,
            Pinyin_final, Signif_final and Exemples_final.
        model_c2p: note model for the character -> pinyin/meaning cards.
        model_p2c: note model for the pinyin/meaning -> character cards.

    Returns:
        The decks in creation order, as consecutive (c2p, p2c) pairs.
    """
    # Stable sort: HSK ascending, then chapter, then word.
    ordered = df.sort_values(
        by=["HSK", "Chapitre", "Word"], kind="mergesort"
    ).reset_index(drop=True)

    field_columns = (
        "Word",
        "Traditionnel_final",
        "Pinyin_final",
        "Signif_final",
        "Exemples_final",
    )

    decks = []
    for (hsk, chap), group in ordered.groupby(["HSK", "Chapitre"], dropna=False):
        n_chunks = math.ceil(len(group) / CHUNK_SIZE)

        for i in range(n_chunks):
            start = i * CHUNK_SIZE
            stop = start + CHUNK_SIZE
            # The label always shows the nominal bounds of the slice.
            tranche_name = f"{start:03d}-{stop - 1:03d}"

            deck_c2p = genanki.Deck(
                stable_id_from_key(f"hsk_c2p_{hsk}_{chap}_{i}"),
                f"{DECK_ROOT_C2P}::HSK{hsk}::Chapitre{chap}::{tranche_name}",
            )
            deck_p2c = genanki.Deck(
                stable_id_from_key(f"hsk_p2c_{hsk}_{chap}_{i}"),
                f"{DECK_ROOT_P2C}::HSK{hsk}::Chapitre{chap}::{tranche_name}",
            )

            for _, row in group.iloc[start:stop].iterrows():
                values = [safe_str(row[name]) for name in field_columns]
                word = values[0]
                # Stroke data: a JSON array with one entry per character.
                values.append(stroke_json_for_word(word))

                deck_c2p.add_note(genanki.Note(
                    model=model_c2p,
                    fields=list(values),
                    guid=stable_guid(f"{word}_c2p_hsk{hsk}_c{chap}_chunk{i}"),
                ))
                deck_p2c.add_note(genanki.Note(
                    model=model_p2c,
                    fields=list(values),
                    guid=stable_guid(f"{word}_p2c_hsk{hsk}_c{chap}_chunk{i}"),
                ))

            decks += [deck_c2p, deck_p2c]

    return decks

## Écriture du package

In [17]:
def build_hsk_package():
    """Build both note models and all decks, then write the ``.apkg`` file.

    The package's media list holds the font file (if it exists) followed by
    every ``*.json`` file in ``MEDIA_DIR`` (if that directory exists). The
    result is written to ``OUTPUT_APKG`` and a one-line summary is printed.
    """
    decks = build_decks_hsk(
        df_enriched,
        build_model_char_to_pinyin(),
        build_model_pinyin_to_char(),
    )

    # Font first (when present), then all available stroke JSON files.
    media = [str(FONT_FILE)] if FONT_FILE.exists() else []
    if MEDIA_DIR.exists():
        media.extend(str(json_path) for json_path in MEDIA_DIR.glob("*.json"))

    package = genanki.Package(decks)
    package.media_files = media
    package.write_to_file(OUTPUT_APKG)

    print(f"Paquet généré : {OUTPUT_APKG}  —  {len(decks)} sous-decks, {len(df_enriched)} entrées.")


## Run

In [18]:
# Entry point: build the models and decks, then write the .apkg file.
build_hsk_package()

Paquet généré : DictHSK.apkg  —  10 sous-decks, 234 entrées.
