# Génération d'un paquet pour les mots du HSK

## Packages

In [120]:
import math
import hashlib
from pathlib import Path
import pandas as pd
import genanki

try:
    from pypinyin import lazy_pinyin, Style
    USE_PYPINYIN = True
except Exception:
    USE_PYPINYIN = False

## Global variables

In [121]:
# HSK vocabulary lists; every file is loaded with read_any_table and the
# results are concatenated into one table.
HSK_FILE = [Path("data/hsk3.csv"), Path("data/hsk5.csv")]
# Parquet word table loaded with pd.read_parquet and left-joined onto the HSK
# rows on the "Word" column.
WORDS_FILE = Path("../AnkiWords/generated_data/words_with_categories.parquet")

# Directory globbed for "*.json": the file names (minus ".json") are the
# characters kept by to_valid_char, and every file is added to the package media.
MEDIA_STROKES_DIR = Path("../AnkiWords/data/media/data")
# Added to the package media only when the file exists.
FONT_FILE = Path("../AnkiWords/data/media/FZKai.ttf")
# Both scripts are always added to the package media; the card template loads
# them by file name ("_hanzi-writer.min.js" and "_shared_hanzi.js").
HANZI_JS_FILE = Path("../AnkiWords/data/media/_hanzi-writer.min.js")
SHARED_JS_PATH = Path("../MinimalExample/js/_shared_hanzi.js")

# File name passed to genanki's Package.write_to_file.
OUTPUT_APKG = "ChineseIsEasy-HSK.apkg"

# Maximum number of rows per sub-deck inside one (HSK, Chapitre) group.
CHUNK_SIZE = 100
# Parent deck names for the two card directions; "::" separates the parts
# that build_decks_hsk appends (HSK level, chapter, chunk range).
DECK_ROOT_C2P = "ChineseIsEasy-HSK::Caractere→PinyinSignification"
DECK_ROOT_P2C = "ChineseIsEasy-HSK::PinyinSignification→Caractere"

## Utils

In [122]:
def stable_id_from_key(key: str) -> int:
    """Derive a deterministic 40-bit integer identifier from a text key.

    The key is hashed with SHA-1 and the first five digest bytes (the first
    ten hexadecimal digits) are read as a big-endian integer, so the same key
    always maps to the same identifier.
    """
    digest = hashlib.sha1(key.encode()).digest()
    return int.from_bytes(digest[:5], "big")

def stable_guid(key: str) -> str:
    """Return the 40-character SHA-1 hex digest of `key`, used as a note GUID."""
    hasher = hashlib.sha1()
    hasher.update(key.encode())
    return hasher.hexdigest()

def safe_str(x, default=""):
    """Convert a cell value to text, mapping missing values to `default`.

    Args:
        x: any value, typically a pandas cell.
        default: text returned when `x` is missing or cannot be converted.

    Returns:
        `default` when `pd.isna(x)` is true, when the text form of `x` is
        "nan" (case-insensitive), or when the conversion raises an ordinary
        exception (for example `pd.isna` on a list yields an array whose
        truth value is ambiguous); otherwise `str(x)`.
    """
    try:
        if pd.isna(x):
            return default
        text = str(x)
    except Exception:
        # Best-effort conversion: ordinary failures fall back to the default,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return default
    return default if text.lower() == "nan" else text

def read_any_table(path: Path) -> pd.DataFrame:
    """Load a table, picking the pandas reader from the file extension.

    Supported (case-insensitive) extensions: ".csv" (semicolon-separated),
    ".tsv" / ".tab" (tab-separated), ".parquet", ".xlsx" / ".xls".

    Raises:
        ValueError: if the extension is not one of the above.
    """
    suffix = path.suffix.lower()
    loaders = {
        ".csv": lambda p: pd.read_csv(p, sep=";"),
        ".tsv": lambda p: pd.read_csv(p, sep="\t"),
        ".tab": lambda p: pd.read_csv(p, sep="\t"),
        ".parquet": pd.read_parquet,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
    }
    loader = loaders.get(suffix)
    if loader is None:
        raise ValueError(f"Unsupported extension: {suffix}")
    return loader(path)

## Chargement des données

In [123]:
# Rich word table, later left-joined onto the HSK rows on "Word".
df_rich = pd.read_parquet(WORDS_FILE)

# Guarantee that every column selected from the rich table exists.
# "Explication" is part of the list because the merge selects it and the deck
# builder reads it for every row; without a default, a parquet file lacking
# that column would fail with a KeyError.
for col in ["Word", "Traditionnel", "Pinyin", "Signification", "Exemples", "Explication"]:
    if col not in df_rich.columns:
        df_rich[col] = ""

# Load HSK datasets
df_hsk_raw = pd.concat([read_any_table(f) for f in HSK_FILE], ignore_index=True)
# Map the source headers onto the internal column names ("HSK" and
# "Chapitre" already carry their final names, so they need no entry).
df_hsk = df_hsk_raw.rename(columns={
    "Mot": "Word",
    "Exemple": "ExemplesBase",
    "例句": "ExemplesBase",
})

# Same guarantee for the HSK side: absent columns are created empty.
for col in ["Word", "Pinyin", "Signification", "ExemplesBase", "HSK", "Chapitre"]:
    if col not in df_hsk.columns:
        df_hsk[col] = ""

In [124]:
def to_int_or_str(v):
    """Normalise an HSK-level / chapter cell to an int whenever possible.

    Args:
        v: raw cell value (number, numeric text, empty text or missing).

    Returns:
        int: the parsed value for numeric input such as "3" or 3.0, and 0 for
            missing input (NaN, None, "" or text that safe_str treats as
            missing).
        str: the text of `v` when it cannot be parsed as a number.
    """
    try:
        if pd.isna(v) or v == "":
            # Missing values use the integer 0 rather than the string "0" so
            # the column stays homogeneous with the parsed integers when it is
            # used as a sort / group key; both render as "0" in deck names.
            return 0
        return int(float(v))
    except Exception:
        # Not numeric: keep the text, still mapping "missing" to integer 0.
        text = safe_str(v, "")
        return text if text else 0

# Normalise the level and chapter columns cell by cell with to_int_or_str.
for level_col in ("HSK", "Chapitre"):
    df_hsk[level_col] = df_hsk[level_col].apply(to_int_or_str)

In [125]:
if USE_PYPINYIN:
    def fill_pinyin_if_missing(row):
        """Return the row's pinyin, generating it from "Word" when it is empty.

        The existing value is returned unchanged when it is non-empty or when
        the row has no word to transliterate.
        """
        if safe_str(row["Pinyin"]) != "":
            return row["Pinyin"]
        hanzi = safe_str(row["Word"])
        if not hanzi:
            return row["Pinyin"]
        syllables = lazy_pinyin(hanzi, style=Style.TONE3)
        return " ".join(syllables).replace("u:", "ü")

    df_hsk["Pinyin"] = df_hsk.apply(fill_pinyin_if_missing, axis=1)

# One rich entry per word (the first occurrence wins).
df_rich_unique = df_rich.drop_duplicates(subset=["Word"], keep="first")

# Columns taken from the rich table; pinyin, meaning and examples receive a
# "_rich" suffix so they stay distinct from the HSK-side columns.
rich_subset = df_rich_unique[
    ["Word", "Traditionnel", "Pinyin", "Signification", "Exemples", "Explication"]
].rename(
    columns={
        "Pinyin": "Pinyin_rich",
        "Signification": "Signification_rich",
        "Exemples": "Exemples_rich",
    }
)

# Left join: every HSK row is kept, enriched where the word is known.
df_enriched = df_hsk.merge(rich_subset, on="Word", how="left")

def choose(base, rich):
    """Return the text of `base` unless it is empty, else the text of `rich`.

    Both values go through safe_str, so missing values count as empty.
    """
    primary = safe_str(base)
    return primary or safe_str(rich)

# Final pinyin / meaning: the HSK value when present, else the rich value.
for final_col, (base_col, rich_col) in {
    "Pinyin_final": ("Pinyin", "Pinyin_rich"),
    "Signif_final": ("Signification", "Signification_rich"),
}.items():
    df_enriched[final_col] = df_enriched.apply(
        lambda r, b=base_col, x=rich_col: choose(r[b], r[x]),
        axis=1,
    )

# The traditional form only exists on the rich side; just clean it to text.
df_enriched["Traditionnel_final"] = df_enriched["Traditionnel"].apply(safe_str)

In [126]:
def join_examples(a, b):
    """Concatenate two example texts, separated by a blank line.

    Each value is cleaned with safe_str first; empty parts are dropped, so
    the result is "" when both are empty and the single part when only one
    is present.
    """
    parts = [text for text in (safe_str(a), safe_str(b)) if text]
    return "\n\n".join(parts)

# HSK examples first, then the rich-table examples.
df_enriched["Exemples_final"] = df_enriched.apply(
    lambda row: join_examples(row["ExemplesBase"], row["Exemples_rich"]), axis=1
)

# Clean the word column, then keep only rows that have both a word and a
# meaning.
df_enriched["Word"] = df_enriched["Word"].apply(safe_str)
has_word = df_enriched["Word"] != ""
has_meaning = df_enriched["Signif_final"] != ""
df_enriched = df_enriched[has_word & has_meaning]

## Préparation médias (strokes + font)

In [127]:
# Card-template fragment for the stroke animation. It is used as the question
# side of the Char→Pinyin model and appended to the answer side of the
# Pinyin→Char model. The fragment contains an empty "writer-container", a
# replay button, a hidden element holding the {{Characters}} field, the two
# script includes, and an inline call to initHanziWriter with the field value.
# NOTE(review): initHanziWriter is not defined in this file — presumably it
# comes from _shared_hanzi.js; confirm.
WRITER_HTML = r"""
<div id="writer-wrapper" style="display:flex; flex-direction:column; align-items:center; gap:12px;">

  <div id="writer-container"
       style="display:flex; gap:20px; justify-content:center; flex-wrap:nowrap;">
  </div>

  <button id="replay-btn"
          style="
            background:#444cf7;
            color:white;
            border:none;
            padding:8px 22px;
            font-size:16px;
            border-radius:20px;
            cursor:pointer;
            transition:0.2s;
          ">Rejouer</button>

  <div id="hanzi-data" style="display:none;">{{Characters}}</div>

</div>

<script src="_hanzi-writer.min.js"></script>
<script src="_shared_hanzi.js"></script>

<script>
console.log("[Template] Calling initHanziWriter('{{Characters}}')");
initHanziWriter("{{Characters}}");
</script>
"""

## Construction des decks

In [128]:
def build_model_char_to_pinyin():
    """Create the note model whose card asks for pinyin/meaning from characters.

    The question side is the WRITER_HTML fragment; the answer side repeats
    the front and adds characters, pinyin, meaning and examples. The model id
    is derived from a fixed key so it is identical on every run.
    """
    field_names = ("Characters", "Traditionnel", "Pinyin", "Signification", "Exemples")
    answer_side = r"""
{{FrontSide}}<hr>
<div style="font-size:32px;"><b>{{Characters}}</b></div>
<div style="font-size:30px; margin-top:10px;">{{Pinyin}}</div>
<div style="font-size:28px; margin-top:10px;">{{Signification}}</div>
<div style="white-space:pre-line; color:gray; font-size:24px; margin-top:10px;">{{Exemples}}</div>
"""
    card_template = {
        "name": "Char→Pinyin",
        "qfmt": WRITER_HTML,
        "afmt": answer_side,
    }
    return genanki.Model(
        model_id=stable_id_from_key("model_char_to_pinyin"),
        name="HSK - Char→Pinyin",
        fields=[{"name": field} for field in field_names],
        templates=[card_template],
    )

def build_model_pinyin_to_char():
    """Create the note model whose card asks for characters from pinyin/meaning.

    The question side shows pinyin and meaning; the answer side repeats the
    front, shows the characters and examples, then appends the WRITER_HTML
    fragment. The model id is derived from a fixed key so it is identical on
    every run.
    """
    field_names = ("Characters", "Traditionnel", "Pinyin", "Signification", "Exemples")
    question_side = r"""
<div style="font-size:30px;"><b>{{Pinyin}}</b></div>
<div style="font-size:24px;">{{Signification}}</div>
"""
    answer_head = r"""
{{FrontSide}}<hr>

<!-- Affichage du caractère -->
<div style="font-size:32px; margin-top:10px;"><b>{{Characters}}</b></div>

<!-- Exemples -->
<div style="white-space:pre-line; color:gray; font-size:24px; margin-top:10px;">{{Exemples}}</div>

<!-- Animation HanziWriter -->
"""
    card_template = {
        "name": "Pinyin→Char",
        "qfmt": question_side,
        "afmt": answer_head + WRITER_HTML + "\n",
    }
    return genanki.Model(
        model_id=stable_id_from_key("model_pinyin_to_char"),
        name="HSK - Pinyin→Char",
        fields=[{"name": field} for field in field_names],
        templates=[card_template],
    )


## Écriture du package

In [129]:
# Names of the stroke-data files in MEDIA_STROKES_DIR with ".json" removed.
valid_char = [p.name.replace(".json", "") for p in MEDIA_STROKES_DIR.glob("*.json")]
# Hashed copy of valid_char: the membership test runs once per character of
# every word, so a set lookup avoids rescanning the whole list each time.
_valid_char_set = frozenset(valid_char)

def to_valid_char(s: str) -> str:
    """Return `s` reduced to the characters that have a stroke-data file.

    Characters not listed in `valid_char` are dropped; the order of the
    remaining characters is preserved.
    """
    return "".join(c for c in s if c in _valid_char_set)

In [130]:
def build_decks_hsk(df: pd.DataFrame, model_c2p, model_p2c):
    """Build the Char→Pinyin and Pinyin→Char decks from the enriched word table.

    Rows are sorted by (HSK, Chapitre, Word), grouped by (HSK, Chapitre) and
    cut into chunks of CHUNK_SIZE rows. Every chunk produces two decks, one
    per card direction, each holding one note per usable row.

    Args:
        df: table providing the columns "HSK", "Chapitre", "Word",
            "Traditionnel_final", "Pinyin_final", "Signif_final",
            "Exemples_final" and "Explication".
        model_c2p: note model used for the Char→Pinyin notes.
        model_p2c: note model used for the Pinyin→Char notes.

    Returns:
        list: the decks, as consecutive (Char→Pinyin, Pinyin→Char) pairs,
        one pair per chunk.

    Warns:
        UserWarning: when rows are skipped because none of the word's
            characters has stroke data.
    """
    import warnings

    decks = []
    skipped_words = []

    # Stable sort so rows with equal keys keep their input order.
    df_sorted = df.sort_values(by=["HSK", "Chapitre", "Word"], kind="mergesort")

    for (hsk, chap), df_grp in df_sorted.groupby(["HSK", "Chapitre"], dropna=False):
        n_chunks = math.ceil(len(df_grp) / CHUNK_SIZE)

        for i in range(n_chunks):
            chunk = df_grp.iloc[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE]
            # The label is the nominal row range of the chunk; the upper bound
            # is not clamped to the group size.
            name = f"{i*CHUNK_SIZE:03d}-{(i+1)*CHUNK_SIZE-1:03d}"

            deck1 = genanki.Deck(
                stable_id_from_key(f"c2p_{hsk}_{chap}_{i}"),
                f"{DECK_ROOT_C2P}::HSK{hsk}::Chapitre{chap}::{name}"
            )
            deck2 = genanki.Deck(
                stable_id_from_key(f"p2c_{hsk}_{chap}_{i}"),
                f"{DECK_ROOT_P2C}::HSK{hsk}::Chapitre{chap}::{name}"
            )

            for _, r in chunk.iterrows():
                raw_word = safe_str(r["Word"])
                # Only characters with stroke data are kept (no spaces either).
                chars = to_valid_char(raw_word)
                if not chars:
                    # Nothing left to display or animate. Such a note would
                    # have an empty Characters field, and all such rows of a
                    # chunk would share one GUID, so the row is skipped.
                    skipped_words.append(raw_word)
                    continue

                trad = safe_str(r["Traditionnel_final"])
                pinyin = safe_str(r["Pinyin_final"])
                signif = safe_str(r["Signif_final"])
                ex = safe_str(r["Exemples_final"])
                exp = safe_str(r["Explication"])

                if exp:
                    # The explanation is shown in italics above the examples.
                    ex = f"<i style='color: gray;'>{exp}</i>\n\n{ex}"

                fields = [chars, trad, pinyin, signif, ex]

                # GUIDs are derived from content and position so that
                # regenerating the package produces the same notes.
                deck1.add_note(genanki.Note(
                    model=model_c2p,
                    fields=list(fields),
                    guid=stable_guid(f"{chars}_c2p_{hsk}_{chap}_{i}")
                ))
                deck2.add_note(genanki.Note(
                    model=model_p2c,
                    fields=list(fields),
                    guid=stable_guid(f"{chars}_p2c_{hsk}_{chap}_{i}")
                ))

            decks.append(deck1)
            decks.append(deck2)

    if skipped_words:
        warnings.warn(
            f"{len(skipped_words)} word(s) skipped because none of their "
            f"characters has stroke data: {skipped_words}",
            stacklevel=2,
        )

    return decks

In [131]:
def build_hsk_package():
    """Assemble all decks and media into OUTPUT_APKG and print a summary.

    Media consist of the font (only if the file exists), every "*.json" file
    in MEDIA_STROKES_DIR, and the two JavaScript files.
    """
    decks = build_decks_hsk(
        df_enriched,
        build_model_char_to_pinyin(),
        build_model_pinyin_to_char(),
    )

    package = genanki.Package(decks)

    # The font is optional; the scripts are always listed.
    media = [str(FONT_FILE)] if FONT_FILE.exists() else []
    media.extend(str(stroke_file) for stroke_file in MEDIA_STROKES_DIR.glob("*.json"))
    media.extend([str(HANZI_JS_FILE), str(SHARED_JS_PATH)])
    package.media_files = media

    package.write_to_file(OUTPUT_APKG)

    print(
        f"✔ Paquet généré : {OUTPUT_APKG}",
        f"✔ Decks : {len(decks)}",
        f"✔ Items : {len(df_enriched)}",
        f"✔ JS partagé : {SHARED_JS_PATH}",
        sep="\n",
    )

## Run

In [132]:
# Build the note models and decks, write the .apkg file and print the summary.
build_hsk_package()

✔ Paquet généré : ChineseIsEasy-HSK.apkg
✔ Decks : 14
✔ Items : 273
✔ JS partagé : ../MinimalExample/js/_shared_hanzi.js
