# ChineseIsEasy - Generation of the Anki cards

## Packages

In [113]:
import pandas as pd
import genanki
from pathlib import Path
import hashlib
import math

## Global variables

In [114]:
# Parquet file holding the words, and the directory whose *.json files are
# bundled as media.
WORDS_FILE = Path("generated_data") / "words_with_categories.parquet"
MEDIA_STROKES_DIR = Path("data") / "media" / "data"

# File name of the Anki package that gets written.
OUTPUT_FILE = "ChineseIsEasy-WordsByFrequency.apkg"

# JavaScript files bundled as media; the card templates load them by file name.
HANZI_JS_FILE = Path("data") / "media" / "_hanzi-writer.min.js"
SHARED_JS_PATH = Path("..") / "MinimalExample" / "js" / "_shared_hanzi.js"

# Font file bundled as media.
FONT_FILE = Path("data") / "media" / "FZKai.ttf"

In [115]:
# Every file shipped inside the package, in this order: the font, each *.json
# file found in MEDIA_STROKES_DIR, then the two JavaScript files.
media_files = [
    str(FONT_FILE),
    *(str(json_path) for json_path in MEDIA_STROKES_DIR.glob("*.json")),
    str(HANZI_JS_FILE),
    str(SHARED_JS_PATH),
]

## Utils

In [116]:
def stable_id_from_key(key: str) -> int:
    """Derive a deterministic integer id from *key*.

    The id is the first five bytes (ten hex digits) of the SHA-1 digest of the
    UTF-8 encoded key, read as a big-endian unsigned integer.
    """
    digest = hashlib.sha1(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:5], "big")

def stable_guid(key: str) -> str:
    """Return the full SHA-1 hex digest of the UTF-8 encoded *key*."""
    hasher = hashlib.sha1()
    hasher.update(key.encode("utf-8"))
    return hasher.hexdigest()

In [117]:
def safe_str(x, default=""):
    """Return ``str(x)``, or *default* when *x* is missing.

    *x* counts as missing when ``pd.isna(x)`` is true or when its string form
    is ``"nan"`` in any letter case.

    The conversion is best-effort: any ordinary exception raised while
    checking or converting *x* also yields *default*. This includes the
    ``ValueError`` raised when ``pd.isna`` returns a multi-element array that
    cannot be used as a single truth value. ``KeyboardInterrupt`` and
    ``SystemExit`` are not swallowed.

    Args:
        x: the value to convert.
        default: the value returned when *x* is missing or cannot be converted.

    Returns:
        The string form of *x*, or *default*.
    """
    try:
        if pd.isna(x):
            return default
        text = str(x)
    except Exception:
        # Deliberate best-effort fallback; narrower than a bare ``except`` so
        # that interpreter-exit signals still propagate.
        return default
    return default if text.lower() == "nan" else text

def hide_pinyin_and_fr(block: str) -> str:
    """Render an examples block as HTML with the pinyin and French lines hidden.

    After blank lines are discarded, the remaining lines are read in groups of
    three: position 0 is the Chinese sentence, position 1 its pinyin and
    position 2 its French translation, then the pattern repeats. Chinese
    lines are emitted as-is; the other two are wrapped in ``reveal`` spans.

    Returns:
        The lines joined with ``<br>``, or an empty string when the block has
        no non-blank line.
    """
    text = safe_str(block)
    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    entries = [entry for entry in stripped_lines if entry]

    # Position modulo 3: 0 = Chinese (visible), 1 = pinyin, 2 = French (hidden).
    # With no entries the join below yields "".
    rendered = [
        entry
        if position % 3 == 0
        else f"<span class='reveal'><span class='reveal-content'>{entry}</span></span>"
        for position, entry in enumerate(entries)
    ]
    return "<br>".join(rendered)

In [118]:
# --- Load the dataset ---
df_words = pd.read_parquet(WORDS_FILE)

> The cell below is not ideal for reproducibility, but it was more convenient for personal use. Because I used a small model (`GPT-4o-mini`) to generate the category labels, it did not strictly follow the instructions, so I had to correct some labels manually.

In [119]:
mapping = {
    "Particules": ["Particules", "Punctuation", "Symboles"],
    "Connecteurs logiques": ["Connecteurs logiques"],
    "Pronoms": ["Pronoms"],
    "Déterminants": ["Déterminants"],
    "Nombres": ["Nombres", "Quantité"],
    "Temps": ["Temps", "Événements", "Histoire"],
    "Mesures": ["Mesures"],
    "Relations sociales": ["Relations sociales", "Métiers", "Personnages"],
    "Vie quotidienne": ["Vie quotidienne", "Vêtements", "Loisirs", "Monnaie", "Mobilier"],
    "Nourriture": ["Nourriture"],
    "Nature": ["Nature", "Animaux", "Matériaux"],
    "Corps": ["Corps", "Santé", "Maladie", "Symptômes", "Médecine", "Médecins"],
    "Sentiments": ["Sentiments", "Sensations"],
    "Actions générales": ["Actions générales", "Sport", "Sports", "Outils", "Commerce"],
    "Transport": ["Transport", "Transports"],
    "Lieux publics": ["Lieux publics", "Lois publics"],
    "Technologie": ["Technologie", "Science"],
    "Culture": ["Culture", "Sons", "Art", "文化"],
    "Société": ["Société", "Économie", "Institutions"],
    "Institutions": ["Institutions"], 
    "Concepts abstraits": ["Concepts abstraits", "概念抽象"],
    "Autres": [
        "Adjectifs", "Couleurs", "Couleur", "Objet", "Objets",
        "Armement", "Armes", "Instruments", "Autres", "Noms", ""
    ],
}

def mapping_of_wrong_category(cat: str) -> str | None:
    for k, v in mapping.items():
        if cat in v:
            return k
    return None


def mapping_of_wrong_category(cat: str) -> str | None:
    for k, v in mapping.items():
        if cat in v:
            return k
    return None

In [120]:
# Replace every raw label with its canonical category; labels that no category
# lists become None.
df_words["Catégorie"] = df_words["Catégorie"].apply(func=mapping_of_wrong_category)
# Drop every row that has a missing value in ANY column, which includes the
# rows whose category just became None.
# NOTE(review): rows missing a value in an unrelated column are dropped too;
# confirm that this is intended.
df_words = df_words.dropna(axis=0, how="any")

In [121]:
# Keep only the rows whose pinyin, examples and meaning each have a non-zero
# length (filtered one column at a time, in that order).
df_words = df_words[df_words["Pinyin"].apply(lambda value: len(value) != 0)]
df_words = df_words[df_words["Exemples"].apply(lambda value: len(value) != 0)]
df_words = df_words[df_words["Signification"].apply(lambda value: len(value) != 0)]

## HTML partagé

In [122]:
# Stylesheet for the "reveal" spans produced by hide_pinyin_and_fr: a span is
# almost transparent (opacity 0.08) until it is hovered or carries the
# `active` class, at which point it becomes fully opaque.
REVEAL_CSS = r"""
<style>
.reveal {
    opacity: 0.08;
    transition: opacity 0.2s;
    cursor: pointer;
}
.reveal:hover {
    opacity: 1;
}
.reveal.active {
    opacity: 1 !important;
}
.reveal-content {
    color: #444;
    font-size: 22px;
}
</style>
"""

# Document-level click handler: toggles the `active` class on the closest
# `.reveal` ancestor of the clicked element, if there is one.
REVEAL_JS = r"""
<script>
document.addEventListener("click", function(e){
    const r = e.target.closest(".reveal");
    if (r) r.classList.toggle("active");
});
</script>
"""

# Markup of the character-writer widget: an empty `writer-container`, a
# "Rejouer" button, a hidden div holding {{Word}}, the two script tags that
# load the bundled JavaScript media files, and an inline script calling
# initHanziWriter with the word.
# NOTE(review): initHanziWriter is presumably defined in _shared_hanzi.js,
# which is not part of this file; confirm there how the container and the
# button are wired up.
# The element ids are fixed, so this fragment should appear at most once per
# rendered card side.
WRITER_HTML = r"""
<div id="writer-wrapper" style="display:flex; flex-direction:column; align-items:center; gap:12px;">

  <div id="writer-container"
       style="display:flex; gap:20px; justify-content:center; flex-wrap:nowrap;">
  </div>

  <button id="replay-btn"
          style="
            background:#444cf7;
            color:white;
            border:none;
            padding:8px 22px;
            font-size:16px;
            border-radius:20px;
            cursor:pointer;
            transition:0.2s;
          ">
    Rejouer
  </button>

  <div id="hanzi-data" style="display:none;">{{Word}}</div>

</div>

<script src="_hanzi-writer.min.js"></script>
<script src="_shared_hanzi.js"></script>

<script>
console.log("[Template] Calling initHanziWriter('{{Word}}')");
initHanziWriter("{{Word}}");
</script>
"""

## Construction

In [123]:
def model_char_to_pinyin():
    """Build the note type that asks for pinyin and meaning from the word.

    The question side is the reveal stylesheet followed by the writer widget
    for ``{{Word}}``. The answer side repeats the question through
    ``{{FrontSide}}`` and then shows the word with its traditional form, the
    pinyin, the meaning and the examples, followed by the reveal click script.

    The writer widget is not appended a second time on the answer side.
    ``{{FrontSide}}`` already expands to the question template, which contains
    ``WRITER_HTML``. Appending it again would put two copies of every element
    id (``writer-wrapper``, ``writer-container``, ``replay-btn``,
    ``hanzi-data``) and of both script tags into the same document.

    Returns:
        genanki.Model: the model, whose id is derived from a fixed key and is
        therefore the same on every run.
    """
    return genanki.Model(
        stable_id_from_key("model_words_char_to_pinyin_v2"),
        "Mots - Char→Pinyin/Signification",
        fields=[
            {"name": "Word"},
            {"name": "Traditionnel"},
            {"name": "Pinyin"},
            {"name": "Signification"},
            {"name": "Exemples"},
        ],
        templates=[{
            "name": "Char→Pinyin/Signification",

            "qfmt":
                REVEAL_CSS
                + WRITER_HTML,

            # {{FrontSide}} brings in the stylesheet and the writer widget.
            "afmt":
                REVEAL_CSS
                + r"""
{{FrontSide}}

<hr>

<div style="font-size:36px; margin-top:10px;">
  <b>{{Word}} ({{Traditionnel}})</b>
</div>

<div style="font-size:34px; margin-top:10px;">
  <b>{{Pinyin}}</b>
</div>

<div style="font-size:30px; margin-top:10px;">
  {{Signification}}
</div>

<div style="font-size:26px; color:gray; margin-top:15px;">
  {{Exemples}}
</div>
"""
                + REVEAL_JS
        }]
    )

def model_pinyin_to_char():
    """Build the note type that asks for the word from its pinyin and meaning.

    The question side shows ``{{Pinyin}}`` and ``{{Signification}}``. The
    answer side repeats it through ``{{FrontSide}}``, then adds the writer
    widget, the word with its traditional form, and the examples, followed by
    the reveal click script.

    Returns:
        genanki.Model: the model, whose id is derived from a fixed key and is
        therefore the same on every run.
    """
    field_names = ("Word", "Traditionnel", "Pinyin", "Signification", "Exemples")

    question_html = REVEAL_CSS + r"""
<div style="font-size:24px;"><b>{{Pinyin}}</b></div>
<div style="font-size:20px; margin-top:10px;">{{Signification}}</div>
"""

    answer_html = "".join([
        REVEAL_CSS,
        r"""
{{FrontSide}}

<hr>

""",
        WRITER_HTML,
        # Three spaces follow the widget before the next line break; they are
        # spelled out so that the template text stays exactly the same.
        "   ",
        r"""
<div style="font-size:36px; margin-top:10px;">
  <b>{{Word}} ({{Traditionnel}})</b>
</div>

<div style="font-size:26px; margin-top:15px; color:gray;">
  {{Exemples}}
</div>
""",
        REVEAL_JS,
    ])

    card_template = {
        "name": "Pinyin→Caractère",
        "qfmt": question_html,
        "afmt": answer_html,
    }

    return genanki.Model(
        stable_id_from_key("model_words_pinyin_to_char_v2"),
        "Mots - Pinyin/Signification→Caractère",
        fields=[{"name": field_name} for field_name in field_names],
        templates=[card_template],
    )


In [124]:
def build_category_decks(df_cat, category, model_c2p, model_p2c, chunk_size=100):
    """Split one category into fixed-size chunks and build two decks per chunk.

    Each chunk of rows yields a "character -> pinyin/meaning" deck and a
    "pinyin/meaning -> character" deck. Both are named after the category and
    the positional range the chunk covers (for example ``000-099``). The
    range is computed from the chunk size, so the upper bound of the last
    chunk may exceed the number of rows actually present.

    Args:
        df_cat: the rows of a single category. ``Word`` is required;
            ``Traditionnel``, ``Pinyin``, ``Signification``, ``Exemples`` and
            ``Explication`` are read when present.
        category: the category name, used in deck names, deck ids and note
            guids.
        model_c2p: the model used for the character -> pinyin notes.
        model_p2c: the model used for the pinyin -> character notes.
        chunk_size: the number of rows per deck. Deck ids and note guids embed
            the chunk index, so changing this value regroups the notes under
            different ids.

    Returns:
        A list of decks, two per chunk, in chunk order. The list is empty when
        *df_cat* has no rows.

    Raises:
        ValueError: if *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    decks = []

    for i, start in enumerate(range(0, len(df_cat), chunk_size)):
        stop = start + chunk_size
        chunk = df_cat.iloc[start:stop]
        name = f"{start:03d}-{stop - 1:03d}"

        deck1 = genanki.Deck(
            stable_id_from_key(f"words_c2p_{category}_{i}"),
            f"ChineseIsEasy-Words::Caractere→PinyinSignification::{category}::{name}",
        )
        deck2 = genanki.Deck(
            stable_id_from_key(f"words_p2c_{category}_{i}"),
            f"ChineseIsEasy-Words::PinyinSignification→Caractere::{category}::{name}",
        )

        for _, row in chunk.iterrows():
            word = row["Word"]
            trad = row.get("Traditionnel", "")
            pin = row.get("Pinyin", "")
            sig = row.get("Signification", "")
            ex = hide_pinyin_and_fr(row.get("Exemples", ""))
            # Normalised so that a missing value (NaN is truthy) can never be
            # rendered as the literal text "nan".
            exp = safe_str(row.get("Explication", ""))

            if exp:
                ex = f"<i style='color:gray;'>{exp}</i><br><br>{ex}"

            fields = [word, trad, pin, sig, ex]

            deck1.add_note(genanki.Note(
                model=model_c2p,
                fields=fields,
                guid=stable_guid(f"{word}_c2p_{category}{i}"),
            ))
            deck2.add_note(genanki.Note(
                model=model_p2c,
                # A separate list, so the two notes never share one object.
                fields=list(fields),
                guid=stable_guid(f"{word}_p2c_{category}{i}"),
            ))

        decks.extend([deck1, deck2])

    return decks


In [125]:
def build_dictwords():
    """Build every deck from ``df_words`` and write the Anki package.

    The rows are grouped by ``Catégorie`` and each group is turned into decks
    by ``build_category_decks``. All decks go into a single package together
    with ``media_files``; the package is written to ``OUTPUT_FILE`` and a
    one-line summary is printed.
    """
    char_to_pinyin = model_char_to_pinyin()
    pinyin_to_char = model_pinyin_to_char()

    all_decks = [
        deck
        for category, df_cat in df_words.groupby("Catégorie")
        for deck in build_category_decks(df_cat, category, char_to_pinyin, pinyin_to_char)
    ]

    package = genanki.Package(all_decks)
    package.media_files = media_files
    package.write_to_file(OUTPUT_FILE)

    print(f"✔ Done: {OUTPUT_FILE}  ({len(all_decks)} decks)")


## Run

In [126]:
# Build all decks and write the package to OUTPUT_FILE; prints a summary line.
build_dictwords()


✔ Done: ChineseIsEasy-WordsByFrequency.apkg  (528 decks)
