# ChineseIsEasy - Generation of the Anki cards

## Packages

In [1]:
import pandas as pd
import genanki
from pathlib import Path
import hashlib
import math

## Global variables

In [2]:
# --- Files ---
WORDS_FILE = Path("generated_data/words_with_categories.parquet")
MEDIA_PATH = Path("data/media/data")
OUTPUT_FILE = "DictWords.apkg"

# --- Media bundled with the package: the font first, then every stroke JSON
# found directly inside MEDIA_PATH ---
media_files = [
    "data/media/FZKai.ttf",
    *(str(json_path) for json_path in MEDIA_PATH.glob("*.json")),
]

# Source of the HanziWriter library; it is concatenated into the card
# templates so the stroke animation code travels with the cards.
hanzi_js = Path("data/media/hanzi-writer.min.js").read_text(encoding="utf-8")

## Utils

In [3]:
# --- Utils ---
def stable_id_from_key(key: str) -> int:
    """Derive a deterministic integer id from *key*.

    The id is the first 5 bytes (40 bits) of the SHA-1 digest of the UTF-8
    encoded key, read as a big-endian unsigned integer, so the same key
    always produces the same id.
    """
    digest = hashlib.sha1(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:5], "big")

def stable_guid(key: str) -> str:
    """Return the 40-character hexadecimal SHA-1 digest of the UTF-8 encoded *key*.

    The result depends only on *key*, so it can serve as a reproducible
    identifier.
    """
    hasher = hashlib.sha1()
    hasher.update(key.encode("utf-8"))
    return hasher.hexdigest()

In [4]:
# --- Load the dataset ---
df_words = pd.read_parquet(WORDS_FILE)

> The cell below is not great for reproducibility, but it was more convenient for personal use. Because I used a small model (`GPT-4o-mini`) to generate the category labels, it did not strictly follow the instructions, so I had to correct some labels manually.

In [5]:
mapping = {
    "Particules": ["Particules", "Punctuation", "Symboles"],
    "Connecteurs logiques": ["Connecteurs logiques"],
    "Pronoms": ["Pronoms"],
    "Déterminants": ["Déterminants"],
    "Nombres": ["Nombres", "Quantité"],
    "Temps": ["Temps", "Événements", "Histoire"],
    "Mesures": ["Mesures"],
    "Relations sociales": ["Relations sociales", "Métiers", "Personnages"],
    "Vie quotidienne": ["Vie quotidienne", "Vêtements", "Loisirs", "Monnaie", "Mobilier"],
    "Nourriture": ["Nourriture"],
    "Nature": ["Nature", "Animaux", "Matériaux"],
    "Corps": ["Corps", "Santé", "Maladie", "Symptômes", "Médecine", "Médecins"],
    "Sentiments": ["Sentiments", "Sensations"],
    "Actions générales": ["Actions générales", "Sport", "Sports", "Outils", "Commerce"],
    "Transport": ["Transport", "Transports"],
    "Lieux publics": ["Lieux publics", "Lois publics"],
    "Technologie": ["Technologie", "Science"],
    "Culture": ["Culture", "Sons", "Art", "文化"],
    "Société": ["Société", "Économie", "Institutions"],
    "Institutions": ["Institutions"], 
    "Concepts abstraits": ["Concepts abstraits", "概念抽象"],
    "Autres": [
        "Adjectifs", "Couleurs", "Couleur", "Objet", "Objets",
        "Armement", "Armes", "Instruments", "Autres", "Noms", ""
    ],
}

def mapping_of_wrong_category(cat: str) -> str | None:
    for k, v in mapping.items():
        if cat in v:
            return k
    return None


def mapping_of_wrong_category(cat: str) -> str | None:
    for k, v in mapping.items():
        if cat in v:
            return k
    return None

In [6]:
# Fold every raw label into its canonical category (None when the label is
# not listed in the mapping), then discard each row that has a missing value
# in any column -- which includes the rows whose category was not recognised.
canonical_categories = df_words['Catégorie'].apply(mapping_of_wrong_category)
df_words['Catégorie'] = canonical_categories
df_words = df_words.dropna()

In [7]:
# Keep only the rows whose pinyin, examples and meaning are all non-empty
# (length different from zero); the columns are filtered one after another.
for column in ('Pinyin', 'Exemples', 'Signification'):
    has_content = df_words[column].apply(len) != 0
    df_words = df_words[has_content]

> Below, I keep only the first 15,000 words because AnkiWeb limits the size of the database we can use; this ensures I stay under that limit. For a more complete dataset, just remove the slicing.

In [8]:
# Keep at most the first 15000 rows.
df_words = df_words.iloc[:15000]

## Construction

In [9]:
def build_model_char_to_pinyin():
    """Build the note type for "character -> pinyin/meaning" cards.

    The question side embeds the HanziWriter source (``hanzi_js``) and
    animates one writer per entry of the ``StrokeJSON`` field, with a
    "Rejouer" control that replays the animation. The answer side repeats
    the question side and then shows the word with its traditional form,
    the pinyin, the meaning and the examples.

    Returns:
        A ``genanki.Model`` whose id is derived from a fixed key through
        ``stable_id_from_key``.
    """
    field_names = (
        "Word",
        "Traditionnel",
        "Pinyin",
        "Signification",
        "Exemples",
        "StrokeJSON",
    )

    # Front: animated strokes only, driven by the inlined HanziWriter library.
    question_html = r"""
<div id="writer-container" style="display:flex; justify-content:center; gap:10px;"></div>

<!-- bouton rejouer -->
<span id="replay-btn" style="color: gray; cursor: pointer; font-size:14px; margin-top:10px; display:inline-block;">
  Rejouer
</span>

<script>
""" + hanzi_js + r"""

var strokes = JSON.parse(`{{StrokeJSON}}`);
var container = document.getElementById("writer-container");
var writers = [];

strokes.forEach(function(data, idx) {
  var div = document.createElement("div");
  div.id = "writer_"+idx;
  div.style.width = "120px";
  div.style.height = "120px";
  container.appendChild(div);

  var writer = HanziWriter.create(div.id, '', {
    width: 120, height: 120, padding: 5,
    strokeAnimationSpeed: 1, delayBetweenStrokes: 300,
    charDataLoader: function(c, onComplete) { onComplete(data); }
  });
  writer.animateCharacter();
  writers.push(writer);
});

// bouton rejouer
document.getElementById("replay-btn").addEventListener("click", function() {
  writers.forEach(function(writer) {
    writer.hideCharacter();
    writer.showCharacter();
    writer.animateCharacter();
  });
});
</script>
"""

    # Back: the front again, followed by the textual answer.
    answer_html = r"""
{{FrontSide}}<hr id="answer">

<div style="font-size: 36px; margin-top: 10px;">
  <b>{{Word}} ({{Traditionnel}})</b>
</div>
<div style="font-size: 34px; margin-top: 10px;"><b>{{Pinyin}}</b></div>
<div style="font-size: 30px; margin-top: 10px;">{{Signification}}</div>
<div style="font-size: 26px; margin-top: 15px; color: gray; white-space: pre-line;">{{Exemples}}</div>
"""

    card_template = {
        "name": "Caractère→Pinyin/Signification",
        "qfmt": question_html,
        "afmt": answer_html,
    }

    return genanki.Model(
        model_id=stable_id_from_key("model_words_char_to_pinyin"),
        name="Mots - Caractère→Pinyin/Signification",
        fields=[{"name": field_name} for field_name in field_names],
        templates=[card_template],
    )



def build_model_pinyin_to_char():
    """Build the note type for "pinyin/meaning -> character" cards.

    The question side shows only the pinyin and the meaning. The answer
    side repeats the question, embeds the HanziWriter source (``hanzi_js``)
    to animate one writer per entry of the ``StrokeJSON`` field (with a
    "Rejouer" control that replays the animation), and then shows the word
    with its traditional form and the examples.

    Returns:
        A ``genanki.Model`` whose id is derived from a fixed key through
        ``stable_id_from_key``.
    """
    field_names = (
        "Word",
        "Traditionnel",
        "Pinyin",
        "Signification",
        "Exemples",
        "StrokeJSON",
    )

    # Front: textual prompt only.
    question_html = r"""
<div style="font-size: 24px;"><b>{{Pinyin}}</b></div>
<div style="font-size: 20px; margin-top: 10px;">{{Signification}}</div>
"""

    # Back: the prompt, the animated strokes, then the word and its examples.
    answer_html = r"""
{{FrontSide}}<hr id="answer">

<!-- strokes -->
<div id="writer-container" style="display:flex; justify-content:center; gap:10px; margin-top:15px;"></div>

<!-- bouton rejouer -->
<span id="replay-btn" style="color: gray; cursor: pointer; font-size:14px; margin-top:10px; display:inline-block;">
  Rejouer
</span>

<script>
""" + hanzi_js + r"""

var strokes = JSON.parse(`{{StrokeJSON}}`);
var container = document.getElementById("writer-container");
var writers = [];

strokes.forEach(function(data, idx) {
  var div = document.createElement("div");
  div.id = "writer_"+idx;
  div.style.width = "120px";
  div.style.height = "120px";
  container.appendChild(div);

  var writer = HanziWriter.create(div.id, '', {
    width: 120, height: 120, padding: 5,
    strokeAnimationSpeed: 1, delayBetweenStrokes: 300,
    charDataLoader: function(c, onComplete) { onComplete(data); }
  });
  writer.animateCharacter();
  writers.push(writer);
});

document.getElementById("replay-btn").addEventListener("click", function() {
  writers.forEach(function(writer) {
    writer.hideCharacter();
    writer.showCharacter();
    writer.animateCharacter();
  });
});
</script>

<div style="font-size: 36px; margin-top: 10px;">
  <b>{{Word}} ({{Traditionnel}})</b>
</div>

<div style="font-size: 26px; margin-top: 15px; color: gray; white-space: pre-line;">{{Exemples}}</div>
"""

    card_template = {
        "name": "Pinyin/Signification→Caractère",
        "qfmt": question_html,
        "afmt": answer_html,
    }

    return genanki.Model(
        model_id=stable_id_from_key("model_words_pinyin_to_char"),
        name="Mots - Pinyin/Signification→Caractère",
        fields=[{"name": field_name} for field_name in field_names],
        templates=[card_template],
    )


# ------------------ DECKS ------------------

def _stroke_json_for_word(word, media_dir, cache):
    """Return a JSON array literal with the stroke data of each character of *word*.

    Args:
        word: The word whose characters are looked up one by one.
        media_dir: Directory holding one ``<character>.json`` file per character.
        cache: Dict mapping a character to its already-loaded JSON text; it is
            filled in place so each file is checked and read at most once.

    Returns:
        A string of the form ``[<json>,<json>,...]`` with one entry per
        character, in order. A character without a stroke file contributes
        ``{}``.
    """
    entries = []
    for ch in word:
        if ch not in cache:
            json_file = media_dir / f"{ch}.json"
            # "{}" is a placeholder so the array keeps exactly one entry per
            # character even when no stroke file exists for it.
            cache[ch] = (
                json_file.read_text(encoding="utf-8") if json_file.exists() else "{}"
            )
        entries.append(cache[ch])
    return "[" + ",".join(entries) + "]"


def build_decks_for_words(df_cat, category, model_c2p, model_p2c, *, chunk_size=100):
    """Build the paired sub-decks of one category, ``chunk_size`` words at a time.

    The rows of *df_cat* are split, in order, into consecutive chunks. Each
    chunk produces two decks: one using *model_c2p* and one using *model_p2c*.
    Every row yields one note per deck; both notes carry the same six fields
    (word, traditional form, pinyin, meaning, examples, stroke JSON).

    Deck ids and note GUIDs are derived from the category, the chunk index
    and (for notes) the word, through ``stable_id_from_key`` and
    ``stable_guid``.

    Args:
        df_cat: Rows of a single category. The "Word" column is required;
            "Traditionnel", "Pinyin", "Signification", "Explication" and
            "Exemples" fall back to an empty string when the column is absent.
        category: Category name (a string), used in deck names, deck ids
            and note GUIDs.
        model_c2p: Note type used for the "character -> pinyin/meaning" deck.
        model_p2c: Note type used for the "pinyin/meaning -> character" deck.
        chunk_size: Maximum number of rows per sub-deck. The default of 100
            reproduces the historical deck names, ids and GUIDs.

    Returns:
        A flat list of decks: for each chunk, the character -> pinyin deck
        followed by the pinyin -> character deck.

    Raises:
        ValueError: If *chunk_size* is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    decks = []
    # Characters recur across words; remember their stroke JSON so each file
    # is read from disk only once per call.
    stroke_cache = {}
    n_chunks = math.ceil(len(df_cat) / chunk_size)

    for i in range(n_chunks):
        start = i * chunk_size
        stop = start + chunk_size
        chunk = df_cat.iloc[start:stop]
        # The upper bound of the label is nominal: the last chunk may hold
        # fewer rows than the label suggests.
        tranche_name = f"{start:03d}-{stop - 1:03d}"

        deck1 = genanki.Deck(
            stable_id_from_key(f"deck_words_c2p_{category}_{i}"),
            f"ChineseIsEasy-By-Word::Caractere→PinyinSignification::{category}::{tranche_name}",
        )
        deck2 = genanki.Deck(
            stable_id_from_key(f"deck_words_p2c_{category}_{i}"),
            f"ChineseIsEasy-By-Word::PinyinSignification→Caractere::{category}::{tranche_name}",
        )

        for _, row in chunk.iterrows():
            word = str(row["Word"])
            trad = str(row.get("Traditionnel", ""))
            pinyin = str(row.get("Pinyin", ""))
            signification = str(row.get("Signification", ""))
            exp = str(row.get("Explication", ""))
            ex = str(row.get("Exemples", ""))

            # A non-empty explanation is shown in gray italics above the examples.
            if exp:
                ex = f"<i style='color: gray;'>{exp}</i>\n\n{ex}"

            stroke_json = _stroke_json_for_word(word, MEDIA_PATH, stroke_cache)

            note1 = genanki.Note(
                model=model_c2p,
                fields=[word, trad, pinyin, signification, ex, stroke_json],
                guid=stable_guid(word + "_c2p_" + category + str(i)),
            )
            note2 = genanki.Note(
                model=model_p2c,
                fields=[word, trad, pinyin, signification, ex, stroke_json],
                guid=stable_guid(word + "_p2c_" + category + str(i)),
            )

            deck1.add_note(note1)
            deck2.add_note(note2)

        decks.extend([deck1, deck2])
    return decks


# ------------------ PACKAGE GLOBAL ------------------

def build_global_words_package():
    """Assemble every category's decks into one package and write it to disk.

    Both note types are built once, the rows of ``df_words`` are grouped by
    their "Catégorie" column, and the decks of every group are collected
    into a single ``genanki.Package`` that also carries ``media_files``.
    The package is written to ``OUTPUT_FILE`` and a one-line summary with
    the number of sub-decks is printed.
    """
    char_to_pinyin = build_model_char_to_pinyin()
    pinyin_to_char = build_model_pinyin_to_char()

    collected_decks = [
        deck
        for category, group in df_words.groupby("Catégorie")
        for deck in build_decks_for_words(group, category, char_to_pinyin, pinyin_to_char)
    ]

    package = genanki.Package(collected_decks)
    package.media_files = media_files
    package.write_to_file(OUTPUT_FILE)

    deck_count = len(collected_decks)
    print(f"Paquet généré : {OUTPUT_FILE} ({deck_count} sous-decks)")


In [10]:
# --- Run ---
build_global_words_package()


Paquet généré : DictWords.apkg (322 sous-decks)
