In [3]:
import csv
import pandas as pd
import os

In [10]:
# Validate the raw CSV: report every data row whose column count is wrong.

EXPECTED_COLUMNS = 6  # number of columns every row of the sheet must have

all_rows = []  # data rows that passed the column-count check
header = None  # header row; stays None when the file is completely empty

input_file = "N5_kanji.csv"

# utf-8-sig drops a leading byte-order mark so the first header cell is clean.
# newline="" is what the csv module requires so quoted line breaks parse correctly.
with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.reader(f)
    # Supplying a default avoids an uncaught StopIteration on an empty file.
    header = next(reader, None)

    if header is None:
        print(f"[WARNING] {input_file}: file is empty, no header row found")
    elif len(header) != EXPECTED_COLUMNS:
        print(f"[WARNING] {input_file}, line 1: Expected {EXPECTED_COLUMNS} columns in header, found {len(header)} => {header}")

    # start=2 because the header occupied line 1; this counts records, so it
    # matches the physical line number as long as no field spans several lines.
    for line_num, row in enumerate(reader, start=2):
        # Rows that are missing or contain only whitespace are skipped, not flagged.
        if not row or all(cell.strip() == "" for cell in row):
            print(f"[INFO] {input_file}, line {line_num}: Empty row skipped")
            continue
        # Anything that is not exactly EXPECTED_COLUMNS wide is reported and dropped.
        if len(row) != EXPECTED_COLUMNS:
            print(f"[WARNING] {input_file}, line {line_num}: Expected {EXPECTED_COLUMNS} columns, found {len(row)} => {row}")
        else:
            all_rows.append(row)

print(f"{input_file} check complete")

N5_kanji.csv check complete


In [2]:
# Replacement example words for each kanji.
#
# Key:   a single kanji, matched against the "Kanji" column of the CSV.
# Value: three example words in the form "word(reading): meaning", joined by "; ".
#        Every example word is expected to contain the key kanji.
#
# NOTE(review): the third example for "子" used to be 小学生, which does not
# contain 子; it is now 男の子. If audio was already generated from the old
# text, 子_example_3 presumably needs regenerating -- confirm.

updated_examples = {
    "日": "日本(にほん): Japan; 日曜日(にちようび): Sunday; 今日(きょう): today",
    "一": "一人(ひとり): one person; 一月(いちがつ): January; 一日(ついたち): first day (of month)",
    "国": "国(くに): country; 外国(がいこく): foreign country; 中国(ちゅうごく): China",
    "人": "人(ひと): person; 日本人(にほんじん): Japanese person; 二人(ふたり): two people",
    "年": "今年(ことし): this year; 来年(らいねん): next year; 去年(きょねん): last year",
    "大": "大きい(おおきい): big; 大人(おとな): adult; 大学(だいがく): university",
    "十": "十日(とおか): tenth day; 十人(じゅうにん): ten people; 十分(じゅっぷん): ten minutes",
    "二": "二人(ふたり): two people; 二月(にがつ): February; 二時(にじ): two o’clock",
    "本": "本(ほん): book; 日本(にほん): Japan; 本屋(ほんや): bookstore",
    "中": "中国(ちゅうごく): China; 中学校(ちゅうがっこう): middle school; 真ん中(まんなか): center",
    "長": "校長(こうちょう): principal; 社長(しゃちょう): company president; 長い(ながい): long",
    "出": "出る(でる): to exit; 出口(でぐち): exit; 出来る(できる): can do",
    "三": "三日(みっか): third day; 三角(さんかく): triangle; 三人(さんにん): three people",
    "時": "時間(じかん): time; 時計(とけい): clock; 何時(なんじ): what time",
    "行": "銀行(ぎんこう): bank; 行く(いく): to go; 旅行(りょこう): travel",
    "見": "見る(みる): to see; 見せる(みせる): to show; 見える(みえる): to be visible",
    "月": "月曜日(げつようび): Monday; 一月(いちがつ): January; 今月(こんげつ): this month",
    "分": "分かる(わかる): to understand; 分ける(わける): to divide; 一分(いっぷん): one minute",
    "後": "午後(ごご): afternoon; 後ろ(うしろ): behind; 後(あと): after",
    "前": "午前(ごぜん): morning (a.m.); 前(まえ): in front/before; 駅前(えきまえ): in front of station",
    "生": "学生(がくせい): student; 生まれる(うまれる): to be born; 生きる(いきる): to live",
    "五": "五人(ごにん): five people; 五月(ごがつ): May; 五日(いつか): fifth day",
    "間": "時間(じかん): time; 間(あいだ): interval; 一週間(いっしゅうかん): one week",
    "上": "上(うえ): above; 上がる(あがる): to go up; 上手(じょうず): skillful",
    "東": "東京(とうきょう): Tokyo; 東(ひがし): east; 東口(ひがしぐち): east exit",
    "四": "四月(しがつ): April; 四時(よじ): four o’clock; 四人(よにん): four people",
    "今": "今朝(けさ): this morning; 今日(きょう): today; 今週(こんしゅう): this week",
    "金": "お金(おかね): money; 金曜日(きんようび): Friday; 金(きん): gold",
    "九": "九月(くがつ): September; 九人(きゅうにん): nine people; 九時(くじ): nine o’clock",
    "入": "入る(はいる): to enter; 入れる(いれる): to put in; 入口(いりぐち): entrance",
    "学": "学生(がくせい): student; 学校(がっこう): school; 留学(りゅうがく): study abroad",
    "高": "高い(たかい): tall/expensive; 高校(こうこう): high school; 高校生(こうこうせい): high school student",
    "円": "円(えん): yen; 一円(いちえん): one yen; 円い(まるい): round",
    "子": "子供(こども): child; 女の子(おんなのこ): girl; 男の子(おとこのこ): boy",
    "外": "外(そと): outside; 外国(がいこく): foreign country; 外食(がいしょく): eating out",
    "八": "八月(はちがつ): August; 八人(はちにん): eight people; 八日(ようか): eighth day",
    "六": "六月(ろくがつ): June; 六人(ろくにん): six people; 六時(ろくじ): six o’clock",
    "下": "下(した): below; 下がる(さがる): to go down; 下手(へた): unskillful",
    "来": "来る(くる): to come; 来月(らいげつ): next month; 来年(らいねん): next year",
    "気": "元気(げんき): healthy; 天気(てんき): weather; 気分(きぶん): mood",
    "小": "小さい(ちいさい): small; 小学校(しょうがっこう): elementary school; 小学生(しょうがくせい): elementary student",
    "七": "七月(しちがつ): July; 七人(しちにん): seven people; 七日(なのか): seventh day",
    "山": "山(やま): mountain; 火山(かざん): volcano; 富士山(ふじさん): Mt. Fuji",
    "話": "話す(はなす): to speak; 電話(でんわ): telephone; 会話(かいわ): conversation",
    "女": "女(おんな): woman; 女の子(おんなのこ): girl; 彼女(かのじょ): she/girlfriend",
    "北": "北(きた): north; 北海道(ほっかいどう): Hokkaido; 北口(きたぐち): north exit",
    "午": "午前(ごぜん): a.m.; 午後(ごご): p.m.; 正午(しょうご): noon",
    "百": "百(ひゃく): hundred; 百円(ひゃくえん): one hundred yen; 三百(さんびゃく): three hundred",
    "書": "書く(かく): to write; 辞書(じしょ): dictionary; 図書館(としょかん): library",
    "先": "先生(せんせい): teacher; 先月(せんげつ): last month; 先(さき): ahead",
    "名": "名前(なまえ): name; 有名(ゆうめい): famous; 名(な): name",
    "川": "川(かわ): river; 小川(おがわ): stream; 川口(かわぐち): rivermouth",
    "千": "千(せん): thousand; 千円(せんえん): one thousand yen; 三千(さんぜん): three thousand",
    "水": "水(みず): water; 水曜日(すいようび): Wednesday; 水道(すいどう): water supply",
    "半": "半分(はんぶん): half; 半年(はんとし): half a year; 一時半(いちじはん): 1:30",
    "男": "男(おとこ): man; 男の子(おとこのこ): boy; 男性(だんせい): male",
    "西": "西(にし): west; 西口(にしぐち): west exit; 西洋(せいよう): the West",
    "電": "電車(でんしゃ): train; 電気(でんき): electricity; 電話(でんわ): telephone",
    "校": "高校(こうこう): high school; 小学校(しょうがっこう): elementary school; 中学校(ちゅうがっこう): middle school",
    "語": "日本語(にほんご): Japanese; 英語(えいご): English; 中国語(ちゅうごくご): Chinese",
    "土": "土(つち): soil; 土曜日(どようび): Saturday; 土地(とち): land",
    "木": "木(き): tree; 木曜日(もくようび): Thursday; 大木(たいぼく): big tree",
    "聞": "聞く(きく): to listen; 新聞(しんぶん): newspaper; 聞こえる(きこえる): can be heard",
    "食": "食べる(たべる): to eat; 食べ物(たべもの): food; 食堂(しょくどう): cafeteria",
    "車": "車(くるま): car; 電車(でんしゃ): train; 自転車(じてんしゃ): bicycle",
    "何": "何時(なんじ): what time; 何人(なんにん): how many people; 何か(なにか): something",
    "南": "南(みなみ): south; 南口(みなみぐち): south exit; 南国(なんごく): southern country",
    "万": "万(まん): ten thousand; 一万(いちまん): ten thousand; 十万(じゅうまん): one hundred thousand",
    "毎": "毎日(まいにち): every day; 毎週(まいしゅう): every week; 毎回(まいかい): every time",
    "白": "白(しろ): white color; 白い(しろい): white; 真っ白(まっしろ): pure white",
    "天": "天気(てんき): weather; 天国(てんごく): heaven; 天(てん): sky/heavens",
    "母": "母(はは): mother; お母さん(おかあさん): mother; 母の日(ははのひ): Mother's Day",
    "火": "火曜日(かようび): Tuesday; 火山(かざん): volcano; 火事(かじ): fire (accident)",
    "右": "右(みぎ): right; 右側(みぎがわ): right side; 右手(みぎて): right hand",
    "読": "読む(よむ): to read; 読み方(よみかた): way of reading; 読書(どくしょ): reading",
    "友": "友(とも): friend; 友達(ともだち): friend; 親友(しんゆう): best friend",
    "左": "左(ひだり): left; 左側(ひだりがわ): left side; 左手(ひだりて): left hand",
    "休": "休む(やすむ): to rest; 休み(やすみ): holiday/break; 休日(きゅうじつ): holiday",
    "父": "父(ちち): father; お父さん(おとうさん): father; 父の日(ちちのひ): Father’s Day",
    "雨": "雨(あめ): rain; 大雨(おおあめ): heavy rain; 梅雨(つゆ): rainy season"
}

input_file = "N5_kanji.csv"
output_file = "N5_kanji_updated.csv"

# Read the whole input first so the output file is only created once the
# input has been validated (a failure no longer leaves a truncated output).
#
# utf-8-sig matches the encoding used by the column-check cell: with plain
# utf-8 a leading byte-order mark would stick to the first header name
# (e.g. "\ufeffKanji") and the row["Kanji"] lookup below would fail.
with open(input_file, mode="r", encoding="utf-8-sig", newline="") as infile:
    # Use DictReader to handle columns by name
    reader = csv.DictReader(infile)
    # fieldnames is None for an empty file; normalise so the check below reports it.
    fieldnames = reader.fieldnames or []

    missing_columns = [col for col in ("Kanji", "Example kanjis") if col not in fieldnames]
    if missing_columns:
        raise ValueError(
            f"{input_file} is missing required column(s) {missing_columns}; found {fieldnames}"
        )

    rows = list(reader)

# Swap in the curated examples wherever the row's kanji has an entry.
matched_kanji = set()
for row in rows:
    kanji = row["Kanji"]
    if kanji in updated_examples:
        row["Example kanjis"] = updated_examples[kanji]
        matched_kanji.add(kanji)

with open(output_file, mode="w", encoding="utf-8", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    # Rows are written in input order, updated or not.
    writer.writerows(rows)

# Surface dictionary entries that never matched a row (typo in a key, or a
# kanji missing from the sheet) instead of dropping them silently.
unmatched_kanji = [k for k in updated_examples if k not in matched_kanji]
if unmatched_kanji:
    print(f"[WARNING] {len(unmatched_kanji)} kanji in updated_examples not found in {input_file}: {unmatched_kanji}")

print("CSV update complete. Check", output_file)

CSV update complete. Check N5_kanji_updated.csv


In [13]:
# function to verify audio file creation

def verify_audio_files(df, example_column="Example Words", audio_dir="audio/words",
                       voices=("female", "male")):
    """
    Check that every expected example-word audio file exists on disk.

    For each row, the text in ``example_column`` is split on ';' and, for the
    i-th non-empty example (1-based), one file per voice is expected at
    '{audio_dir}/{voice}/{kanji}_example_{i}.mp3'. Only the number of examples
    matters for the file names, not their text.

    Rows whose kanji or example cell is missing (NaN/None) are skipped.

    Args:
        df (pandas.DataFrame): Must contain a 'Kanji' column and the column
            named by ``example_column``.
        example_column (str): Column holding the ';'-separated examples.
        audio_dir (str): Directory that contains one sub-directory per voice.
        voices (iterable of str): Voice sub-directory names, checked in order.

    Returns:
        list: Paths that were expected but not found, in row order, then
        example order, then voice order.

    Raises:
        ValueError: If a required column is absent. Without this check every
            row would be skipped and the function would report success
            without having verified anything.
    """
    absent_columns = [col for col in ("Kanji", example_column) if col not in df.columns]
    if absent_columns:
        raise ValueError(
            f"DataFrame is missing required column(s) {absent_columns}; "
            f"found {list(df.columns)}"
        )

    missing_files = []

    for kanji, examples in zip(df["Kanji"], df[example_column]):
        # Skip rows if no Kanji or no examples
        if pd.isna(kanji) or pd.isna(examples):
            continue

        # str() keeps a non-text cell (e.g. a number) from crashing .split
        example_list = [ex.strip() for ex in str(examples).split(";") if ex.strip()]

        # One file per example index and per voice
        for i in range(1, len(example_list) + 1):
            for voice in voices:
                path = f"{audio_dir}/{voice}/{kanji}_example_{i}.mp3"
                if not os.path.exists(path):
                    missing_files.append(path)

    return missing_files

In [14]:
# Run the audio check against the updated CSV and report the outcome.

csv_file = "N5_kanji_updated.csv"
df = pd.read_csv(csv_file)

# Paths of every expected audio file that is not on disk (empty when complete).
missing_files = verify_audio_files(df)

if not missing_files:
    print("\nAll audio files were successfully generated.")
else:
    print("\nMissing files:")
    # One path per line, in the order the verifier reported them.
    print(*missing_files, sep="\n")


All audio files were successfully generated.
