In [37]:
import csv
import os
import pandas as pd
import re

In [81]:
# filtering public JLPT vocab file for N5 only

input_file = "../jlpt_vocab.csv"
temp_file = "N5_vocab_filtered.csv"

new_column_names = ["Word", "Reading", "Meaning", "JLPT"]

with open(input_file, "r", encoding="utf-8-sig") as infile, open(temp_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    header = next(reader)
    writer.writerow(header)

    jlpt_column_index = header.index("JLPT Level")

    for row in reader:
        if row[jlpt_column_index].strip() == "N5":
            writer.writerow(row)

print(f"Filtered rows with JLPT Level 'N5' saved to temporary file '{temp_file}'.")

Filtered rows with JLPT Level 'N5' saved to temporary file 'N5_vocab_filtered.csv'.


In [82]:
# change column names to Word, Reading, Meaning and JLPT

output_file = "N5_vocab.csv"

new_column_names = ["Word", "Reading", "Meaning", "JLPT"]

with open(temp_file, "r", encoding="utf-8-sig") as infile, open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    header = next(reader)

    writer.writerow(new_column_names)

    for row in reader:
        writer.writerow(row)

os.remove(temp_file)

print(f"Column names updated and output saved to '{output_file}'.")

Column names updated and output saved to 'N5_vocab.csv'.


In [93]:
# check for duplicate rows

all_rows = []
duplicates = []
seen = set()  # To track unique (Word, Reading) pairs

with open(output_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)

    header = next(reader, None)

    for line_num, row in enumerate(reader, start=2):
        # Skip empty rows
        if not row or all(cell.strip() == "" for cell in row):
            continue
        
        # Create a unique key based on Word and Reading (indices 0 and 1)
        key = (row[0].strip(), row[1].strip())  # Word and Reading
        if key in seen:
            duplicates.append((output_file, line_num, row)) 
        else:
            seen.add(key)

        all_rows.append(row)

if duplicates:
    print("Duplicates found:")
    for file, line, row in duplicates:
        print(f"File: {file}, Line: {line}, Duplicate Entry: {row}")
else:
    print("No duplicates found across all files.")

No duplicates found across all files.


In [118]:
import requests
import csv
import time
import urllib.parse
import json

In [119]:
# add example sentences for each word in both Japanese and English function

def get_word_type_and_sentences(word):
    jisho_url = f"https://jisho.org/api/v1/search/words?keyword={word}"
    
    # Always URL-encode the search term to avoid issues with special characters
    encoded_word = urllib.parse.quote(word)
    tatoeba_url = f"https://tatoeba.org/en/api_v0/search?query={encoded_word}&from=jpn&to=eng"

    try:
        # --- 1) Fetch word type from Jisho ---
        response = requests.get(jisho_url)
        response.raise_for_status()
        jisho_data = response.json()

        word_type = "Unknown"
        if jisho_data.get("data"):
            senses = jisho_data["data"][0].get("senses", [])
            if senses:
                parts = senses[0].get("parts_of_speech", [])
                if parts:
                    word_type = parts[0]

        # --- 2) Fetch example sentences from Tatoeba ---
        response = requests.get(tatoeba_url)
        response.raise_for_status()
        tatoeba_data = response.json()

        sentences = []
        for sentence in tatoeba_data.get("results", []):
            jp_sentence = sentence.get("text", "")
            
            # 'translations' is a list of lists. Flatten them:
            translations = sentence.get("translations", [])
            all_translations = []
            for sublist in translations:
                all_translations.extend(sublist)
            
            # Grab the first English translation if there is one
            en_sentence = ""
            if all_translations:
                en_sentence = all_translations[0].get("text", "")
            
            if jp_sentence and en_sentence:
                sentences.append((jp_sentence, en_sentence))

        return word_type, sentences

    except requests.RequestException as e:
        print(f"Request failed for {word}: {e}")
        return "Unknown", []
    except (IndexError, KeyError, AttributeError, TypeError) as e:
        print(f"Unexpected data structure for {word}: {e}")
        return "Unknown", []


In [120]:
# add example sentences for each word in both Japanese and English

input_csv = "N5_vocab.csv"
output_csv = "N5_vocab_with_examples.csv"

with open(input_csv, "r", encoding="utf-8-sig") as infile, open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ["Word Type", "new_sentence_jp", "new_sentence_en"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()

    for row in reader:
        word = row["Word"]
        word_type, examples = get_word_type_and_sentences(word)

        row["Word Type"] = word_type
        if examples:
            # Use the first example sentence
            row["new_sentence_jp"], row["new_sentence_en"] = examples[0]
        else:
            row["new_sentence_jp"] = "No example available"
            row["new_sentence_en"] = "No example available"

        writer.writerow(row)
        print(f"Processed: {word}")
        time.sleep(2)

print(f"Updated file with word types and example sentences saved to '{output_csv}'.")

Processed: ああ
Processed: 会う
Processed: 青
Processed: 青い
Processed: 赤
Processed: 赤い
Processed: 明るい
Processed: 秋
Processed: 開く
Processed: 開ける
Processed: 上げる
Processed: 朝
Processed: 朝御飯
Processed: 明後日
Processed: 足; 脚
Processed: 明日
Processed: あそこ
Processed: 遊ぶ
Processed: 暖かい
Processed: 頭
Processed: 新しい
Processed: あちら
Processed: 暑い
Processed: 熱い
Processed: 厚い
Processed: あっち
Processed: 後
Processed: あなた
Processed: 兄
Processed: 姉
Processed: アパート
Processed: あの
Processed: 浴びる
Processed: 危ない
Processed: 甘い
Processed: 余り
Processed: 雨
Processed: 飴
Processed: 洗う
Processed: 在る
Processed: 有る
Processed: 歩く
Processed: あれ
Processed: いい; よい
Processed: いいえ
Processed: 言う
Processed: 家
Processed: いかが
Processed: 行く
Processed: いくつ
Processed: いくら
Processed: 池
Processed: 医者
Processed: 椅子
Processed: 忙しい
Processed: 痛い
Processed: 一
Processed: 一日
Processed: 一番
Processed: いつ
Processed: 五日
Processed: 一緒
Processed: 五つ
Processed: いつも
Processed: 犬
Processed: 今
Processed: 意味
Processed: 妹
Processed: 嫌
Processed: 入口
Processed:

In [122]:
# find rows with unknown values

input_file = "N5_vocab_with_examples.csv"

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader) 

    word_type_index = header.index("Word Type")
    example_jp_index = header.index("new_sentence_jp")
    example_en_index = header.index("new_sentence_en")

    rows_with_issues = []
    for line_num, row in enumerate(reader, start=2):
        if (
            row[word_type_index] == "Unknown"
            or row[example_jp_index] == "No example available"
            or row[example_en_index] == "No example available"
        ):
            rows_with_issues.append((line_num, row))

if rows_with_issues:
    print(f"Found {len(rows_with_issues)} rows with issues:")
    for line_num, row in rows_with_issues:
        print(f"Line {line_num}: {row}")
else:
    print("No issues found in the file.")


Found 14 rows with issues:
Line 2: ['ああ', 'ああ', 'Ah!, Oh!', 'N5', 'Unknown', 'ああ！', 'Ah!']
Line 14: ['朝御飯', 'あさごはん', 'breakfast', 'N5', 'Noun', 'No example available', 'No example available']
Line 16: ['足; 脚', 'あし', 'foot; leg', 'N5', 'Noun', 'No example available', 'No example available']
Line 46: ['いいえ', 'いいえ', 'no, not at all', 'N5', 'Unknown', 'いいえ？', 'No?']
Line 92: ['ええ', 'ええ', 'yes', 'N5', 'Unknown', 'ええ。', 'Yes.']
Line 111: ['伯父; 叔父さん', 'おじさん', 'uncle, middle-aged man', 'N5', 'Noun', 'No example available', 'No example available']
Line 129: ['伯母さん; 叔母さん', 'おばさん', 'aunt', 'N5', 'Noun', 'No example available', 'No example available']
Line 182: ['～がる', '～がる', 'feel', 'N5', 'Unknown', '何で強がるの？', 'Why are you pretending to be tough?']
Line 275: ['さあ', 'さあ', 'come now, well', 'N5', 'Unknown', 'さあね。', 'You can search me!']
Line 451: ['どうも', 'どうも', 'Thank you; somehow; no matter how hard one may try', 'N5', 'Unknown', 'どうも！', 'Thanks!']
Line 521: ['～杯', '～はい', 'counter for cupfuls', 'N

In [123]:
# filling the unknown sections with handmade data, different for each dataset

input_file = "N5_vocab_with_examples.csv"
output_file = "N5_vocab_filled.csv"

substitutions = {
    2: ["ああ", "ああ", "Ah!, Oh!", "N5", "Interjection", "ああ、疲れた！", "Ah, I’m tired!"],
    14: ["朝御飯", "あさごはん", "breakfast", "N5", "Noun", "朝御飯にパンを食べます。", "I eat bread for breakfast."],
    16: ["足; 脚", "あし", "foot; leg", "N5", "Noun", "足が速いです。", "My legs are fast."],
    46: ["いいえ", "いいえ", "no, not at all", "N5", "Interjection", "いいえ、大丈夫です。", "No, I’m fine."],
    92: ["ええ", "ええ", "yes", "N5", "Interjection", "ええ、分かりました。", "Yes, I understand."],
    111: ["伯父; 叔父さん", "おじさん", "uncle, middle-aged man", "N5", "Noun", "おじさんは家にいます。", "My uncle is at home."],
    129: ["伯母さん; 叔母さん", "おばさん", "aunt", "N5", "Noun", "おばさんが料理を作っています。", "My aunt is cooking."],
    182: ["～がる", "～がる", "feel", "N5", "Auxiliary verb", "彼は寒がっています。", "He’s feeling cold."],
    275: ["さあ", "さあ", "come now, well", "N5", "Interjection", "さあ、始めましょう！", "Well, let’s get started!"],
    451: ["どうも", "どうも", "Thank you; somehow", "N5", "Adverb / Interjection", "どうもありがとうございました！", "Thank you very much!"],
    521: ["～杯", "～はい", "counter for cupfuls", "N5", "Counter", "ビールを一杯ください。", "One glass of beer, please."],
    575: ["昼御飯", "ひるごはん", "lunch, midday meal", "N5", "Noun", "昼御飯にカレーを食べました。", "I ate curry for lunch."],
    661: ["もしもし", "もしもし", "Hello? (used on the phone)", "N5", "Interjection", "もしもし、田中さんですか？", "Hello, is this Tanaka-san?"],
    698: ["ラジオカセ", "ラジオカセ", "radio cassette player", "N5", "Noun", "ラジオカセで音楽を聞きます。", "I listen to music on the radio cassette player."],
}

with open(input_file, "r", encoding="utf-8-sig") as infile, open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    for line_num, row in enumerate(reader, start=1):
        if line_num in substitutions:
            writer.writerow(substitutions[line_num])
        else:
            writer.writerow(row)

print(f"Missing data filled and saved to {output_file}.")

Missing data filled and saved to N5_vocab_filled.csv.


In [126]:
# double check there are no remaining unknown values

input_file = "N5_vocab_filled.csv"

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader) 

    word_type_index = header.index("Word Type")
    example_jp_index = header.index("new_sentence_jp")
    example_en_index = header.index("new_sentence_en")

    rows_with_issues = []
    for line_num, row in enumerate(reader, start=2):
        if (
            row[word_type_index] == "Unknown"
            or row[example_jp_index] == "No example available"
            or row[example_en_index] == "No example available"
        ):
            rows_with_issues.append((line_num, row))

if rows_with_issues:
    print(f"Found {len(rows_with_issues)} rows with issues:")
    for line_num, row in rows_with_issues:
        print(f"Line {line_num}: {row}")
else:
    print("No issues found in the file.")

No issues found in the file.


In [127]:
# check for missing columns

all_rows = []
header = None

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader)
        
    for line_num, row in enumerate(reader, start=2):
        # Check if row is empty
        if not row or all(cell.strip() == "" for cell in row):
            print(f"[INFO] {output_file}, line {line_num}: Empty row skipped")
            continue
        # Check if row has exactly 7 columns:
        if len(row) != 7:
            print(f"[WARNING] {output_file}, line {line_num}: Expected 6 columns, found {len(row)} => {row}")
        else:
            all_rows.append(row)

print(f"{output_file} check complete")

N5_vocab_filled.csv check complete


In [4]:
# adjusting example sentences to match JLPT Level difficulty

input_csv = "N5_vocab_filled.csv"
output_csv = "N5_vocab_refined_examples.csv"

replacements = {
    "本を開くな。": ("本を開けないで。", "Don't open your book."),
    "窓を開けるな。": ("窓を開けないで。", "Don't open the window."),
    "頭を上げるなよ。": ("頭を上げないで。", "Don't raise your head."),
    "ここで遊ぶな。": ("ここで遊ばないで。", "Don't play here."),
    "分厚い本だな。": ("厚い本だな。", "That book is thick."),
    "後は任せた！": ("後はお願いね！", "The rest is up to you!"),
    "桃は甘い。": ("りんごは甘い。", "Apples are sweet."),
    "洗う必要がある。": ("洗ってください。", "Please wash it."),
    "神は存在する。": ("神はいるよ。", "God exists."),
    "よく言うよ。": ("そんなこと言わないで。", "Don't say that."),
    "退屈だなあ。": ("つまらないなあ。", "I'm bored."),
    "入池禁止。": ("池に入らないでください。", "Please don't enter the pond."),
    "猛犬注意！": ("犬に注意！", "Beware of the dog!"),
    "入口の側で立った。": ("入口の前に立った。", "He stood in front of the entrance."),
    "奴ら、中に居るの？": ("彼ら、中にいるの？", "Are they inside?"),
    "上出来！": ("よくできたね！", "Well done!"),
    "あの人は影が薄い。": ("あの人はあまり目立たない。", "He doesn't stand out much."),
    "大海の一滴。": ("海は広い。", "The sea is vast."),
    "俺の絵だよ。": ("私の絵だよ。", "This is my painting."),
    "東京駅なう。": ("東京駅にいるよ。", "I'm at Tokyo Station now."),
    "疑問が多い。": ("質問が多い。", "I have a lot of questions."),

    "通路に物を置くな。": ("通路に物を置かないで。", "Don't put your things in the passage."),
    "彼に教えるな。": ("彼に教えないで。", "Don't tell him."),
    "ゲス男。": ("ひどい男。", "What a jerk."),
    "歴史は繰り返す。": ("同じことが起きるんだ。", "The same thing happens again."),
    "妊娠４か月です。": ("今は四ヶ月目です。", "I'm four months along."),
    "嘘も方便。": ("嘘はよくないよ。", "Lies aren't good."),
    "氷山の一角だよ。": ("ほんの一部だけだよ。", "That's just a small part of it."),
    "河童の川流れ。": ("誰でも失敗はある。", "Anyone can make mistakes."),
    "女性優先。": ("女性が先だよ。", "Ladies go first."),
    "読書の選択やまた読書の仕方について学生達から質問を受けたことが度々ある。これに対する自分の答はいつも不得要領に終る外はなかった。如何なる人に如何なる恋をしたらいいかと聞かれるのと大した相違はないような気がする。": (
        "授業が終わった。",
        "Class has ended."
    ),

    "猫に九生あり。": (
        "猫は丈夫だね。",
        "Cats are tough."
    ),
    "そろそろ切るぞ？": (
        "そろそろ切るね。",
        "Shall we hang up now?"
    ),
    "恩に着るよ。": (
        "本当に助かるよ。",
        "You're really helping me out."
    ),
    "牛乳が腐った。": (
        "牛乳が悪くなった。",
        "The milk has gone bad."
    ),
    "一部のプログラマーは忘れているようだが、良いドキュメントを書くということは、良いプログラムを書くことよりも重要とは言わないまでも、同じ程度には重要なことなのだ。": (
        "文書を書くのも大切だよ。",
        "Writing documentation is important too."
    ),
    "白先黒死。": (
        "最後まで勝負は分からない。",
        "You never know who'll win until the end."
    ),
    "彼は腹黒い。": (
        "彼は性格が悪い。",
        "He’s nasty."
    ),
    "明日の朝九時から胃の検査を行いますので、前日の夜九時以降は何も飲んだり食べたりしないでください。": (
        "明日の朝九時から検査があるので、夜九時以降は何も食べないで。",
        "There's a test at 9 a.m. tomorrow, so please don't eat after 9 p.m. tonight."
    ),
    "この野郎！": (
        "こら！",
        "Hey!"
    ),
    "映画館は、インターネット上に出回る違法コピーのせいで、収益をますます失いつつある。": (
        "映画館は大変だね。",
        "Movie theaters are having a tough time."
    ),
    "頭を下げろ！": (
        "頭を下げて！",
        "Lower your head!"
    ),

     "死ぬな！": (
        "死なないで！",
        "Don't die!"
    ),
    "彼は生き字引と呼ばれている。": (
        "彼は物知りだね。",
        "He's really knowledgeable."
    ),
    "自分でおやり！": (
        "自分でやって！",
        "Do it yourself!"
    ),
    "近所の神社の境内には、樹齢800年といわれる銀杏の大木がある。": (
        "近所の神社に大きな木があるよ。",
        "There’s a big tree at the local shrine."
    ),
    "十人十色。": (
        "人それぞれだね。",
        "Everyone’s different."
    ),
    "中古だよ。": (
        "これは古いものだよ。",
        "It's used."
    ),
    "鯔背だね。": (
        "かっこいいね。",
        "He looks cool."
    ),
    "口を出すな。": (
        "口を出さないで。",
        "Don't butt in."
    ),
    "貧乏人の子沢山。": (
        "子供がたくさんいるね。",
        "They have a lot of kids."
    ),
    "立ちなさい。": (
        "立ってください。",
        "Please stand up."
    ),
    "林に竹が目立つ。": (
        "林に竹がある。",
        "There's bamboo in the woods."
    ),

    "王手！": (
        "やった！",
        "I did it!"
    ),
    "格好つけるな。": (
        "格好つけないで。",
        "Don't show off."
    ),
    "彼はコンサルティングファームに勤める。": (
        "彼は会社に勤めている。",
        "He works at a company."
    ),
    "立ち止まるな。": (
        "立ち止まらないで。",
        "Don't stop moving."
    ),
    "一石二鳥": (
        "一回で二つできる。",
        "Two things done at once."
    ),
    "弱肉強食。": (
        "強い者が勝つ。",
        "The strong win."
    ),  

    "佳人薄命": (
        "綺麗な人は大変だね。",
        "Beautiful people have it tough."
    ),
    "春は曙。": (
        "春はいいね。",
        "Spring is nice."
    ),
    "図に乗るな。": (
        "調子に乗らないで。",
        "Don't push your luck."
    ),
    "働くしかない。": (
        "働かないとだめだ。",
        "We have to work."
    ),
    "五十歩百歩。": (
        "大差ないね。",
        "There’s not much difference."
    ),
    "お前はトイレだ！": (
        "そこはトイレだよ。",
        "That's the bathroom."
    ),
    "母は頭が古い。": (
        "母は考え方が古いね。",
        "My mom's way of thinking is old-fashioned."
    ),
    "税金を払うようになって、初めて人生が始まる。": (
        "社会人になって、初めて人生が始まる。",
        "Life begins when you enter society."
    ),
    "二つ折り厳禁。": (
        "二つに折らないで。",
        "Don't fold it in half."
    ),
    "ガソリンが無くなりつつあります。": (
        "ガソリンがなくなりそう。",
        "We're running out of gas."
    ),
    "私が出すの頼んでおいた手紙にさ、切手貼るの忘れないでよ。": (
        "手紙に切手を貼るのを忘れないでね。",
        "Don't forget to put stamps on the letters."
    ),
    "彼女は木を抱き締める人だ。": (
        "彼女は自然が大好きなんだね。",
        "She really loves nature."
    ),
    "彼は素早い。": (
        "彼は動きが速いね。",
        "He moves quickly."
    ),
    "バスケ得意？": (
        "バスケ上手？",
        "Are you good at basketball?"
    ),
    "トムが葉書を送ってきたんだ。": (
        "トムが葉書を送ったよ。",
        "Tom sent me a postcard."
    ),
    "母似です。": (
        "母に似ているよ。",
        "I look like my mother."
    ),
    "二日酔いだ。": (
        "お酒を飲みすぎた。",
        "I drank too much."
    ),
    "お飲み物はいかが？": (
        "飲み物いる？",
        "Want something to drink?"
    ),

    "彼は二枚目だ。": (
        "彼はかっこいい。",
        "He is handsome."
    ),
    "まっすぐ行け。": (
        "まっすぐ行って。",
        "Go straight ahead."
    ),
    "三つ子の魂百まで。": (
        "子供のころの性格は変わらない。",
        "A person's nature rarely changes."
    ),
    "旅は道連れ。": (
        "一人より一緒のほうが楽しい。",
        "It's more fun with company."
    ),
    "死を忘れるな。": (
        "死を忘れないで。",
        "Don't forget about death."
    ),
    "悪い子ね！": (
        "いけない子ね！",
        "You're being naughty!"
    ),
    "まん丸に見えるあのお月さまはね、実は円錐の底面なんだよ。": (
        "お月さまが丸く見えるね。",
        "The moon looks round."
    ),
}

with open(input_csv, "r", newline="", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", newline="", encoding="utf-8-sig") as outfile:
    
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
    
    # Write header to the new CSV
    writer.writeheader()
    
    for row in reader:
        original_jp = row["new_sentence_jp"]
        
        if original_jp in replacements:
            new_jp, new_en = replacements[original_jp]
            row["new_sentence_jp"] = new_jp
            row["new_sentence_en"] = new_en
        
        # Write (potentially modified) row to the output CSV
        writer.writerow(row)

print("Done! Your updated CSV is:", output_csv)

Done! Your updated CSV is: N5_vocab_refined_examples.csv


In [18]:
# check how many words contain more than one example sentence

# Load CSV file
csv_file = "N5_vocab_refined_examples.csv"
df = pd.read_csv(csv_file)

# Define the column containing example sentences
example_column = "new_sentence_jp"

# Function to check if a row contains more than one example sentence
def has_multiple_examples(text):
    if pd.isna(text):  # Handle NaN values
        return False
    return len(text.split('」「')) > 1 

# Apply the function to the example sentence column
df["Multiple Examples"] = df[example_column].apply(has_multiple_examples)

# Filter rows with multiple examples
rows_with_multiple_examples = df[df["Multiple Examples"]]

# Display the result
print("Rows with more than one example sentence:")
print(rows_with_multiple_examples)

Rows with more than one example sentence:
    Word Reading          Meaning JLPT Word Type    Example Sentence JP  \
285  さ来年   さらいねん  year after next   N5      Noun  「娘さんて、今年就職？」「ううん。来年よ」   

                                   Example Sentence EN  Multiple Examples  
285  "Your daughter – will she be looking for work ...               True  


In [21]:
# fix rows with more than one example

# Load CSV file
csv_file = "N5_vocab_refined_examples.csv"
df = pd.read_csv(csv_file)

# Define a dictionary of vocab points and new example sentences
replacements = {
    "さ来年": {
        "new_sentence_jp": "さ来年、大学を卒業します。",
        "new_sentence_en": "I will graduate from university the year after next."
    }
}

# Function to replace sentences based on the vocab
def replace_sentence(row):
    vocab_point = row["Word"]
    if vocab_point in replacements:
        row["Example Sentence JP"] = replacements[vocab_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[vocab_point]["new_sentence_en"]
    return row

# Apply the replacement function to the DataFrame
df = df.apply(replace_sentence, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_vocab_refined_examples_updated.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_vocab_refined_examples_updated.csv


In [22]:
# fixing example sentences that do not in fact use the corresponding vocab well

# Load CSV file
csv_file = "N5_vocab_refined_examples_updated.csv"
df = pd.read_csv(csv_file)

# Replacement dictionary with corrections
replacements = {
   "開く": {
        "new_sentence_jp": "ドアが開く。",
        "new_sentence_en": "The door opens."
    },
    "在る": {
        "new_sentence_jp": "お金がある。",
        "new_sentence_en": "I have money."
    },
    "有る": {
        "new_sentence_jp": "時間が有る。",
        "new_sentence_en": "I have time."
    },
    "いくつ": {
        "new_sentence_jp": "いくつありますか？",
        "new_sentence_en": "How many are there?"
    },
    "薄い": {
        "new_sentence_jp": "薄い本ですね。",
        "new_sentence_en": "It’s a thin book."
    },
    "返す": {
        "new_sentence_jp": "本を返す。",
        "new_sentence_en": "I return the book."
    },
    "かける": {
        "new_sentence_jp": "電話をかける。",
        "new_sentence_en": "I will make a phone call."
    },
    "方": {
        "new_sentence_jp": "漢字の書き方を教えてください。",
        "new_sentence_en": "Please teach me how to write kanji."
    },
    "角": {
        "new_sentence_jp": "角を右に曲がる。",
        "new_sentence_en": "Turn right at the corner."
    },
    "川; 河": {
        "new_sentence_jp": "川で泳ぐのが好きです。",
        "new_sentence_en": "I like swimming in the river."
    },
     "九": {
        "new_sentence_jp": "猫は九歳です。",
        "new_sentence_en": "The cat is nine years old."
    },
    "嫌い": {
        "new_sentence_jp": "私は納豆が嫌いです。",
        "new_sentence_en": "I dislike natto."
    },
    "切る": {
        "new_sentence_jp": "電話を切らないでください。",
        "new_sentence_en": "Please don't hang up the phone."
    },
    "着る": {
        "new_sentence_jp": "コートを着て出かけます。",
        "new_sentence_en": "I put on a coat and go out."
    },
    "キロ; キログラム": {
        "new_sentence_jp": "この箱は10キロです。",
        "new_sentence_en": "This box is 10 kilograms."
    },
    "キロ; キロメートル": {
        "new_sentence_jp": "ここから駅まで1キロです。",
        "new_sentence_en": "It is 1 kilometer from here to the station."
    },
    "黒い": {
        "new_sentence_jp": "彼のカバンは黒いです。",
        "new_sentence_en": "His bag is black."
    },
    "静か": {
        "new_sentence_jp": "図書館では静かにしてください。",
        "new_sentence_en": "Please be quiet in the library."
    },
        "九": {
        "new_sentence_jp": "猫は九歳です。",
        "new_sentence_en": "The cat is nine years old."
    },
    "昨日": {
        "new_sentence_jp": "昨日は忙しかったです。",
        "new_sentence_en": "Yesterday was busy."
    },
    "今日": {
        "new_sentence_jp": "今日は晴れています。",
        "new_sentence_en": "Today is sunny."
    },
    "茶碗": {
        "new_sentence_jp": "茶碗を片付けてください。",
        "new_sentence_en": "Please clear away the rice bowls."
    },
    "つける": {
        "new_sentence_jp": "電気をつけるのを忘れないで。",
        "new_sentence_en": "Don't forget to turn on the light."
    },
    "猫": {
        "new_sentence_jp": "あの猫は本当にかわいいですね。",
        "new_sentence_en": "That cat is really cute, isn't it?"
    },
    "温い": {
        "new_sentence_jp": "このスープ、温すぎると思わない？",
        "new_sentence_en": "Don't you think this soup is too lukewarm?"
    },
    "バス": {
        "new_sentence_jp": "バスで学校に通っています。",
        "new_sentence_en": "I go to school by bus."
    },
    "葉書": {
        "new_sentence_jp": "旅先から葉書を送ります。",
        "new_sentence_en": "I'll send you a postcard from my trip."
    },
    "フォーク": {
        "new_sentence_jp": "フォークをテーブルに置いてください。",
        "new_sentence_en": "Please place the fork on the table."
    },
    "冬": {
        "new_sentence_jp": "冬になるとスキーに行きます。",
        "new_sentence_en": "I go skiing in the winter."
    },
    "部屋": {
        "new_sentence_jp": "部屋を片付けたほうがいい。",
        "new_sentence_en": "You should tidy up your room."
    },
    "ペン": {
        "new_sentence_jp": "ペンを貸してくれますか？",
        "new_sentence_en": "Could you lend me a pen?"
    },
    "文章": {
        "new_sentence_jp": "文章を書くのが得意です。",
        "new_sentence_en": "I am good at writing texts."
    },
    "ベッド": {
        "new_sentence_jp": "ベッドがとても快適です。",
        "new_sentence_en": "The bed is very comfortable."
    },
    "便利": {
        "new_sentence_jp": "これ、本当に便利ですね。",
        "new_sentence_en": "This is really convenient, isn't it?"
    },
    "帽子": {
        "new_sentence_jp": "帽子をかぶって出かけよう。",
        "new_sentence_en": "Let's put on a hat and go out."
    },
    "ボールペン": {
        "new_sentence_jp": "そのボールペン、貸してもらえますか？",
        "new_sentence_en": "Can I borrow that ballpoint pen?"
    },
    "ポケット": {
        "new_sentence_jp": "ポケットに何が入っているの？",
        "new_sentence_en": "What’s in your pocket?"
    },
    "欲しい": {
        "new_sentence_jp": "新しい自転車が欲しいです。",
        "new_sentence_en": "I want a new bicycle."
    },
    "細い": {
        "new_sentence_jp": "細い道を通るのは怖い。",
        "new_sentence_en": "It's scary to go through a narrow road."
    },
    "ホテル": {
        "new_sentence_jp": "このホテルはサービスが素晴らしいです。",
        "new_sentence_en": "The service at this hotel is excellent."
    },
    "毎朝": {
        "new_sentence_jp": "私は毎朝ジョギングをしています。",
        "new_sentence_en": "I jog every morning."
    },
    "毎年": {
        "new_sentence_jp": "毎年家族で旅行に行きます。",
        "new_sentence_en": "We go on a family trip every year."
    },
    "窓": {
        "new_sentence_jp": "窓を開けて、空気を入れ替えましょう。",
        "new_sentence_en": "Let’s open the window and let some fresh air in."
    },
    "見る": {
        "new_sentence_jp": "新しい映画を見たいです。",
        "new_sentence_en": "I want to watch a new movie."
    },
    "難しい": {
        "new_sentence_jp": "この問題はとても難しいです。",
        "new_sentence_en": "This problem is very difficult."
    },
    "耳": {
        "new_sentence_jp": "耳が痛くて、医者に行きました。",
        "new_sentence_en": "My ear hurt, so I went to the doctor."
    },
    "練習": {
        "new_sentence_jp": "発音の練習を毎日しています。",
        "new_sentence_en": "I practice pronunciation every day."
    },
    "若い": {
        "new_sentence_jp": "若い時に多くの経験をしたい。",
        "new_sentence_en": "I want to gain a lot of experience while I’m young."
    }
}

# Function to replace sentences based on the vocab point
def replace_sentences(row):
    vocab_point = row["Word"]
    if vocab_point in replacements:
        row["Example Sentence JP"] = replacements[vocab_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[vocab_point]["new_sentence_en"]
    return row

# Apply the replacement function to each row in the DataFrame
df = df.apply(replace_sentences, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_vocab_refined_examples_updated_again.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_vocab_refined_examples_updated_again.csv


In [27]:
# fixing example sentences that do not in fact use the corresponding vocab well

# Load CSV file
csv_file = "N5_vocab_refined_examples_updated_again.csv"
df = pd.read_csv(csv_file)

# Replacement dictionary with corrections
replacements = {
  "ポスト争いは厳しい。": {
    "new_sentence_jp": "ポストに手紙を入れたよ。",
    "new_sentence_en": "I put a letter in the mailbox."
  },

  "そんな本読むな。": {
    "new_sentence_jp": "そんな本読まないで。",
    "new_sentence_en": "Don't read that kind of book."
  },

  "映画館は大変だね。": {
    "new_sentence_jp": "コピーしてくれる？",
    "new_sentence_en": "Could you make a copy for me?"
  },

  "杖がないと困る。": {
    "new_sentence_jp": "お金がないと困る。",
    "new_sentence_en": "I need money."
  },

  "俺の白のワイシャツ、どこ？": {
    "new_sentence_jp": "わたしの白いシャツ、どこ？",
    "new_sentence_en": "Where's my white shirt?"
  },
   "戸を閉めろ。": {
        "new_sentence_jp": "戸を閉めて。",
        "new_sentence_en": "Close the door."
    },
    
    # 4) "待つな。" -> negative request
    "待つな。": {
        "new_sentence_jp": "待たないで。",
        "new_sentence_en": "Don't wait."
    },
    
    # 5) The ZZ TOP reference line under そう; そうです
    "なんか誰かに雰囲気似てるなあと思ってましたが、言われてみてああそうそうですね。確かにZZ TOPってこういう感じでしたよね。": {
        "new_sentence_jp": "そうですね。髭が長い人ですよね？",
        "new_sentence_en": "Yes, they're the ones with the long beards, right?"
    },
        "綺麗な人は大変だね。": {
        "new_sentence_jp": "靴をはいてください。",
        "new_sentence_en": "Please put on your shoes."
    }
}
# Function to replace sentences based on the vocab point
def replace_sentences(row):
    vocab_point = row["Example Sentence JP"]
    if vocab_point in replacements:
        row["Example Sentence JP"] = replacements[vocab_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[vocab_point]["new_sentence_en"]
    return row

# Apply the replacement function to each row in the DataFrame
df = df.apply(replace_sentences, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_vocab_refined_examples_updated_again_again.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_vocab_refined_examples_updated_again_again.csv


In [32]:
# fixing vocab

CORRECTIONS = {
    # Existing corrections for あちら, 青, etc.
    "あちら": {
        "Meaning": "that way (over there, polite)",
    },
    "青": {
        "Example Sentence JP": "海の青はきれいですね。",
        "Example Sentence EN": "The ocean’s blue color is beautiful."
    },
    "青い": {
        "Example Sentence JP": "空は青いですね。",
        "Example Sentence EN": "The sky is really blue."
    },
    "赤": {
        "Example Sentence JP": "赤は好き？",
        "Example Sentence EN": "Do you like red?"
    },
    "赤い": {
        "Example Sentence JP": "顔が赤いよ。",
        "Example Sentence EN": "Your face is red."
    },

    # NEW: Relabels
    # 1) 半分 -> Noun
    "半分": {
        "Word Type": "Noun"
    },
    # 2) はい -> Interjection
    "はい": {
        "Word Type": "Interjection"
    },
    # 3) 暇(ひま) -> Na-adjective (keiyodoshi)
    "暇": {
        "Word Type": "Na-adjective (keiyodoshi)"
    },
     "～本": {
        "Word Type": "Counter"
    },
    "～屋": {
        "Word Type": "Suffix"
    },

    # Example Sentence Corrections
    "黒": {
        "Example Sentence JP": "黒が好きですか",
        "Example Sentence EN": "Do you like black?"
    }
}

def fix_n5_data(input_csv, output_csv):
    df = pd.read_csv(input_csv)

    # 1) Merge 在る／有る into "ある"
    merged_arus = {
        "Word": "ある",
        "Reading": "ある",
        "Meaning": "to be; to have (inanimate)",
        "JLPT": "N5",
        "Word Type": "Godan verb with 'ru' ending (irregular verb)",
        "Example Sentence JP": None,
        "Example Sentence EN": None
    }
    saw_aru = False

    # 2) Merge 川; 河 into "川(かわ)"
    merged_kawa = {
        "Word": "川/河",
        "Reading": "かわ",
        "Meaning": "river",
        "JLPT": "N5",
        "Word Type": "Noun",
        "Example Sentence JP": None,
        "Example Sentence EN": None
    }
    saw_kawa = False

    # 3) Merge 九(きゅう / く) -> "九(きゅう/く)"
    merged_kyuuku = {
        "Word": "九",
        "Reading": "きゅう/く",
        "Meaning": "nine",
        "JLPT": "N5",
        "Word Type": "Numeric",
        "Example Sentence JP": None,
        "Example Sentence EN": None
    }
    saw_kyuu_ku = False

    # 4) Merge 七(しち / なな) -> "七(しち/なな)"
    merged_shichinana = {
        "Word": "七",
        "Reading": "しち/なな",
        "Meaning": "seven",
        "JLPT": "N5",
        "Word Type": "Numeric",
        "Example Sentence JP": None,
        "Example Sentence EN": None
    }
    saw_shichi_nana = False

    # 5) Merge 何(なん / なに) -> "何(なん/なに)"
    merged_nanni = {
        "Word": "何",
        "Reading": "なん/なに",
        "Meaning": "what",
        "JLPT": "N5",
        "Word Type": "Pronoun",
        "Example Sentence JP": None,
        "Example Sentence EN": None
    }
    saw_nanni = False

    fixed_rows = []

    for _, row in df.iterrows():
        word = str(row["Word"]).strip()
        reading = str(row["Reading"]).strip()

        # --------- MERGE LOGIC FOR 有る / 在る ---------
        if word == "有る":
            if merged_arus["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_arus["Example Sentence JP"] = row["Example Sentence JP"]
                merged_arus["Example Sentence EN"] = row["Example Sentence EN"]
            continue
        if word == "在る":
            saw_aru = True
            if merged_arus["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_arus["Example Sentence JP"] = row["Example Sentence JP"]
                merged_arus["Example Sentence EN"] = row["Example Sentence EN"]
            continue

        # --------- MERGE LOGIC FOR 川; 河 ---------
        if word == "川; 河":
            saw_kawa = True
            if merged_kawa["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_kawa["Example Sentence JP"] = row["Example Sentence JP"]
                merged_kawa["Example Sentence EN"] = row["Example Sentence EN"]
            continue

        # --------- MERGE LOGIC FOR 九(きゅう/く) ---------
        if word == "九" and reading in ["きゅう", "く"]:
            saw_kyuu_ku = True
            if merged_kyuuku["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_kyuuku["Example Sentence JP"] = row["Example Sentence JP"]
                merged_kyuuku["Example Sentence EN"] = row["Example Sentence EN"]
            continue

        # --------- MERGE LOGIC FOR 七(しち/なな) ---------
        if word == "七" and reading in ["しち", "なな"]:
            saw_shichi_nana = True
            if merged_shichinana["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_shichinana["Example Sentence JP"] = row["Example Sentence JP"]
                merged_shichinana["Example Sentence EN"] = row["Example Sentence EN"]
            continue

        # --------- MERGE LOGIC FOR 何(なん/なに) ---------
        if word == "何" and reading in ["なん", "なに"]:
            saw_nanni = True
            if merged_nanni["Example Sentence JP"] is None and pd.notna(row["Example Sentence JP"]):
                merged_nanni["Example Sentence JP"] = row["Example Sentence JP"]
                merged_nanni["Example Sentence EN"] = row["Example Sentence EN"]
            continue

        # Otherwise keep row
        fixed_rows.append(row.to_dict())

    # After looping, append merges
    if saw_aru:
        if merged_arus["Example Sentence JP"] is None:
            merged_arus["Example Sentence JP"] = "お金がある。"
            merged_arus["Example Sentence EN"] = "I have money."
        fixed_rows.append(merged_arus)

    if saw_kawa:
        if merged_kawa["Example Sentence JP"] is None:
            merged_kawa["Example Sentence JP"] = "川で泳ぐのが好きです。"
            merged_kawa["Example Sentence EN"] = "I like swimming in the river."
        fixed_rows.append(merged_kawa)

    if saw_kyuu_ku:
        if merged_kyuuku["Example Sentence JP"] is None:
            merged_kyuuku["Example Sentence JP"] = "猫は九歳です。"
            merged_kyuuku["Example Sentence EN"] = "The cat is nine years old."
        fixed_rows.append(merged_kyuuku)

    if saw_shichi_nana:
        if merged_shichinana["Example Sentence JP"] is None:
            merged_shichinana["Example Sentence JP"] = "七転び八起き。"
            merged_shichinana["Example Sentence EN"] = "Fall seven times, stand up eight."
        fixed_rows.append(merged_shichinana)

    if saw_nanni:
        if merged_nanni["Example Sentence JP"] is None:
            merged_nanni["Example Sentence JP"] = "何？"
            merged_nanni["Example Sentence EN"] = "What?"
        fixed_rows.append(merged_nanni)

    # Step 5) Apply manual CORRECTIONS, e.g., Word Type changes for 半分, はい, 暇
    for row in fixed_rows:
        w = str(row["Word"]).strip()
        if w in CORRECTIONS:
            for col_name, corrected_value in CORRECTIONS[w].items():
                row[col_name] = corrected_value

    # Convert to DataFrame & save
    fixed_df = pd.DataFrame(fixed_rows, columns=df.columns)
    fixed_df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Saved corrected file to:", output_csv)


if __name__ == "__main__":
    input_csv_path = "N5_vocab_refined_examples_updated_again_again.csv" 
    output_csv_path = "N5_vocab_refined_examples_updated_again_again_again.csv"
    fix_n5_data(input_csv_path, output_csv_path)


Saved corrected file to: N5_vocab_refined_examples_updated_again_again_again.csv


In [38]:
# function to verify audio file creation

def sanitize_filename(filename: str) -> str:
    """
    Replace invalid file system characters with an underscore.
    Adjust the regex for your needs/operating system.
    """
    # Characters typically invalid on Windows: \ / : * ? " < > |
    return re.sub(r'[\\/*?:"<>|]', '_', filename)

def verify_audio_files(df):
    """
    Verifies that all expected audio files exist based on the sanitized file names.

    Args:
        df (pandas.DataFrame): DataFrame containing words (Grammar) and example sentences.

    Returns:
        list: A list of file paths that are missing (i.e., not found).
    """
    missing_files = []

    for index, row in df.iterrows():
        word = row.get("Word")
        example = row.get("Example Sentence JP")

        # If we have a word, sanitize it
        if pd.notna(word):
            safe_word = sanitize_filename(str(word))

            female_word_path = f"audio/words/female/{safe_word}.mp3"
            male_word_path = f"audio/words/male/{safe_word}.mp3"

            if not os.path.exists(female_word_path):
                missing_files.append(female_word_path)
            if not os.path.exists(male_word_path):
                missing_files.append(male_word_path)

        # If we have an example sentence, we also use the sanitized word
        if pd.notna(word) and pd.notna(example):
            safe_word = sanitize_filename(str(word))  # same as above

            female_example_path = f"audio/examples/female/{safe_word}_example.mp3"
            male_example_path = f"audio/examples/male/{safe_word}_example.mp3"

            if not os.path.exists(female_example_path):
                missing_files.append(female_example_path)
            if not os.path.exists(male_example_path):
                missing_files.append(male_example_path)

    return missing_files

In [39]:
# confirm creation of all audio files

# Load CSV
csv_file = "N5_vocab_refined_examples_updated_again_again_again.csv"
df = pd.read_csv(csv_file)

# Perform audio file verification
missing_files = verify_audio_files(df)

# Print results
if missing_files:
    print("\nMissing files:")
    for missing in missing_files:
        print(missing)
else:
    print("\nAll audio files were successfully generated.")


All audio files were successfully generated.
