In [2]:
import csv
import os

In [81]:
# Filter the public JLPT vocab file down to the N5 rows only.
# Reads `input_file` and writes its header plus every row whose
# "JLPT Level" cell is "N5" to `temp_file`.

input_file = "../jlpt_vocab.csv"
temp_file = "N5_vocab_filtered.csv"

# Not used by the filter itself; the renaming cell writes these as the new header.
new_column_names = ["Word", "Reading", "Meaning", "JLPT"]

# newline="" on both files, as the csv module documentation recommends, so
# that newlines embedded in quoted fields are interpreted correctly.
with open(input_file, "r", encoding="utf-8-sig", newline="") as infile, \
     open(temp_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    header = next(reader)
    writer.writerow(header)

    # Raises ValueError if the source file has no "JLPT Level" column.
    jlpt_column_index = header.index("JLPT Level")

    for row in reader:
        # csv.reader yields [] for blank lines (and short lists for truncated
        # rows); indexing those would raise IndexError, so skip them.
        if len(row) <= jlpt_column_index:
            continue
        if row[jlpt_column_index].strip() == "N5":
            writer.writerow(row)

print(f"Filtered rows with JLPT Level 'N5' saved to temporary file '{temp_file}'.")

Filtered rows with JLPT Level 'N5' saved to temporary file 'N5_vocab_filtered.csv'.


In [82]:
# Replace the header of the filtered file with Word, Reading, Meaning and JLPT,
# write the result to `output_file`, then delete the temporary file.

output_file = "N5_vocab.csv"

new_column_names = ["Word", "Reading", "Meaning", "JLPT"]

with open(temp_file, "r", encoding="utf-8-sig") as infile, \
     open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Consume the original header so it is not copied into the output.
    header = next(reader)

    # New header first, then every data row passed through untouched.
    writer.writerow(new_column_names)
    writer.writerows(reader)

# The intermediate file is no longer needed once the renamed copy exists.
os.remove(temp_file)

print(f"Column names updated and output saved to '{output_file}'.")

Column names updated and output saved to 'N5_vocab.csv'.


In [93]:
# Check `output_file` for rows that repeat an earlier (Word, Reading) pair.

all_rows, duplicates = [], []
seen = set()  # (Word, Reading) pairs encountered so far

with open(output_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)

    # Skip the header; None if the file is empty.
    header = next(reader, None)

    # Data starts on file line 2, right after the header.
    for line_num, row in enumerate(reader, start=2):
        # Ignore rows that are empty or contain only whitespace cells.
        if not any(cell.strip() for cell in row):
            continue

        # Columns 0 and 1 hold Word and Reading.
        key = (row[0].strip(), row[1].strip())
        if key not in seen:
            seen.add(key)
        else:
            duplicates.append((output_file, line_num, row))

        all_rows.append(row)

if not duplicates:
    print("No duplicates found across all files.")
else:
    print("Duplicates found:")
    for file, line, row in duplicates:
        print(f"File: {file}, Line: {line}, Duplicate Entry: {row}")

No duplicates found across all files.


In [118]:
import requests
import csv
import time
import urllib.parse
import json

In [119]:
# function that looks up the word type and example sentences (Japanese and English) for a word

def get_word_type_and_sentences(word, timeout=10):
    """Look up a word's part of speech and example sentences.

    Queries the Jisho word-search API for the part of speech and the Tatoeba
    search API (``from=jpn&to=eng``) for sentences with translations.

    Args:
        word: Search term. It is percent-encoded before being placed in
            either URL.
        timeout: Seconds to wait for each HTTP response. A timeout surfaces
            as a ``requests`` exception and is handled like any other
            request failure.

    Returns:
        A tuple ``(word_type, sentences)``:

        * ``word_type`` is the first part of speech of the first sense of the
          first Jisho result, or ``"Unknown"`` when none is present.
        * ``sentences`` is a list of ``(japanese, translation)`` string pairs,
          one per Tatoeba result that has both a text and at least one
          translation with text. The first translation returned is used.

        If a request fails or a response does not have the expected shape,
        a message is printed and ``("Unknown", [])`` is returned.
    """
    # Encode once and use it for BOTH services, so characters such as
    # '&', ';', '#' or spaces in the word cannot corrupt the query string.
    encoded_word = urllib.parse.quote(word)
    jisho_url = f"https://jisho.org/api/v1/search/words?keyword={encoded_word}"
    tatoeba_url = f"https://tatoeba.org/en/api_v0/search?query={encoded_word}&from=jpn&to=eng"

    try:
        # --- 1) Fetch word type from Jisho ---
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(jisho_url, timeout=timeout)
        response.raise_for_status()
        jisho_data = response.json()

        word_type = "Unknown"
        if jisho_data.get("data"):
            senses = jisho_data["data"][0].get("senses", [])
            if senses:
                parts = senses[0].get("parts_of_speech", [])
                if parts:
                    word_type = parts[0]

        # --- 2) Fetch example sentences from Tatoeba ---
        response = requests.get(tatoeba_url, timeout=timeout)
        response.raise_for_status()
        tatoeba_data = response.json()

        sentences = []
        for sentence in tatoeba_data.get("results", []):
            jp_sentence = sentence.get("text", "")

            # 'translations' is a list of lists; flatten it and take the
            # text of the first entry, if there is one.
            all_translations = [
                translation
                for sublist in sentence.get("translations", [])
                for translation in sublist
            ]
            en_sentence = all_translations[0].get("text", "") if all_translations else ""

            if jp_sentence and en_sentence:
                sentences.append((jp_sentence, en_sentence))

        return word_type, sentences

    except requests.RequestException as e:
        print(f"Request failed for {word}: {e}")
        return "Unknown", []
    except (IndexError, KeyError, AttributeError, TypeError, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose JSON decode error is not a RequestException subclass.
        print(f"Unexpected data structure for {word}: {e}")
        return "Unknown", []


In [120]:
# Enrich every vocabulary row with its word type and one example sentence
# (Japanese plus English), writing the result to a new CSV.

input_csv = "N5_vocab.csv"
output_csv = "N5_vocab_with_examples.csv"

with open(input_csv, "r", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.DictReader(infile)
    # Original columns followed by the three new ones.
    fieldnames = [*reader.fieldnames, "Word Type", "Example Sentence JP", "Example Sentence EN"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()

    for row in reader:
        word = row["Word"]
        word_type, examples = get_word_type_and_sentences(word)

        # Only the first example is kept; a placeholder marks words without one.
        if examples:
            example_jp, example_en = examples[0]
        else:
            example_jp = example_en = "No example available"

        row.update({
            "Word Type": word_type,
            "Example Sentence JP": example_jp,
            "Example Sentence EN": example_en,
        })

        writer.writerow(row)
        print(f"Processed: {word}")
        # Pause between words so the remote APIs are not hammered.
        time.sleep(2)

print(f"Updated file with word types and example sentences saved to '{output_csv}'.")

Processed: ああ
Processed: 会う
Processed: 青
Processed: 青い
Processed: 赤
Processed: 赤い
Processed: 明るい
Processed: 秋
Processed: 開く
Processed: 開ける
Processed: 上げる
Processed: 朝
Processed: 朝御飯
Processed: 明後日
Processed: 足; 脚
Processed: 明日
Processed: あそこ
Processed: 遊ぶ
Processed: 暖かい
Processed: 頭
Processed: 新しい
Processed: あちら
Processed: 暑い
Processed: 熱い
Processed: 厚い
Processed: あっち
Processed: 後
Processed: あなた
Processed: 兄
Processed: 姉
Processed: アパート
Processed: あの
Processed: 浴びる
Processed: 危ない
Processed: 甘い
Processed: 余り
Processed: 雨
Processed: 飴
Processed: 洗う
Processed: 在る
Processed: 有る
Processed: 歩く
Processed: あれ
Processed: いい; よい
Processed: いいえ
Processed: 言う
Processed: 家
Processed: いかが
Processed: 行く
Processed: いくつ
Processed: いくら
Processed: 池
Processed: 医者
Processed: 椅子
Processed: 忙しい
Processed: 痛い
Processed: 一
Processed: 一日
Processed: 一番
Processed: いつ
Processed: 五日
Processed: 一緒
Processed: 五つ
Processed: いつも
Processed: 犬
Processed: 今
Processed: 意味
Processed: 妹
Processed: 嫌
Processed: 入口
Processed:

In [122]:
# List every row that still carries a placeholder: an "Unknown" word type
# or a "No example available" example sentence.

input_file = "N5_vocab_with_examples.csv"

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader)

    # Positions of the three columns that may hold a placeholder.
    word_type_index = header.index("Word Type")
    example_jp_index = header.index("Example Sentence JP")
    example_en_index = header.index("Example Sentence EN")

    # (column index, placeholder value) pairs, checked in this order.
    placeholder_checks = (
        (word_type_index, "Unknown"),
        (example_jp_index, "No example available"),
        (example_en_index, "No example available"),
    )

    rows_with_issues = []
    # Data starts on file line 2, right after the header.
    for line_num, row in enumerate(reader, start=2):
        if any(row[index] == marker for index, marker in placeholder_checks):
            rows_with_issues.append((line_num, row))

if not rows_with_issues:
    print("No issues found in the file.")
else:
    print(f"Found {len(rows_with_issues)} rows with issues:")
    for line_num, row in rows_with_issues:
        print(f"Line {line_num}: {row}")


Found 14 rows with issues:
Line 2: ['ああ', 'ああ', 'Ah!, Oh!', 'N5', 'Unknown', 'ああ！', 'Ah!']
Line 14: ['朝御飯', 'あさごはん', 'breakfast', 'N5', 'Noun', 'No example available', 'No example available']
Line 16: ['足; 脚', 'あし', 'foot; leg', 'N5', 'Noun', 'No example available', 'No example available']
Line 46: ['いいえ', 'いいえ', 'no, not at all', 'N5', 'Unknown', 'いいえ？', 'No?']
Line 92: ['ええ', 'ええ', 'yes', 'N5', 'Unknown', 'ええ。', 'Yes.']
Line 111: ['伯父; 叔父さん', 'おじさん', 'uncle, middle-aged man', 'N5', 'Noun', 'No example available', 'No example available']
Line 129: ['伯母さん; 叔母さん', 'おばさん', 'aunt', 'N5', 'Noun', 'No example available', 'No example available']
Line 182: ['～がる', '～がる', 'feel', 'N5', 'Unknown', '何で強がるの？', 'Why are you pretending to be tough?']
Line 275: ['さあ', 'さあ', 'come now, well', 'N5', 'Unknown', 'さあね。', 'You can search me!']
Line 451: ['どうも', 'どうも', 'Thank you; somehow; no matter how hard one may try', 'N5', 'Unknown', 'どうも！', 'Thanks!']
Line 521: ['～杯', '～はい', 'counter for cupfuls', 'N

In [123]:
# Fill the rows flagged as incomplete with hand-written data.
# The table is specific to this dataset: keys are 1-based line numbers in
# `input_file` (the header is line 1), values are the full replacement rows.

input_file = "N5_vocab_with_examples.csv"
output_file = "N5_vocab_filled.csv"

substitutions = {
    2: ["ああ", "ああ", "Ah!, Oh!", "N5", "Interjection", "ああ、疲れた！", "Ah, I’m tired!"],
    14: ["朝御飯", "あさごはん", "breakfast", "N5", "Noun", "朝御飯にパンを食べます。", "I eat bread for breakfast."],
    16: ["足; 脚", "あし", "foot; leg", "N5", "Noun", "足が速いです。", "My legs are fast."],
    46: ["いいえ", "いいえ", "no, not at all", "N5", "Interjection", "いいえ、大丈夫です。", "No, I’m fine."],
    92: ["ええ", "ええ", "yes", "N5", "Interjection", "ええ、分かりました。", "Yes, I understand."],
    111: ["伯父; 叔父さん", "おじさん", "uncle, middle-aged man", "N5", "Noun", "おじさんは家にいます。", "My uncle is at home."],
    129: ["伯母さん; 叔母さん", "おばさん", "aunt", "N5", "Noun", "おばさんが料理を作っています。", "My aunt is cooking."],
    182: ["～がる", "～がる", "feel", "N5", "Auxiliary verb", "彼は寒がっています。", "He’s feeling cold."],
    275: ["さあ", "さあ", "come now, well", "N5", "Interjection", "さあ、始めましょう！", "Well, let’s get started!"],
    451: ["どうも", "どうも", "Thank you; somehow", "N5", "Adverb / Interjection", "どうもありがとうございました！", "Thank you very much!"],
    521: ["～杯", "～はい", "counter for cupfuls", "N5", "Counter", "ビールを一杯ください。", "One glass of beer, please."],
    575: ["昼御飯", "ひるごはん", "lunch, midday meal", "N5", "Noun", "昼御飯にカレーを食べました。", "I ate curry for lunch."],
    661: ["もしもし", "もしもし", "Hello? (used on the phone)", "N5", "Interjection", "もしもし、田中さんですか？", "Hello, is this Tanaka-san?"],
    698: ["ラジオカセ", "ラジオカセ", "radio cassette player", "N5", "Noun", "ラジオカセで音楽を聞きます。", "I listen to music on the radio cassette player."],
}

with open(input_file, "r", encoding="utf-8-sig") as infile, \
     open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Numbering starts at 1 so the header counts as line 1, matching the keys above.
    for line_num, row in enumerate(reader, start=1):
        # Use the hand-written row when one exists for this line, else the original.
        writer.writerow(substitutions.get(line_num, row))

print(f"Missing data filled and saved to {output_file}.")

Missing data filled and saved to N5_vocab_filled.csv.


In [126]:
# Re-run the placeholder scan on the filled file to confirm that no
# "Unknown" word type or "No example available" sentence remains.

input_file = "N5_vocab_filled.csv"

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader)

    # Positions of the three columns that may hold a placeholder.
    word_type_index = header.index("Word Type")
    example_jp_index = header.index("Example Sentence JP")
    example_en_index = header.index("Example Sentence EN")

    # (column index, placeholder value) pairs, checked in this order.
    placeholder_checks = (
        (word_type_index, "Unknown"),
        (example_jp_index, "No example available"),
        (example_en_index, "No example available"),
    )

    rows_with_issues = []
    # Data starts on file line 2, right after the header.
    for line_num, row in enumerate(reader, start=2):
        if any(row[index] == marker for index, marker in placeholder_checks):
            rows_with_issues.append((line_num, row))

if not rows_with_issues:
    print("No issues found in the file.")
else:
    print(f"Found {len(rows_with_issues)} rows with issues:")
    for line_num, row in rows_with_issues:
        print(f"Line {line_num}: {row}")

No issues found in the file.


In [127]:
# Check that every data row of `input_file` has the expected number of columns.
# NOTE: `input_file` is the path assigned by an earlier cell; this cell does
# not set it itself.

all_rows = []
header = None

# Word, Reading, Meaning, JLPT plus Word Type and the two example-sentence columns.
expected_columns = 7

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader)

    # Data starts on file line 2, right after the header.
    for line_num, row in enumerate(reader, start=2):
        # Rows that are empty or whitespace-only are reported and skipped.
        if not row or all(cell.strip() == "" for cell in row):
            print(f"[INFO] {input_file}, line {line_num}: Empty row skipped")
            continue
        # Report the file actually being read and the count actually being
        # checked (the old message named `output_file` and said "6").
        if len(row) != expected_columns:
            print(f"[WARNING] {input_file}, line {line_num}: Expected {expected_columns} columns, found {len(row)} => {row}")
        else:
            all_rows.append(row)

print(f"{input_file} check complete")

N5_vocab_filled.csv check complete


In [4]:
# Swap example sentences that are too hard for the JLPT level for simpler ones.
# `replacements` maps an existing Japanese example sentence (exact match) to
# the (Japanese, English) pair that should replace it.

input_csv = "N5_vocab_filled.csv"
output_csv = "N5_vocab_refined_examples.csv"

replacements = {
    "本を開くな。": ("本を開けないで。", "Don't open your book."),
    "窓を開けるな。": ("窓を開けないで。", "Don't open the window."),
    "頭を上げるなよ。": ("頭を上げないで。", "Don't raise your head."),
    "ここで遊ぶな。": ("ここで遊ばないで。", "Don't play here."),
    "分厚い本だな。": ("厚い本だな。", "That book is thick."),
    "後は任せた！": ("後はお願いね！", "The rest is up to you!"),
    "桃は甘い。": ("りんごは甘い。", "Apples are sweet."),
    "洗う必要がある。": ("洗ってください。", "Please wash it."),
    "神は存在する。": ("神はいるよ。", "God exists."),
    "よく言うよ。": ("そんなこと言わないで。", "Don't say that."),
    "退屈だなあ。": ("つまらないなあ。", "I'm bored."),
    "入池禁止。": ("池に入らないでください。", "Please don't enter the pond."),
    "猛犬注意！": ("犬に注意！", "Beware of the dog!"),
    "入口の側で立った。": ("入口の前に立った。", "He stood in front of the entrance."),
    "奴ら、中に居るの？": ("彼ら、中にいるの？", "Are they inside?"),
    "上出来！": ("よくできたね！", "Well done!"),
    "あの人は影が薄い。": ("あの人はあまり目立たない。", "He doesn't stand out much."),
    "大海の一滴。": ("海は広い。", "The sea is vast."),
    "俺の絵だよ。": ("私の絵だよ。", "This is my painting."),
    "東京駅なう。": ("東京駅にいるよ。", "I'm at Tokyo Station now."),
    "疑問が多い。": ("質問が多い。", "I have a lot of questions."),
    "通路に物を置くな。": ("通路に物を置かないで。", "Don't put your things in the passage."),
    "彼に教えるな。": ("彼に教えないで。", "Don't tell him."),
    "ゲス男。": ("ひどい男。", "What a jerk."),
    "歴史は繰り返す。": ("同じことが起きるんだ。", "The same thing happens again."),
    "妊娠４か月です。": ("今は四ヶ月目です。", "I'm four months along."),
    "嘘も方便。": ("嘘はよくないよ。", "Lies aren't good."),
    "氷山の一角だよ。": ("ほんの一部だけだよ。", "That's just a small part of it."),
    "河童の川流れ。": ("誰でも失敗はある。", "Anyone can make mistakes."),
    "女性優先。": ("女性が先だよ。", "Ladies go first."),
    "読書の選択やまた読書の仕方について学生達から質問を受けたことが度々ある。これに対する自分の答はいつも不得要領に終る外はなかった。如何なる人に如何なる恋をしたらいいかと聞かれるのと大した相違はないような気がする。": (
        "授業が終わった。",
        "Class has ended.",
    ),
    "猫に九生あり。": ("猫は丈夫だね。", "Cats are tough."),
    "そろそろ切るぞ？": ("そろそろ切るね。", "Shall we hang up now?"),
    "恩に着るよ。": ("本当に助かるよ。", "You're really helping me out."),
    "牛乳が腐った。": ("牛乳が悪くなった。", "The milk has gone bad."),
    "一部のプログラマーは忘れているようだが、良いドキュメントを書くということは、良いプログラムを書くことよりも重要とは言わないまでも、同じ程度には重要なことなのだ。": (
        "文書を書くのも大切だよ。",
        "Writing documentation is important too.",
    ),
    "白先黒死。": ("最後まで勝負は分からない。", "You never know who'll win until the end."),
    "彼は腹黒い。": ("彼は性格が悪い。", "He’s nasty."),
    "明日の朝九時から胃の検査を行いますので、前日の夜九時以降は何も飲んだり食べたりしないでください。": (
        "明日の朝九時から検査があるので、夜九時以降は何も食べないで。",
        "There's a test at 9 a.m. tomorrow, so please don't eat after 9 p.m. tonight.",
    ),
    "この野郎！": ("こら！", "Hey!"),
    "映画館は、インターネット上に出回る違法コピーのせいで、収益をますます失いつつある。": (
        "映画館は大変だね。",
        "Movie theaters are having a tough time.",
    ),
    "頭を下げろ！": ("頭を下げて！", "Lower your head!"),
    "死ぬな！": ("死なないで！", "Don't die!"),
    "彼は生き字引と呼ばれている。": ("彼は物知りだね。", "He's really knowledgeable."),
    "自分でおやり！": ("自分でやって！", "Do it yourself!"),
    "近所の神社の境内には、樹齢800年といわれる銀杏の大木がある。": (
        "近所の神社に大きな木があるよ。",
        "There’s a big tree at the local shrine.",
    ),
    "十人十色。": ("人それぞれだね。", "Everyone’s different."),
    "中古だよ。": ("これは古いものだよ。", "It's used."),
    "鯔背だね。": ("かっこいいね。", "He looks cool."),
    "口を出すな。": ("口を出さないで。", "Don't butt in."),
    "貧乏人の子沢山。": ("子供がたくさんいるね。", "They have a lot of kids."),
    "立ちなさい。": ("立ってください。", "Please stand up."),
    "林に竹が目立つ。": ("林に竹がある。", "There's bamboo in the woods."),
    "王手！": ("やった！", "I did it!"),
    "格好つけるな。": ("格好つけないで。", "Don't show off."),
    "彼はコンサルティングファームに勤める。": ("彼は会社に勤めている。", "He works at a company."),
    "立ち止まるな。": ("立ち止まらないで。", "Don't stop moving."),
    "一石二鳥": ("一回で二つできる。", "Two things done at once."),
    "弱肉強食。": ("強い者が勝つ。", "The strong win."),
    "佳人薄命": ("綺麗な人は大変だね。", "Beautiful people have it tough."),
    "春は曙。": ("春はいいね。", "Spring is nice."),
    "図に乗るな。": ("調子に乗らないで。", "Don't push your luck."),
    "働くしかない。": ("働かないとだめだ。", "We have to work."),
    "五十歩百歩。": ("大差ないね。", "There’s not much difference."),
    "お前はトイレだ！": ("そこはトイレだよ。", "That's the bathroom."),
    "母は頭が古い。": ("母は考え方が古いね。", "My mom's way of thinking is old-fashioned."),
    "税金を払うようになって、初めて人生が始まる。": ("社会人になって、初めて人生が始まる。", "Life begins when you enter society."),
    "二つ折り厳禁。": ("二つに折らないで。", "Don't fold it in half."),
    "ガソリンが無くなりつつあります。": ("ガソリンがなくなりそう。", "We're running out of gas."),
    "私が出すの頼んでおいた手紙にさ、切手貼るの忘れないでよ。": (
        "手紙に切手を貼るのを忘れないでね。",
        "Don't forget to put stamps on the letters.",
    ),
    "彼女は木を抱き締める人だ。": ("彼女は自然が大好きなんだね。", "She really loves nature."),
    "彼は素早い。": ("彼は動きが速いね。", "He moves quickly."),
    "バスケ得意？": ("バスケ上手？", "Are you good at basketball?"),
    "トムが葉書を送ってきたんだ。": ("トムが葉書を送ったよ。", "Tom sent me a postcard."),
    "母似です。": ("母に似ているよ。", "I look like my mother."),
    "二日酔いだ。": ("お酒を飲みすぎた。", "I drank too much."),
    "お飲み物はいかが？": ("飲み物いる？", "Want something to drink?"),
    "彼は二枚目だ。": ("彼はかっこいい。", "He is handsome."),
    "まっすぐ行け。": ("まっすぐ行って。", "Go straight ahead."),
    "三つ子の魂百まで。": ("子供のころの性格は変わらない。", "A person's nature rarely changes."),
    "旅は道連れ。": ("一人より一緒のほうが楽しい。", "It's more fun with company."),
    "死を忘れるな。": ("死を忘れないで。", "Don't forget about death."),
    "悪い子ね！": ("いけない子ね！", "You're being naughty!"),
    "まん丸に見えるあのお月さまはね、実は円錐の底面なんだよ。": (
        "お月さまが丸く見えるね。",
        "The moon looks round.",
    ),
}

with open(input_csv, "r", newline="", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", newline="", encoding="utf-8-sig") as outfile:

    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')

    # The output keeps the input's header unchanged.
    writer.writeheader()

    for row in reader:
        # Only an exact match on the Japanese sentence triggers a swap.
        replacement = replacements.get(row["Example Sentence JP"])
        if replacement is not None:
            row["Example Sentence JP"], row["Example Sentence EN"] = replacement

        # Every row is written, whether or not it was modified.
        writer.writerow(row)

print("Done! Your updated CSV is:", output_csv)

Done! Your updated CSV is: N5_vocab_refined_examples.csv
