In [45]:
import csv
import re
import json

## Grammar Content Creation

In [4]:
# Add JLPT Level, Grammar, Meaning headings and extract these columns only. Filter only N5 rows

input_file = "../jlpt_grammar.csv" 
output_file = "N5_grammar_extracted.csv"

# Define the new headings
new_headings = ["JLPT Level", "Grammar", "Meaning"]

with open(input_file, "r", encoding="utf-8-sig") as infile, open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Write the new headings
    writer.writerow(new_headings)
 
    for row in reader:
        if row[0].strip() == "N5":
            cleaned_row = [row[0], row[2], row[4]]  # 0: JLPT Level, 2: Grammar, 4: Meaning
            writer.writerow(cleaned_row)

print(f"Cleaned file saved to '{output_file}'.")


Cleaned file saved to 'N5_grammar_extracted.csv'.


In [5]:
# check for duplicate rows

all_rows = []
duplicates = []
seen = set()  # To track unique (Word, Reading) pairs

with open(output_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)

    header = next(reader, None)

    for line_num, row in enumerate(reader, start=2):
        # Skip empty rows
        if not row or all(cell.strip() == "" for cell in row):
            continue
        
        # Create a unique key based on Word and Reading (indices 0 and 1)
        key = (row[0].strip(), row[1].strip())  # Word and Reading
        if key in seen:
            duplicates.append((output_file, line_num, row)) 
        else:
            seen.add(key)

        all_rows.append(row)

if duplicates:
    print("Duplicates found:")
    for file, line, row in duplicates:
        print(f"File: {file}, Line: {line}, Duplicate Entry: {row}")
else:
    print("No duplicates found across all files.")

No duplicates found across all files.


In [6]:
# Add a readings column and add readings to each row

input_file = "N5_grammar_extracted.csv"
output_file = "N5_grammar_with_reading.csv"

# Grammar and their readings
grammar_readings = {
    "ちゃいけない・じゃいけない": "ちゃいけない / じゃいけない",
    "だ・です": "だ / です",
    "だけ": "だけ",
    "だろう": "だろう",
    "で": "で",
    "でも": "でも",
    "でしょう": "でしょう",
    "どんな": "どんな",
    "どうして": "どうして",
    "どうやって": "どうやって",
    "が": "が",
    "があります": "があります",
    "がほしい": "がほしい",
    "がいます": "がいます",
    "ほうがいい": "ほうがいい",
    "い-adjectives": "いけいようし",
    "一番": "いちばん",
    "一緒に": "いっしょに",
    "いつも": "いつも",
    "じゃない・ではない": "じゃない / ではない",
    "か": "か",
    "か〜か": "か〜か",
    "から": "から",
    "方": "かた",
    "けど": "けど",
    "けれども": "けれども",
    "まだ": "まだ",
    "まだ〜ていません": "まだ〜ていません",
    "まで": "まで",
    "前に": "まえに",
    "ませんか": "ませんか",
    "ましょう": "ましょう",
    "ましょうか": "ましょうか",
    "も": "も",
    "もう": "もう",
    "な-adjectives": "なけいようし",
    "なあ": "なあ",
    "ないで": "ないで",
    "ないでください": "ないでください",
    "なくてもいい": "なくてもいい",
    "なくちゃ": "なくちゃ",
    "なくてはいけない": "なくてはいけない",
    "なくてはならない": "なくてはならない",
    "なる": "なる",
    "んです": "んです",
    "ね": "ね",
    "に": "に",
    "にいく": "にいく",
    "にする": "にする",
    "に/へ": "に / へ",
    "の": "の",
    "のです": "のです",
    "のが下手": "のがへた",
    "のが上手": "のがじょうず",
    "のが好き": "のがすき",
    "の中で[A]が一番": "のなかで[A]がいちばん",
    "ので": "ので",
    "を": "を",
    "をください": "をください",
    "しかし": "しかし",
    "すぎる": "すぎる",
    "たことがある": "たことがある",
    "たい": "たい",
    "たり〜たり": "たり〜たり",
    "てある": "てある",
    "ている": "ている",
    "てから": "てから",
    "てください": "てください",
    "てはいけない": "てはいけない",
    "てもいいです": "てもいいです",
    "と": "と",
    "とき": "とき",
    "とても": "とても",
    "つもり": "つもり",
    "は": "は",
    "は〜より・・・です": "は〜より・・・です",
    "はどうですか": "はどうですか",
    "や": "や",
    "よ": "よ",
    "より〜ほうが": "より〜ほうが",
}

with open(input_file, "r", encoding="utf-8-sig") as infile, open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read the header and add the new column
    header = next(reader)
    header.insert(2, "Reading")  # Add Reading column after Grammar
    writer.writerow(header)

    for row in reader:
        grammar = row[1]
        reading = grammar_readings.get(grammar, "No reading available")
        row.insert(2, reading)
        writer.writerow(row)

print(f"File with Reading column saved to '{output_file}'.")


File with Reading column saved to 'N5_grammar_with_reading.csv'.


In [7]:
import requests
import time
import urllib.parse

In [8]:
# Add example sentences for each word in both Japanese and English

def get_word_type_and_sentences(word):
    jisho_url = f"https://jisho.org/api/v1/search/words?keyword={word}"
    
    # Always URL-encode the search term to avoid issues with special characters
    encoded_word = urllib.parse.quote(word)
    tatoeba_url = f"https://tatoeba.org/en/api_v0/search?query={encoded_word}&from=jpn&to=eng"

    try:
        # --- 1) Fetch word type from Jisho ---
        response = requests.get(jisho_url)
        response.raise_for_status()
        jisho_data = response.json()

        word_type = "Unknown"
        if jisho_data.get("data"):
            senses = jisho_data["data"][0].get("senses", [])
            if senses:
                parts = senses[0].get("parts_of_speech", [])
                if parts:
                    word_type = parts[0]

        # --- 2) Fetch example sentences from Tatoeba ---
        response = requests.get(tatoeba_url)
        response.raise_for_status()
        tatoeba_data = response.json()

        sentences = []
        for sentence in tatoeba_data.get("results", []):
            jp_sentence = sentence.get("text", "")
            
            # 'translations' is a list of lists. Flatten them:
            translations = sentence.get("translations", [])
            all_translations = []
            for sublist in translations:
                all_translations.extend(sublist)
            
            # Grab the first English translation if there is one
            en_sentence = ""
            if all_translations:
                en_sentence = all_translations[0].get("text", "")
            
            if jp_sentence and en_sentence:
                sentences.append((jp_sentence, en_sentence))

        return word_type, sentences

    except requests.RequestException as e:
        print(f"Request failed for {word}: {e}")
        return "Unknown", []
    except (IndexError, KeyError, AttributeError, TypeError) as e:
        print(f"Unexpected data structure for {word}: {e}")
        return "Unknown", []

In [9]:
# add example sentences for each word in both Japanese and English

input_csv = "N5_grammar_with_reading.csv"
output_csv = "N5_grammar_with_examples.csv"

with open(input_csv, "r", encoding="utf-8-sig") as infile, open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames + ["Word Type", "Example Sentence JP", "Example Sentence EN"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()

    for row in reader:
        word = row["Grammar"]
        word_type, examples = get_word_type_and_sentences(word)

        row["Word Type"] = word_type
        if examples:
            # Use the first example sentence
            row["Example Sentence JP"], row["Example Sentence EN"] = examples[0]
        else:
            row["Example Sentence JP"] = "No example available"
            row["Example Sentence EN"] = "No example available"

        writer.writerow(row)
        print(f"Processed: {word}")
        time.sleep(2)

print(f"Updated file with word types and example sentences saved to '{output_csv}'.")

Processed: ちゃいけない・じゃいけない
Processed: だ・です
Processed: だけ
Processed: だろう
Processed: で
Processed: でも
Processed: でしょう
Processed: どんな
Processed: どうして
Processed: どうやって
Processed: が
Processed: があります
Processed: がほしい
Processed: がいます
Processed: ほうがいい
Processed: い-adjectives
Processed: 一番
Processed: 一緒に
Processed: いつも
Processed: じゃない・ではない
Processed: か
Processed: か〜か
Processed: から
Processed: 方
Processed: けど
Processed: けれども
Processed: まだ
Processed: まだ〜ていません
Processed: まで
Processed: 前に
Processed: ませんか
Processed: ましょう
Processed: ましょうか
Processed: も
Processed: もう
Processed: な-adjectives
Processed: なあ
Processed: ないで
Processed: ないでください
Processed: なくてもいい
Processed: なくちゃ
Processed: なくてはいけない
Processed: なくてはならない
Processed: なる
Processed: んです
Processed: ね
Processed: に
Processed: にいく
Processed: にする
Processed: に/へ
Processed: の
Processed: のです
Processed: のが下手
Processed: のが上手
Processed: のが好き
Processed: の中で[A]が一番
Processed: ので
Processed: を
Processed: をください
Processed: しかし
Processed: すぎる
Processed: たことがある
Processed: たい

In [3]:
# replace difficult example sentences with simpler ones

replacements = {
    "「もう俺たちの邪魔をしないなら、今までのことは水に流してやってもいいけど？」「やけに寛大なんだな・・・」": (
        "「もう邪魔しないなら、今までのことは許してあげるよ。」「優しいんだね…」",
        "\"If you won't bother us anymore, I'll let bygones be bygones.\" \"You're quite kind...\""
    ),
    "馬鹿だろう！": (
        "馬鹿じゃないの？",
        "Aren't you an idiot?"
    ),
    "出ろ！": (
        "出て！",
        "Get out!"
    ),
    "死ね！": (
        "やめて！",
        "Stop it!"
    ),
    "死を忘れるな。": (
        "死を忘れないで。",
        "Don't forget about death."
    ),
    "笑ってはいけない。": (
        "笑わないでください。",
        "Don't laugh."
    ),
    "恥を知れ！": (
        "恥ずかしいよ！",
        "That's embarrassing!"
    ),

}

input_csv = "N5_grammar_with_examples.csv" 
output_csv = "N5_grammar_refined_examples.csv" 

with open(input_csv, "r", newline="", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", newline="", encoding="utf-8-sig") as outfile:

    reader = csv.DictReader(infile)

    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')

    writer.writeheader()
    
    for row in reader:
        jp = row.get("Example Sentence JP", "")
        
        # If JP sentence is in our dictionary, replace both JP and EN
        if jp in replacements:
            new_jp, new_en = replacements[jp]
            row["Example Sentence JP"] = new_jp
            row["Example Sentence EN"] = new_en
        
        writer.writerow(row)

print("Done! Updated CSV saved to:", output_csv)

Done! Updated CSV saved to: N5_grammar_refined_examples.csv


In [4]:
# check for missing columns

all_rows = []
header = None

with open(output_csv, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader)
        
    for line_num, row in enumerate(reader, start=2):
        # Check if row is empty
        if not row or all(cell.strip() == "" for cell in row):
            print(f"[INFO] {output_csv}, line {line_num}: Empty row skipped")
            continue
        # Check if row has exactly 7 columns:
        if len(row) != 7:
            print(f"[WARNING] {output_csv}, line {line_num}: Expected 6 columns, found {len(row)} => {row}")
        else:
            all_rows.append(row)

print(f"{output_csv} check complete")

N5_grammar_refined_examples.csv check complete


In [2]:
import pandas as pd
import os

In [9]:
# check how many words contain more than one example sentene

# Load CSV file
csv_file = "N5_grammar_refined_examples.csv"
df = pd.read_csv(csv_file)

# Define the column containing example sentences
example_column = "Example Sentence JP"

# Function to check if a row contains more than one example sentence
def has_multiple_examples(text):
    if pd.isna(text):  # Handle NaN values
        return False
    return len(text.split('」「')) > 1 

# Apply the function to the example sentence column
df["Multiple Examples"] = df[example_column].apply(has_multiple_examples)

# Filter rows with multiple examples
rows_with_multiple_examples = df[df["Multiple Examples"]]

# Display the result
print("Rows with more than one example sentence:")
print(rows_with_multiple_examples)

Rows with more than one example sentence:
   JLPT Level        Grammar          Reading  \
0          N5  ちゃいけない・じゃいけない  ちゃいけない / じゃいけない   
1          N5           だ・です           だ / です   
27         N5       まだ〜ていません         まだ〜ていません   

                               Meaning         Word Type  \
0        must not do (spoken Japanese)              Noun   
1   to be (am, is, are, were, used to)    Auxiliary verb   
27                        have not yet  Adverb (fukushi)   

                                  Example Sentence JP  \
0                「もう邪魔しないなら、今までのことは許してあげるよ。」「優しいんだね…」   
1        「そろそろ、人物写真でもやってみたらどうだ？」「え？スナップ・・・ですか？それは・・・」   
27  「変な日本語でごめんなさい。でも英語もできません」「ちょっと、聞いてもいい？君は日本人？」「...   

                                  Example Sentence EN  Multiple Examples  
0   "If you won't bother us anymore, I'll let bygo...               True  
1   "Isn't it about time you tried photographing p...               True  
27  "I am sorry for my odd Japanese. But I cannot ...             

In [10]:
# fix rows with more than one example

# Load CSV file
csv_file = "N5_grammar_refined_examples.csv"
df = pd.read_csv(csv_file)

# Define a dictionary of grammar points and new example sentences
replacements = {
    "ちゃいけない・じゃいけない": {
        "new_sentence_jp": "ここに入っちゃいけない。",
        "new_sentence_en": "You must not enter here."
    },
    "だ・です": {
        "new_sentence_jp": "これはペンです。",
        "new_sentence_en": "This is a pen."
    },
     "まだ〜ていません": {
        "new_sentence_jp": "私はまだ朝ごはんを食べていません。",
        "new_sentence_en": "I have not eaten breakfast yet."
    },
}

# Function to replace sentences based on the grammar point
def replace_sentence(row):
    grammar_point = row["Grammar"]
    if grammar_point in replacements:
        row["Example Sentence JP"] = replacements[grammar_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[grammar_point]["new_sentence_en"]
    return row

# Apply the replacement function to the DataFrame
df = df.apply(replace_sentence, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_grammar_refined_examples_updated.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_grammar_refined_examples_updated.csv


In [11]:
# check for grammar with no example

input_file = "N5_grammar_refined_examples_updated.csv"

with open(input_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader) 

    word_type_index = header.index("Word Type")
    example_jp_index = header.index("Example Sentence JP")
    example_en_index = header.index("Example Sentence EN")

    rows_with_issues = []
    for line_num, row in enumerate(reader, start=2):
        if (
            row[word_type_index] == "Unknown"
            or row[example_jp_index] == "No example available"
            or row[example_en_index] == "No example available"
        ):
            rows_with_issues.append((line_num, row))

if rows_with_issues:
    print(f"Found {len(rows_with_issues)} rows with issues:")
    for line_num, row in rows_with_issues:
        print(f"Line {line_num}: {row}")
else:
    print("No issues found in the file.")

Found 5 rows with issues:
Line 49: ['N5', 'にいく', 'にいく', 'go to do', 'Unknown', '私は山にいく。', 'I go to the mountain.']
Line 57: ['N5', 'の中で[A]が一番', 'のなかで[A]がいちばん', 'out of this group, [A] is best', 'Particle', 'No example available', 'No example available']
Line 65: ['N5', 'たり〜たり', 'たり〜たり', 'do such things as A and B', 'Particle', 'No example available', 'No example available']
Line 77: ['N5', 'は〜より・・・です', 'は〜より・・・です', '[A] is more ~ than [B]', 'Particle', 'No example available', 'No example available']
Line 81: ['N5', 'より〜ほうが', 'より〜ほうが', '[A] is more than [B]', 'Particle', 'No example available', 'No example available']


In [12]:
# add missing examples

# Load CSV
csv_file = "N5_grammar_refined_examples_updated.csv"
df = pd.read_csv(csv_file)

# Replacement dictionary
replacements = {
    "にいく": {
        "new_sentence_jp": "友達と映画を見に行く。",
        "new_sentence_en": "I go to watch a movie with my friends."
    },
    "の中で[A]が一番": {
        "new_sentence_jp": "果物の中でリンゴが一番好きです。",
        "new_sentence_en": "Out of all the fruits, I like apples the most."
    },
    "たり〜たり": {
        "new_sentence_jp": "休みの日は本を読んだり、音楽を聞いたりします。",
        "new_sentence_en": "On my days off, I do things like read books and listen to music."
    },
    "は〜より・・・です": {
        "new_sentence_jp": "犬は猫より大きいです。",
        "new_sentence_en": "Dogs are bigger than cats."
    },
    "より〜ほうが": {
        "new_sentence_jp": "海より山のほうが好きです。",
        "new_sentence_en": "I like mountains more than the sea."
    }
}

# Replace rows with new sentences
def replace_sentences(row):
    grammar_point = row["Grammar"]
    if grammar_point in replacements:
        row["Example Sentence JP"] = replacements[grammar_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[grammar_point]["new_sentence_en"]
    return row

# Apply replacement logic to each row
df = df.apply(replace_sentences, axis=1)

# Save updated CSV
updated_csv_file = "N5_grammar_refined_examples_updated_again.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_grammar_refined_examples_updated_again.csv


In [20]:
# fixing example sentences that do not in fact use the corresponding grammar point

# Load CSV file
csv_file = "N5_grammar_refined_examples_updated_again.csv"
df = pd.read_csv(csv_file)

# Replacement dictionary with corrections
replacements = {
    "だろう": {
        "new_sentence_jp": "明日は晴れるだろう。",
        "new_sentence_en": "It will probably be sunny tomorrow."
    },
    "で": {
        "new_sentence_jp": "バスで学校に行きます。",
        "new_sentence_en": "I go to school by bus."
    },
    "でしょう": {
        "new_sentence_jp": "彼は学生でしょう。",
        "new_sentence_en": "He is probably a student."
    },
    "がいます": {
        "new_sentence_jp": "庭に犬がいます。",
        "new_sentence_en": "There is a dog in the garden."
    },
    "じゃない・ではない": {
        "new_sentence_jp": "彼は学生じゃない。",
        "new_sentence_en": "He is not a student."
    },
    "はどうですか": {
        "new_sentence_jp": "この映画はどうですか。",
        "new_sentence_en": "How about this movie?"
    },
    "の中で[A]が一番": {
        "new_sentence_jp": "果物の中でリンゴが一番好きです。",
        "new_sentence_en": "Out of all the fruits, I like apples the most."
    },
    "なあ": {
        "new_sentence_jp": "この景色はきれいだなあ。",
        "new_sentence_en": "This scenery is beautiful, isn't it?"
    },
    "に": {
        "new_sentence_jp": "学校に行きます。",
        "new_sentence_en": "I go to school."
    },
    "のです": {
        "new_sentence_jp": "どうして行かなかったのですか？",
        "new_sentence_en": "Why didn’t you go?"
    },
    "すぎる": {
        "new_sentence_jp": "この本は難しすぎる。",
        "new_sentence_en": "This book is too difficult."
    },
    "よ": {
        "new_sentence_jp": "これは本当によ！",
        "new_sentence_en": "This is true, you know!"
    },
    "ないで": {
        "new_sentence_jp": "朝ごはんを食べないで学校に行きました。",
        "new_sentence_en": "I went to school without eating breakfast."
    },
     "か": {
        "new_sentence_jp": "これは何ですか？",
        "new_sentence_en": "What is this?"
    },
    "か〜か": {
        "new_sentence_jp": "コーヒーかお茶がいいですか？",
        "new_sentence_en": "Would you prefer coffee or tea?"
    },
    "から": {
        "new_sentence_jp": "今日は雨だから出かけません。",
        "new_sentence_en": "It’s raining today, so I won’t go out."
    },
    "方": {
        "new_sentence_jp": "漢字の書き方を教えてください。",
        "new_sentence_en": "Please teach me how to write kanji."
    },
    "も": {
        "new_sentence_jp": "私も学生です。",
        "new_sentence_en": "I am also a student."
    },
    "ね": {
        "new_sentence_jp": "きれいな景色ですね。",
        "new_sentence_en": "It’s a beautiful view, isn’t it?"
    },
    "の": {
        "new_sentence_jp": "これは私の本です。",
        "new_sentence_en": "This is my book."
    },
    "ので": {
        "new_sentence_jp": "疲れたので休みます。",
        "new_sentence_en": "I’m tired, so I’ll take a break."
    },
    "なる": {
        "new_sentence_jp": "先生になりたいです。",
        "new_sentence_en": "I want to become a teacher."
    },
    "どうして": {
        "new_sentence_jp": "どうして行かないの？",
        "new_sentence_en": "Why aren’t you going?"
    }
}

# Function to replace sentences based on the grammar point
def replace_sentences(row):
    grammar_point = row["Grammar"]
    if grammar_point in replacements:
        row["Example Sentence JP"] = replacements[grammar_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[grammar_point]["new_sentence_en"]
    return row

# Apply the replacement function to each row in the DataFrame
df = df.apply(replace_sentences, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_grammar_refined_examples_updated_again_again.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_grammar_refined_examples_updated_again_again.csv


In [22]:
# fixing example sentences that do not in fact use the corresponding grammar point

# Load CSV file
csv_file = "N5_grammar_refined_examples_updated_again_again.csv"
df = pd.read_csv(csv_file)

# Replacement dictionary with corrections
replacements = {
      "違う！": {
        "new_sentence_jp": "猫が好きです。",
        "new_sentence_en": "I like cats."
    },
    "痛！": {
        "new_sentence_jp": "このケーキは甘いです。",
        "new_sentence_en": "This cake is sweet."
    },
    "やけどしたの？": {
        "new_sentence_jp": "雨だけど、散歩に行きます。",
        "new_sentence_en": "It's raining, but I’m going for a walk."
    },
    "暇ですか？": {
        "new_sentence_jp": "３時まで暇です。",
        "new_sentence_en": "I'm free until three o'clock."
    },
    "何？": {
        "new_sentence_jp": "この部屋は静かですね。",
        "new_sentence_en": "This room is quiet, isn’t it?"
    },
    "禁煙です！": {
        "new_sentence_jp": "どうしたんですか？",
        "new_sentence_en": "What's the matter?"
    },
    "何するの？": {
        "new_sentence_jp": "飲み物はコーヒーにします。",
        "new_sentence_en": "I'll have coffee."
    },
    "恥ずかしいよ！": {
        "new_sentence_jp": "本を読みます。",
        "new_sentence_en": "I read a book."
    },
    "絶対！": {
        "new_sentence_jp": "日本に行きたいです。",
        "new_sentence_en": "I want to go to Japan."
    },
    "馬鹿げている！": {
        "new_sentence_jp": "今、本を読んでいるところです。",
        "new_sentence_en": "I'm reading a book now."
    },
    "笑わないでください。": {
        "new_sentence_jp": "笑ってはいけません。",
        "new_sentence_en": "You must not laugh."
    },
    "本当？": {
        "new_sentence_jp": "田中さんと行きます。",
        "new_sentence_en": "I'll go with Tanaka."
    },
    "時は金なり。": {
        "new_sentence_jp": "宿題をするとき、音楽を聞きます。",
        "new_sentence_en": "When I do my homework, I listen to music."
    },
    "早く！": {
        "new_sentence_jp": "私は学生です。",
        "new_sentence_en": "I am a student."
    },
    "嫌だ！": {
        "new_sentence_jp": "りんごやバナナを買いました。",
        "new_sentence_en": "I bought apples and bananas."
    },
    "これは本当によ！": {
        "new_sentence_jp": "これは本当だよ。",
        "new_sentence_en": "This is true, you know."
    }
}

# Function to replace sentences based on the grammar point
def replace_sentences(row):
    grammar_point = row["Example Sentence JP"]
    if grammar_point in replacements:
        row["Example Sentence JP"] = replacements[grammar_point]["new_sentence_jp"]
        row["Example Sentence EN"] = replacements[grammar_point]["new_sentence_en"]
    return row

# Apply the replacement function to each row in the DataFrame
df = df.apply(replace_sentences, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_file = "N5_grammar_refined_examples_updated_again_again_again.csv"
df.to_csv(updated_csv_file, index=False)

print(f"Updated CSV saved to {updated_csv_file}")

Updated CSV saved to N5_grammar_refined_examples_updated_again_again_again.csv


In [44]:
# updating the word type

word_type_map = {
    "ちゃいけない・じゃいけない": "Expression (Grammar)",
    "だ・です": "Auxiliary verb (Copula)",
    "だけ": "Particle",
    "だろう": "Auxiliary verb / Expression",
    "で": "Particle",
    "でも": "Conjunction",
    "でしょう": "Auxiliary verb / Expression",
    "どんな": "Pre-noun adjectival (連体詞)",
    "どうして": "Adverb (fukushi)",
    "どうやって": "Expression / Phrase",
    "が": "Particle",
    "があります": "Expression / Grammar",
    "がほしい": "Expression / Grammar",
    "がいます": "Expression / Grammar",
    "ほうがいい": "Expression (Grammar)",
    "い-adjectives": "Adjective category",
    "一番": "Noun / Adverbial",
    "一緒に": "Adverb (fukushi)",
    "いつも": "Adverb (fukushi)",
    "じゃない・ではない": "Auxiliary verb (negative copula)",
    "か": "Particle",
    "か〜か": "Particle",
    "から": "Particle",
    "方": "Noun (suffix usage)",
    "けど": "Conjunction",
    "けれども": "Conjunction",
    "まだ": "Adverb (fukushi)",
    "まだ〜ていません": "Expression (Grammar)",
    "まで": "Particle",
    "前に": "Expression / Grammar",
    "ませんか": "Expression / Grammar",
    "ましょう": "Auxiliary verb",
    "ましょうか": "Expression / Grammar",
    "も": "Particle",
    "もう": "Adverb (fukushi)",
    "な-adjectives": "Adjective category",
    "なあ": "Particle",
    "ないで": "Conjunction / Grammar",
    "ないでください": "Expression / Grammar",
    "なくてもいい": "Expression / Grammar",
    "なくちゃ": "Expression / Grammar",
    "なくてはいけない": "Expression / Grammar",
    "なくてはならない": "Expression / Grammar",
    "なる": "Godan verb",
    "んです": "Expression / Grammar",
    "ね": "Particle",
    "に": "Particle",
    "にいく": "Expression / Grammar",
    "にする": "Expression / Grammar",
    "に/へ": "Particle",
    "の": "Particle",
    "のです": "Expression / Grammar",
    "のが下手": "Expression / Grammar",
    "のが上手": "Expression / Grammar",
    "のが好き": "Expression / Grammar",
    "の中で[A]が一番": "Expression / Grammar",
    "ので": "Particle",
    "を": "Particle",
    "をください": "Expression / Grammar",
    "しかし": "Conjunction",
    "すぎる": "Ichidan verb",
    "たことがある": "Expression / Grammar",
    "たい": "Auxiliary (Grammar)",
    "たり〜たり": "Expression / Particle",
    "てある": "Expression / Grammar",
    "ている": "Expression / Grammar",
    "てから": "Expression / Grammar",
    "てください": "Expression / Grammar",
    "てはいけない": "Expression / Grammar",
    "てもいいです": "Expression / Grammar",
    "と": "Particle",
    "とき": "Noun",
    "とても": "Adverb (fukushi)",
    "つもり": "Noun",
    "は": "Particle",
    "は〜より・・・です": "Expression / Grammar",
    "はどうですか": "Expression / Grammar",
    "や": "Particle",
    "よ": "Particle",
    "より〜ほうが": "Expression / Grammar",
}

# 2. Read your CSV into a DataFrame
input_csv = "N5_grammar_refined_examples_updated_again_again_again.csv" 
df = pd.read_csv(input_csv)

# 3. Update "Word Type" for each row based on "Grammar"
#    We'll use DataFrame.apply or we can do a direct map
df["Word Type"] = df.apply(
    lambda row: word_type_map.get(row["Grammar"], row["Word Type"]), axis=1
)

# 4. Write out a new CSV with revised data
output_csv = "N5_grammar_refined_examples_updated_again_again_again_again.csv"
df.to_csv(output_csv, index=False)

print(f"Done! Updated CSV saved to '{output_csv}'.")


Done! Updated CSV saved to 'N5_grammar_refined_examples_updated_again_again_again_again.csv'.


## Auido Creation

In [41]:
# function to verify audio file creation

def sanitize_filename(filename: str) -> str:
    """
    Replace invalid file system characters with an underscore.
    Adjust the regex for your needs/operating system.
    """
    # Characters typically invalid on Windows: \ / : * ? " < > |
    return re.sub(r'[\\/*?:"<>|]', '_', filename)

def verify_audio_files(df):
    """
    Verifies that all expected audio files exist based on the sanitized file names.

    Args:
        df (pandas.DataFrame): DataFrame containing words (Grammar) and example sentences.

    Returns:
        list: A list of file paths that are missing (i.e., not found).
    """
    missing_files = []

    for index, row in df.iterrows():
        word = row.get("Grammar")
        example = row.get("Example Sentence JP")

        # If we have a word (grammar pattern), sanitize it
        if pd.notna(word):
            safe_word = sanitize_filename(str(word))

            female_word_path = f"audio/words/female/{safe_word}.mp3"
            male_word_path = f"audio/words/male/{safe_word}.mp3"

            if not os.path.exists(female_word_path):
                missing_files.append(female_word_path)
            if not os.path.exists(male_word_path):
                missing_files.append(male_word_path)

        # If we have an example sentence, we also use the sanitized word
        if pd.notna(word) and pd.notna(example):
            safe_word = sanitize_filename(str(word))  # same as above

            female_example_path = f"audio/examples/female/{safe_word}_example.mp3"
            male_example_path = f"audio/examples/male/{safe_word}_example.mp3"

            if not os.path.exists(female_example_path):
                missing_files.append(female_example_path)
            if not os.path.exists(male_example_path):
                missing_files.append(male_example_path)

    return missing_files


In [42]:
# confirm creation of all audio files

# Load CSV
csv_file = "N5_grammar_refined_examples_updated_again_again_again_again.csv"
df = pd.read_csv(csv_file)

# Perform audio file verification
missing_files = verify_audio_files(df)

# Print results
if missing_files:
    print("\nMissing files:")
    for missing in missing_files:
        print(missing)
else:
    print("\nAll audio files were successfully generated.")


All audio files were successfully generated.


## Sentence Breakdown Generation

In [3]:
# check that all breakdowns were successfully generated

df = pd.read_csv('N5_grammar_with_breakdowns.csv')

def breakdown_status(breakdown_json):
    if pd.isnull(breakdown_json) or breakdown_json.strip() == "":
        return "missing"
    try:
        data = json.loads(breakdown_json)
        if "error" in data:
            return "error"
    except Exception:
        return "malformed"
    return "ok"

# Create a new column to indicate the status.
df['breakdown_status'] = df['breakdown'].apply(breakdown_status)

# Count the statuses
status_counts = df['breakdown_status'].value_counts()
print("Breakdown status counts:")
print(status_counts)

if status_counts.get("missing", 0) > 0 or status_counts.get("error", 0) > 0 or status_counts.get("malformed", 0) > 0:
    print("\nRows with issues:")
    print(df[df['breakdown_status'] != "ok"][['Example Sentence JP', 'Example Sentence EN', 'breakdown', 'breakdown_status']])
else:
    print("All rows have been properly broken down and parsed!")

Breakdown status counts:
breakdown_status
malformed    80
Name: count, dtype: int64

Rows with issues:
   Example Sentence JP                  Example Sentence EN  \
0         ここに入っちゃいけない。             You must not enter here.   
1             これはペンです。                       This is a pen.   
2                 私だけ？                       Is it just me?   
3           明日は晴れるだろう。  It will probably be sunny tomorrow.   
4          バスで学校に行きます。               I go to school by bus.   
..                 ...                                  ...   
75         犬は猫より大きいです。           Dogs are bigger than cats.   
76         この映画はどうですか。                How about this movie?   
77      りんごやバナナを買いました。         I bought apples and bananas.   
78            これは本当だよ。              This is true, you know.   
79       海より山のほうが好きです。  I like mountains more than the sea.   

                                            breakdown breakdown_status  
0   {"vocabulary": [{"word": "ここ", "reading": "ここ"...        malfor