In [2]:
import csv

In [4]:
# Extract the JLPT Level, Grammar and Meaning columns for N5 rows only and
# write them out under explicit headings.
#
# Any row whose first column is not "N5" (after stripping whitespace) is
# dropped, which includes an input header row if there is one.

input_file = "../jlpt_grammar.csv"
output_file = "N5_grammar_extracted.csv"

# Define the new headings
new_headings = ["JLPT Level", "Grammar", "Meaning"]

# Positions of those three fields in the source file
level_col, grammar_col, meaning_col = 0, 2, 4

# newline="" on both files is what the csv module documents for correct
# handling of newlines embedded in quoted fields.
with open(input_file, "r", encoding="utf-8-sig", newline="") as infile, \
     open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Write the new headings
    writer.writerow(new_headings)

    for row in reader:
        # csv.reader yields [] for a blank line; indexing it would raise IndexError
        if not row or row[level_col].strip() != "N5":
            continue

        # An N5 row that is too short cannot supply the Meaning column:
        # report it rather than crash or write a truncated record.
        if len(row) <= meaning_col:
            print(f"[WARNING] {input_file}, line {reader.line_num}: expected at least {meaning_col + 1} columns, found {len(row)} => {row}")
            continue

        cleaned_row = [row[level_col], row[grammar_col], row[meaning_col]]
        writer.writerow(cleaned_row)

print(f"Cleaned file saved to '{output_file}'.")


Cleaned file saved to 'N5_grammar_extracted.csv'.


In [5]:
# Scan the file named by `output_file` for repeated entries.
#
# Two rows count as the same entry when their first two columns match after
# whitespace is stripped. Reported line numbers start at 2 for the first row
# after the header.

all_rows = []
duplicates = []
seen = set()  # (stripped column 0, stripped column 1) keys already encountered

with open(output_file, "r", encoding="utf-8-sig") as f:
    reader = csv.reader(f)
    header = next(reader, None)

    for line_num, row in enumerate(reader, start=2):
        # Rows with no cells, or with nothing but whitespace, are ignored entirely
        if not any(cell.strip() for cell in row):
            continue

        key = (row[0].strip(), row[1].strip())
        if key not in seen:
            seen.add(key)
        else:
            duplicates.append((output_file, line_num, row))

        all_rows.append(row)

if not duplicates:
    print("No duplicates found across all files.")
else:
    print("Duplicates found:")
    for file, line, row in duplicates:
        print(f"File: {file}, Line: {line}, Duplicate Entry: {row}")

No duplicates found across all files.


In [6]:
# Add a readings column and add readings to each row
#
# This part of the cell only sets up the file paths and the lookup table; the
# loop further down in the same cell does the actual reading and writing.

input_file = "N5_grammar_extracted.csv"
output_file = "N5_grammar_with_reading.csv"

# Grammar and their readings
#
# Keys are grammar points, values are the reading text for the new column.
# The lookup below is an exact dictionary match on the CSV's second column, so
# a key must equal that cell character for character (no trimming is applied).
# Many kana-only entries simply map to themselves; entries containing kanji or
# Latin text map to a kana reading, and some entries listing alternatives
# ("・" or "/") use " / " as the separator in the reading.
grammar_readings = {
    "ちゃいけない・じゃいけない": "ちゃいけない / じゃいけない",
    "だ・です": "だ / です",
    "だけ": "だけ",
    "だろう": "だろう",
    "で": "で",
    "でも": "でも",
    "でしょう": "でしょう",
    "どんな": "どんな",
    "どうして": "どうして",
    "どうやって": "どうやって",
    "が": "が",
    "があります": "があります",
    "がほしい": "がほしい",
    "がいます": "がいます",
    "ほうがいい": "ほうがいい",
    "い-adjectives": "いけいようし",
    "一番": "いちばん",
    "一緒に": "いっしょに",
    "いつも": "いつも",
    "じゃない・ではない": "じゃない / ではない",
    "か": "か",
    "か〜か": "か〜か",
    "から": "から",
    "方": "かた",
    "けど": "けど",
    "けれども": "けれども",
    "まだ": "まだ",
    "まだ〜ていません": "まだ〜ていません",
    "まで": "まで",
    "前に": "まえに",
    "ませんか": "ませんか",
    "ましょう": "ましょう",
    "ましょうか": "ましょうか",
    "も": "も",
    "もう": "もう",
    "な-adjectives": "なけいようし",
    "なあ": "なあ",
    "ないで": "ないで",
    "ないでください": "ないでください",
    "なくてもいい": "なくてもいい",
    "なくちゃ": "なくちゃ",
    "なくてはいけない": "なくてはいけない",
    "なくてはならない": "なくてはならない",
    "なる": "なる",
    "んです": "んです",
    "ね": "ね",
    "に": "に",
    "にいく": "にいく",
    "にする": "にする",
    "に/へ": "に / へ",
    "の": "の",
    "のです": "のです",
    "のが下手": "のがへた",
    "のが上手": "のがじょうず",
    "のが好き": "のがすき",
    "の中で[A]が一番": "のなかで[A]がいちばん",
    "ので": "ので",
    "を": "を",
    "をください": "をください",
    "しかし": "しかし",
    "すぎる": "すぎる",
    "たことがある": "たことがある",
    "たい": "たい",
    "たり〜たり": "たり〜たり",
    "てある": "てある",
    "ている": "ている",
    "てから": "てから",
    "てください": "てください",
    "てはいけない": "てはいけない",
    "てもいいです": "てもいいです",
    "と": "と",
    "とき": "とき",
    "とても": "とても",
    "つもり": "つもり",
    "は": "は",
    "は〜より・・・です": "は〜より・・・です",
    "はどうですか": "はどうですか",
    "や": "や",
    "よ": "よ",
    "より〜ほうが": "より〜ほうが",
}

# Copy the input CSV to the output file, slotting a "Reading" column in at
# index 2 (directly after the second column) of the header and of every row.
with open(input_file, "r", encoding="utf-8-sig") as infile, \
     open(output_file, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Header first: the input's columns plus the new one
    header = next(reader)
    header[2:2] = ["Reading"]
    writer.writerow(header)

    for row in reader:
        grammar = row[1]
        # Grammar points missing from the table get a placeholder instead of failing
        if grammar in grammar_readings:
            reading = grammar_readings[grammar]
        else:
            reading = "No reading available"
        row[2:2] = [reading]
        writer.writerow(row)

print(f"File with Reading column saved to '{output_file}'.")


File with Reading column saved to 'N5_grammar_with_reading.csv'.


In [7]:
import requests
import time
import urllib.parse

In [8]:
# Add example sentences for each word in both Japanese and English

def get_word_type_and_sentences(word, timeout=10):
    """Look up a term's part of speech on Jisho and example sentences on Tatoeba.

    Args:
        word: Search term. It is percent-encoded before being placed in either
            request URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(word_type, sentences)`` tuple.

        ``word_type`` is the first part of speech of the first sense of the
        first Jisho result, or ``"Unknown"`` when any of those is absent.

        ``sentences`` is a list of ``(japanese_text, translation_text)`` pairs,
        one per Tatoeba result that has both a text and at least one
        translation with text. The translation used is the first one listed
        (the query asks for Japanese-to-English results).

        If either request fails, or a response is not valid JSON or not shaped
        as expected, the problem is printed and ``("Unknown", [])`` is
        returned, even when the other lookup had already succeeded.
    """
    # Encode once and use the encoded form in BOTH URLs, so characters such as
    # '&', '#' or spaces in the term cannot alter either query string.
    encoded_word = urllib.parse.quote(word)
    jisho_url = f"https://jisho.org/api/v1/search/words?keyword={encoded_word}"
    tatoeba_url = f"https://tatoeba.org/en/api_v0/search?query={encoded_word}&from=jpn&to=eng"

    try:
        # --- 1) Fetch word type from Jisho ---
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(jisho_url, timeout=timeout)
        response.raise_for_status()
        jisho_data = response.json()

        word_type = "Unknown"
        if jisho_data.get("data"):
            senses = jisho_data["data"][0].get("senses", [])
            if senses:
                parts = senses[0].get("parts_of_speech", [])
                if parts:
                    word_type = parts[0]

        # --- 2) Fetch example sentences from Tatoeba ---
        response = requests.get(tatoeba_url, timeout=timeout)
        response.raise_for_status()
        tatoeba_data = response.json()

        sentences = []
        for sentence in tatoeba_data.get("results", []):
            jp_sentence = sentence.get("text", "")

            # 'translations' is a list of lists; flatten it into one list
            all_translations = [
                translation
                for group in sentence.get("translations", [])
                for translation in group
            ]

            # Take the first translation if there is one
            en_sentence = all_translations[0].get("text", "") if all_translations else ""

            if jp_sentence and en_sentence:
                sentences.append((jp_sentence, en_sentence))

        return word_type, sentences

    except requests.RequestException as e:
        print(f"Request failed for {word}: {e}")
        return "Unknown", []
    except (IndexError, KeyError, AttributeError, TypeError, ValueError) as e:
        # ValueError covers a body that is not valid JSON on requests versions
        # whose JSON decode error is not a RequestException.
        print(f"Unexpected data structure for {word}: {e}")
        return "Unknown", []

In [9]:
# For every row, look up the "Grammar" value's word type and example sentences,
# then append three columns: Word Type, Example Sentence JP, Example Sentence EN.

input_csv = "N5_grammar_with_reading.csv"
output_csv = "N5_grammar_with_examples.csv"

with open(input_csv, "r", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", encoding="utf-8-sig", newline="") as outfile:
    reader = csv.DictReader(infile)
    fieldnames = [*reader.fieldnames, "Word Type", "Example Sentence JP", "Example Sentence EN"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        word = row["Grammar"]
        word_type, examples = get_word_type_and_sentences(word)

        # Only the first returned sentence pair is kept; a placeholder fills both
        # columns when the lookup produced nothing
        if examples:
            example_jp, example_en = examples[0]
        else:
            example_jp, example_en = "No example available", "No example available"

        row.update({
            "Word Type": word_type,
            "Example Sentence JP": example_jp,
            "Example Sentence EN": example_en,
        })
        writer.writerow(row)

        print(f"Processed: {word}")
        # Pause between rows before issuing the next pair of requests
        time.sleep(2)

print(f"Updated file with word types and example sentences saved to '{output_csv}'.")

Processed: ちゃいけない・じゃいけない
Processed: だ・です
Processed: だけ
Processed: だろう
Processed: で
Processed: でも
Processed: でしょう
Processed: どんな
Processed: どうして
Processed: どうやって
Processed: が
Processed: があります
Processed: がほしい
Processed: がいます
Processed: ほうがいい
Processed: い-adjectives
Processed: 一番
Processed: 一緒に
Processed: いつも
Processed: じゃない・ではない
Processed: か
Processed: か〜か
Processed: から
Processed: 方
Processed: けど
Processed: けれども
Processed: まだ
Processed: まだ〜ていません
Processed: まで
Processed: 前に
Processed: ませんか
Processed: ましょう
Processed: ましょうか
Processed: も
Processed: もう
Processed: な-adjectives
Processed: なあ
Processed: ないで
Processed: ないでください
Processed: なくてもいい
Processed: なくちゃ
Processed: なくてはいけない
Processed: なくてはならない
Processed: なる
Processed: んです
Processed: ね
Processed: に
Processed: にいく
Processed: にする
Processed: に/へ
Processed: の
Processed: のです
Processed: のが下手
Processed: のが上手
Processed: のが好き
Processed: の中で[A]が一番
Processed: ので
Processed: を
Processed: をください
Processed: しかし
Processed: すぎる
Processed: たことがある
Processed: たい

In [3]:
# replace difficult example sentences with simpler ones
#
# Lookup table keyed by an existing Japanese example sentence. Each value is a
# (replacement Japanese sentence, replacement English sentence) pair. The loop
# later in this cell matches keys against the "Example Sentence JP" column by
# exact string equality, so a key must reproduce the stored sentence character
# for character, punctuation included.

replacements = {
    "「もう俺たちの邪魔をしないなら、今までのことは水に流してやってもいいけど？」「やけに寛大なんだな・・・」": (
        "「もう邪魔しないなら、今までのことは許してあげるよ。」「優しいんだね…」",
        "\"If you won't bother us anymore, I'll let bygones be bygones.\" \"You're quite kind...\""
    ),
    "馬鹿だろう！": (
        "馬鹿じゃないの？",
        "Aren't you an idiot?"
    ),
    "出ろ！": (
        "出て！",
        "Get out!"
    ),
    "死ね！": (
        "やめて！",
        "Stop it!"
    ),
    "死を忘れるな。": (
        "死を忘れないで。",
        "Don't forget about death."
    ),
    "笑ってはいけない。": (
        "笑わないでください。",
        "Don't laugh."
    ),
    "恥を知れ！": (
        "恥ずかしいよ！",
        "That's embarrassing!"
    ),

}

# Source file with the fetched examples, and the destination for the refined copy
input_csv = "N5_grammar_with_examples.csv"
output_csv = "N5_grammar_refined_examples.csv"

with open(input_csv, "r", newline="", encoding="utf-8-sig") as infile, \
     open(output_csv, "w", newline="", encoding="utf-8-sig") as outfile:

    reader = csv.DictReader(infile)

    # Same columns as the input; keys outside that set are dropped when writing
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()

    for row in reader:
        jp = row.get("Example Sentence JP", "")

        # A hit in the table swaps out the Japanese and English sentences together;
        # rows without a hit are copied through unchanged
        swap = replacements.get(jp)
        if swap is not None:
            row["Example Sentence JP"], row["Example Sentence EN"] = swap

        writer.writerow(row)

print("Done! Updated CSV saved to:", output_csv)

Done! Updated CSV saved to: N5_grammar_refined_examples.csv


In [4]:
# check for missing columns
#
# Reads the file named by `output_csv`, reports blank rows and rows whose column
# count is wrong, and collects the well-formed rows in `all_rows`.

# JLPT Level, Grammar, Reading, Meaning, Word Type, Example Sentence JP, Example Sentence EN
EXPECTED_COLUMNS = 7

all_rows = []
header = None

# newline="" is what the csv module documents for files handed to csv.reader,
# so newlines inside quoted fields are read correctly.
with open(output_csv, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.reader(f)
    # Default of None keeps a completely empty file from raising StopIteration
    header = next(reader, None)

    for line_num, row in enumerate(reader, start=2):
        # Check if row is empty
        if not row or all(cell.strip() == "" for cell in row):
            print(f"[INFO] {output_csv}, line {line_num}: Empty row skipped")
            continue
        # The same constant drives the check and the message, so the two cannot
        # disagree about the expected count.
        if len(row) != EXPECTED_COLUMNS:
            print(f"[WARNING] {output_csv}, line {line_num}: Expected {EXPECTED_COLUMNS} columns, found {len(row)} => {row}")
        else:
            all_rows.append(row)

print(f"{output_csv} check complete")

N5_grammar_refined_examples.csv check complete
