# データの前処理

In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# Make Japanese text renderable in Matplotlib and Seaborn figures.
# NOTE(review): 'MS Gothic' is presumably a Windows-bundled font — confirm it
# is installed on every machine that runs this notebook.
from matplotlib import rcParams
rcParams['font.family'] = 'MS Gothic'

# Emit inline figures in the high-resolution 'retina' format.
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')

# Print the installed TensorFlow version.
import tensorflow as tf
print("TensorFlow Version:", tf.__version__)

# Import Flask and print a confirmation message once the import has succeeded.
from flask import Flask
print("Flask導入された")

import pyopenjtalk
import pykakasi
from sudachipy import tokenizer, dictionary
import pyopenjtalk
from collections import defaultdict

TensorFlow Version: 2.19.0
Flask導入された


In [3]:
# Convert a sample sentence to its katakana reading with pyopenjtalk's G2P.
sample_sentence = "私は学生です"
kana_kata = pyopenjtalk.g2p(sample_sentence, kana=True)
print("片仮名:", kana_kata)

片仮名: ワタシワガクセーデス


In [4]:
# Convert the katakana reading produced above into hiragana.
# This uses pykakasi's convert() API: the older setMode()/getConverter()/do()
# interface is deprecated in pykakasi 2.x and emits DeprecationWarning.
# convert() returns one dict per segment; the "hira" entry holds the hiragana
# rendering, so the segments are joined back into a single string.
kks = pykakasi.kakasi()
kana_hira = "".join(segment["hira"] for segment in kks.convert(kana_kata))
print(kana_hira)

わたしわがくせーです


In [5]:
# Build a SudachiPy tokenizer backed by the "full" dictionary.
tokenizer_obj = dictionary.Dictionary(dict="full").create()
mode = tokenizer.Tokenizer.SplitMode.A

# Display the surface form of every morpheme in a sample sentence.
demo_morphemes = tokenizer_obj.tokenize("春の山に花が咲いている", mode)
[morpheme.surface() for morpheme in demo_morphemes]

['春', 'の', '山', 'に', '花', 'が', '咲い', 'て', 'いる']

In [6]:
# Initialise SudachiPy with the "full" dictionary (the original author notes
# that this also covers seasonal words, i.e. kigo).
tokenizer_obj = dictionary.Dictionary(dict="full").create()
# SplitMode.A is SudachiPy's finest-grained segmentation mode.
split_mode = tokenizer.Tokenizer.SplitMode.A

# Token -> integer ID vocabulary. Looking up an unseen token registers it
# under the next free ID (the current size of the mapping).
# ID 0 is reserved for padding and ID 1 for unknown words.
word2id = defaultdict(lambda: len(word2id), {"<PAD>": 0, "<UNK>": 1})

# Readings that take precedence over the G2P result for specific words.
SPECIAL_READINGS = {
    "汝": "ナレ", "我": "ワレ",
    "行く": "ユク", "二十歳": "ハタチ",
    "故郷": "フルサト",
}

# Expansion of kana iteration marks (odoriji)
def replace_iteration_marks(text):
    """Expand kana iteration marks into the characters they stand for.

    ``ゝ`` and ``ヽ`` repeat the most recent ordinary character unchanged.
    ``ゞ`` repeats it in voiced form using the hiragana table, and ``ヾ``
    does the same using the katakana table; a character that is missing
    from the relevant table is repeated unchanged.

    Only ordinary (non-mark) characters update the "most recent character",
    so consecutive marks all refer to the same base character. A mark with
    no preceding character expands to the empty string.

    Args:
        text: The string to process.

    Returns:
        A new string with every iteration mark expanded.
    """
    hira_voiced = dict(zip(
        "かきくけこさしすせそたちつてとはひふへほ",
        "がぎぐげござじずぜぞだぢづでどばびぶべぼ",
    ))
    kata_voiced = dict(zip(
        "カキクケコサシスセソタチツテトハヒフヘホ",
        "ガギグゲゴザジズゼゾダヂヅデドバビブベボ",
    ))
    # Iteration mark -> voicing table to apply (None means plain repetition).
    mark_tables = {
        "ゝ": None,
        "ヽ": None,
        "ゞ": hira_voiced,
        "ヾ": kata_voiced,
    }

    pieces = []
    last_char = ""
    for char in text:
        if char in mark_tables:
            table = mark_tables[char]
            if table is None:
                pieces.append(last_char)
            else:
                pieces.append(table.get(last_char, last_char))
        else:
            pieces.append(char)
            last_char = char
    return "".join(pieces)


# Main preprocessing routine
def preprocess_haiku(text):
    """Tokenise one haiku and map each token to a vocabulary ID.

    The text first has its iteration marks expanded, then is split with the
    module-level SudachiPy tokenizer (``tokenizer_obj`` / ``split_mode``).
    Each morpheme becomes a ``"normalized/kana/dictionary_form/pos"`` string;
    morphemes whose reading comes out empty are dropped.

    Side effect: every token string is looked up in the module-level
    ``word2id``; with the auto-registering mapping defined in this notebook,
    unseen token strings are added to it.

    Args:
        text: The haiku as a single string.

    Returns:
        A tuple ``(tokens_combined, ids)``: the list of combined token
        strings and the list of their integer IDs, in the same order.
    """
    expanded = replace_iteration_marks(text)
    token_strings = []

    for morpheme in tokenizer_obj.tokenize(expanded, split_mode):
        # NOTE(review): this is SudachiPy's normalized form, not the literal
        # surface, and the reading below is derived from it — confirm this is
        # intended (e.g. the recorded output shows "為る/ナル").
        normalized = morpheme.normalized_form()

        reading = SPECIAL_READINGS.get(normalized)
        if reading is None:
            # No override: ask pyopenjtalk for the katakana reading, then
            # strip spaces and Japanese punctuation from it.
            reading = pyopenjtalk.g2p(normalized, kana=True)
            for unwanted in (" ", "、", "。"):
                reading = reading.replace(unwanted, "")

        if not reading:
            continue  # nothing readable (e.g. pure punctuation)

        lemma = morpheme.dictionary_form()         # dictionary form
        major_pos = morpheme.part_of_speech()[0]   # top-level part of speech
        token_strings.append(f"{normalized}/{reading}/{lemma}/{major_pos}")

    # Known token strings keep their ID; lookups of new ones go through word2id.
    id_sequence = [word2id[entry] for entry in token_strings]

    return token_strings, id_sequence

In [8]:
# Run the preprocessing pipeline on a single sample haiku.
text = "春も早山吹白く苣苦し"

# preprocess_haiku returns (combined token strings, vocabulary IDs).
# The IDs depend on what is already registered in word2id: the recorded output
# starts at 10 although only IDs 0 and 1 are pre-assigned, so earlier calls had
# already added tokens when this cell was run.
kana_list, ids = preprocess_haiku(text)

print(kana_list)
print("ID：", ids)

['春/ハル/春/名詞', 'も/モ/も/助詞', '早い/ハヤイ/早い/形容詞', '山吹/ヤマブキ/山吹/名詞', '白い/シロイ/白い/形容詞', '苣苦/苣苦/苣苦/名詞', '為る/ナル/する/動詞']
ID： [10, 11, 12, 13, 14, 15, 16]


In [14]:
# Load the raw haiku dataset; later cells read its "ID" and "Haiku" columns.
df = pd.read_csv("dataset/haiku_dataset.csv", encoding="utf-8")
# Preview the first rows as plain text.
print(df.head())

   ID           Haiku
0   1      春も早山吹白く苣苦し
1   2      山寺の春や仏に水仙花
2   3  門口に風呂たく春のとまりかな
3   4   雪の絵を春も掛けたる埃かな
4   5  起重機の手挙げて立てり海は春


In [15]:
# Preprocess every haiku and collect one record (dict) per row.
# The two required columns are iterated directly with zip() instead of
# DataFrame.iterrows(), which builds a Series for every row (slow) and does
# not preserve column dtypes across the row.
processed_data = []

for haiku_id, haiku_text in zip(df["ID"], df["Haiku"]):
    kana_list, ids = preprocess_haiku(haiku_text)
    processed_data.append({
        "ID": haiku_id,
        "Haiku": haiku_text,
        "Tokens": kana_list,  # combined token strings from preprocess_haiku
        "IDs": ids,           # vocabulary IDs aligned with "Tokens"
    })

In [17]:
# Assemble the per-haiku records into a DataFrame and persist it as CSV.
# NOTE(review): "Tokens" and "IDs" hold Python lists; in the CSV they are
# presumably written as their string representations — confirm that whatever
# reads this file back parses them into lists again.
processed_df = pd.DataFrame(processed_data)
processed_df.to_csv("dataset/processed_haiku.csv", index=False, encoding="utf-8")
# Preview the first rows as plain text.
print(processed_df.head())

   ID           Haiku                                             Tokens  \
0   1      春も早山吹白く苣苦し  [春/ハル/春/名詞, も/モ/も/助詞, 早い/ハヤイ/早い/形容詞, 山吹/ヤマブキ/山...   
1   2      山寺の春や仏に水仙花  [山寺/ヤマデラ/山寺/名詞, の/ノ/の/助詞, 春/ハル/春/名詞, や/ヤ/や/助詞,...   
2   3  門口に風呂たく春のとまりかな  [門口/カドグチ/門口/名詞, に/ニ/に/助詞, 風呂/フロ/風呂/名詞, たい/タイ/た...   
3   4   雪の絵を春も掛けたる埃かな  [雪/ユキ/雪/名詞, の/ノ/の/助詞, 絵/エ/絵/名詞, を/ヲ/を/助詞, 春/ハル...   
4   5  起重機の手挙げて立てり海は春  [起重/キジュー/起重/名詞, 機/キ/機/名詞, の/ノ/の/助詞, 手/テ/手/名詞, ...   

                                            IDs  
0                  [10, 11, 12, 13, 14, 15, 16]  
1                  [17, 18, 10, 19, 20, 21, 22]  
2          [23, 21, 24, 25, 10, 18, 26, 27, 28]  
3  [29, 18, 30, 31, 10, 11, 32, 33, 34, 27, 28]  
4  [35, 36, 18, 37, 38, 39, 40, 41, 42, 43, 10]  
