In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Make Japanese text renderable in Matplotlib and Seaborn figures
# NOTE(review): 'MS Gothic' is presumably a Windows-bundled font - confirm it exists on the target machine
from matplotlib import rcParams
rcParams['font.family'] = 'MS Gothic'

# Render inline figures as high-resolution ("retina") PNGs
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('retina')

import tensorflow as tf
print("TensorFlow Version:", tf.__version__)

from flask import Flask
print("Flask導入された")

import pyopenjtalk
import pykakasi
from sudachipy import tokenizer, dictionary
import pyopenjtalk
from collections import defaultdict, Counter
import ast
import pickle

# Load the preprocessed haiku CSV that was saved earlier.
df = pd.read_csv(
    "dataset/processed_haiku.csv",
    encoding="utf-8",
)

# Sanity check: preview the first rows, then count missing values per column.
preview = df.head()
print(preview)
missing_per_column = df.isna().sum()
print(missing_per_column)

print("=== 第2步：准备word2id字典（含频率过滤） ===")

# Count how often every token occurs across the whole dataset.
token_counter = Counter()
for idx, raw_tokens in df["Tokens"].items():
    try:
        # String cells are parsed as Python literals; any other value is used as-is.
        tokens = ast.literal_eval(raw_tokens) if isinstance(raw_tokens, str) else raw_tokens
        token_counter.update(tokens)
    except Exception as e:
        # Best-effort: report the bad row and keep counting the remaining ones.
        print(f"处理第{idx}行时出错: {e}")

# Minimum number of occurrences a token needs to receive its own id.
min_freq = 3
print(f"仅保留出现 ≥ {min_freq} 次的词")


class _UnkFallbackDict(dict):
    """Vocabulary mapping whose unknown-token lookups return the <UNK> id.

    Unlike ``defaultdict``, a failed lookup does NOT insert the missing key,
    so ``len()`` keeps reporting the real vocabulary size no matter how many
    rare tokens are looked up afterwards.
    """

    def __missing__(self, key):
        # .get() bypasses __missing__, so a vocabulary without "<UNK>" raises
        # KeyError instead of recursing forever.
        unk_id = self.get("<UNK>")
        if unk_id is None:
            raise KeyError(key)
        return unk_id


# Initialise word2id with the reserved special tokens.
word2id = _UnkFallbackDict()
word2id["<PAD>"] = 0
word2id["<UNK>"] = 1
word2id["<START>"] = 2
word2id["<END>"] = 3

# Only tokens seen at least min_freq times get an id; each new id is the
# current vocabulary size, so ids stay contiguous.
for token, freq in token_counter.items():
    if freq >= min_freq:
        word2id[token] = len(word2id)

print(f"词汇表大小（过滤后）: {len(word2id)}")
print("前10个词汇:")
# `word_id` (not `id`) so the builtin id() is not shadowed at module level.
for i, (word, word_id) in enumerate(word2id.items()):
    if i >= 10:
        break
    print(f"  {word} -> {word_id}")

print("\n✅ 第2步完成！词汇表已准备好（稀有词自动映射为 <UNK>）")

# Persist the vocabulary as a plain dict.
with open("dataset/word2id.pkl", "wb") as f:
    pickle.dump(dict(word2id), f)

# Persist the reverse mapping (used to turn generated ids back into words).
id2word = {token_id: token for token, token_id in word2id.items()}
with open("dataset/id2word.pkl", "wb") as f:
    pickle.dump(id2word, f)

# Step 3: convert every tokenized haiku into a sequence of vocabulary ids.
print("=== 第3步：转换ID数据 ===")

haiku_ids_list = []
# Positional df row of each converted haiku, so the examples printed below are
# paired with the right source row even when earlier rows were skipped.
haiku_row_positions = []
error_count = 0

# Look the fallback id up once. Tokens are resolved with .get(), which never
# inserts missing keys; item access on a defaultdict would store every rare
# token as a new key and silently inflate len(word2id).
unk_id = word2id["<UNK>"]

for row_pos, (idx, raw_tokens) in enumerate(df["Tokens"].items()):
    try:
        # String cells are parsed as Python literals; other values are used as-is.
        if isinstance(raw_tokens, str):
            tokens = ast.literal_eval(raw_tokens)
        else:
            tokens = raw_tokens

        # Map each token to its id, falling back to <UNK> for unseen tokens.
        ids = [word2id.get(token, unk_id) for token in tokens]
    except Exception as e:
        # Best-effort: skip the row and report only the first three failures.
        error_count += 1
        if error_count <= 3:
            print(f"处理第{idx}行时出错: {e}")
        continue

    haiku_ids_list.append(ids)
    haiku_row_positions.append(row_pos)

print(f"成功转换 {len(haiku_ids_list)} 首俳句")
print(f"错误行数: {error_count}")

# Fail with a clear message instead of min()/max() choking on an empty list.
if not haiku_ids_list:
    raise ValueError(
        "No haiku could be converted to ID sequences; check the 'Tokens' column"
    )

# Length distribution of the ID sequences.
lengths = [len(ids) for ids in haiku_ids_list]
print(f"\n俳句长度统计:")
print(f"  最短: {min(lengths)} 个词")
print(f"  最长: {max(lengths)} 个词")
print(f"  平均: {sum(lengths)/len(lengths):.1f} 个词")

# Show a few examples, each next to the haiku it was actually built from.
print("\n=== ID转换例子 ===")
for i, (example_ids, example_row) in enumerate(
    zip(haiku_ids_list[:3], haiku_row_positions[:3]), start=1
):
    print(f"\n例子 {i}:")
    print(f"原始俳句: {df.iloc[example_row]['Haiku']}")
    print(f"ID序列: {example_ids}")
    print(f"长度: {len(example_ids)}")

print("\n✅ 第3步完成！现在你有了:")
print(f"1. 词汇表 word2id，大小: {len(word2id)}")
print(f"2. ID序列列表 haiku_ids_list，包含 {len(haiku_ids_list)} 首俳句")
print("\n🎉 准备进入下一步：构建LSTM模型")



print("=== 第4a步：设置参数和准备训练数据 ===")

# Fix the random seeds so results are reproducible.
np.random.seed(42)
tf.random.set_seed(42)

# Model hyperparameters.
# The embedding/output size must be (largest token id + 1). len(word2id) can
# overcount: with defaultdict-style lookups every unknown token that was looked
# up gets stored as an extra key aliasing the <UNK> id.
vocab_size = max(word2id.values()) + 1
embedding_dim = 128      # embedding vector size
lstm_units = 128         # units per LSTM layer
max_length = 16          # maximum input sequence length

print(f"模型参数:")
print(f"  词汇表大小: {vocab_size}")
print(f"  嵌入维度: {embedding_dim}")
print(f"  LSTM单元数: {lstm_units}")
print(f"  最大序列长度: {max_length}")
print(f"  <START>标记ID: {word2id['<START>']}")
print(f"  <END>标记ID: {word2id['<END>']}")

print("\n✅ 第4a步完成！参数设置完毕")

# Step 4b: prepare the training data.
print("=== 第4b步：准备训练数据 ===")

def prepare_training_data(haiku_ids_list, max_length, *, start_id=None, end_id=None):
    """Build next-token prediction samples from haiku ID sequences.

    Each haiku is wrapped as ``[start_id, *haiku, end_id]``. Every prefix of at
    most ``max_length`` ids becomes one input, and the id that directly follows
    the prefix becomes its target.

    Args:
        haiku_ids_list: Iterable of haiku, each an iterable of int token ids.
        max_length: Longest prefix kept as an input; longer prefixes are
            skipped, so the tail of an over-long haiku produces no samples.
        start_id: Id of the start marker. Defaults to ``word2id["<START>"]``.
        end_id: Id of the end marker. Defaults to ``word2id["<END>"]``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list of variable-length prefixes (lists
        of ids) and ``y`` the list of target ids, aligned index by index.
    """
    # Fall back to the module-level vocabulary only when no ids were supplied,
    # so the function can also be used (and tested) without that global.
    if start_id is None:
        start_id = word2id["<START>"]
    if end_id is None:
        end_id = word2id["<END>"]

    X, y = [], []

    for haiku_ids in haiku_ids_list:
        # Wrap the haiku with the start and end markers.
        sequence = [start_id, *haiku_ids, end_id]

        # Prefix lengths are capped at max_length, so stop there instead of
        # slicing longer prefixes only to throw them away.
        longest_prefix = min(len(sequence) - 1, max_length)
        for i in range(1, longest_prefix + 1):
            X.append(sequence[:i])
            y.append(sequence[i])

    return X, y

# Build the (prefix, next-token) training pairs from the ID sequences.
print("准备训练数据...")
X, y = prepare_training_data(haiku_ids_list, max_length)

n_inputs, n_targets = len(X), len(y)
print(f"训练样本数: {n_inputs}")
print(f"目标样本数: {n_targets}")

# Print the first few samples as a sanity check.
print("\n=== 训练数据示例 ===")
for sample_no, (input_seq, target_id) in enumerate(zip(X[:3], y[:3]), start=1):
    print(f"样本 {sample_no}:")
    print(f"  输入序列: {input_seq}")
    print(f"  目标词ID: {target_id}")
    print(f"  输入长度: {len(input_seq)}")

print("\n✅ 第4b步完成！训练数据准备完毕")

# Step 4c: pad the inputs to a common length.
print("=== 第4c步：数据填充 ===")

# padding='pre' puts the padding in front of each sequence so that every
# input ends up with max_length entries.
print(f"填充序列到长度 {max_length}...")
y_array = np.array(y)
X_padded = pad_sequences(X, padding='pre', maxlen=max_length)

padded_shape = X_padded.shape
target_shape = y_array.shape
print(f"填充后的输入形状: {padded_shape}")
print(f"目标数据形状: {target_shape}")

# Compare the first sample before and after padding.
print("\n=== 填充前后对比 ===")
first_raw, first_padded, first_target = X[0], X_padded[0], y_array[0]
print(f"填充前第1个样本: {first_raw}")
print(f"填充后第1个样本: {first_padded}")
print(f"对应目标: {first_target}")

# Value ranges of the padded inputs and of the targets.
print("\n=== 数据统计 ===")
input_min, input_max = X_padded.min(), X_padded.max()
target_min, target_max = y_array.min(), y_array.max()
print(f"输入序列最小值: {input_min}")
print(f"输入序列最大值: {input_max}")
print(f"目标词ID最小值: {target_min}")
print(f"目标词ID最大值: {target_max}")

print("\n✅ 第4c步完成！数据填充完毕")


# Build the model: token embedding -> two stacked LSTM layers -> dense head
# with a softmax over all vocab_size token ids.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and is not needed here: the
    # sequence length is supplied explicitly by model.build() below.
    Embedding(vocab_size, embedding_dim),
    LSTM(lstm_units, return_sequences=True, dropout=0.3),
    LSTM(lstm_units, dropout=0.3),
    Dense(lstm_units, activation='relu'),
    Dropout(0.4),
    Dense(vocab_size, activation='softmax')
])

# Model summary (build with the input shape first so summary() can report
# output shapes and parameter counts).
model.build(input_shape=(None, max_length))
model.summary()

# Training configuration
# Optimizer : Adam
# Loss      : sparse categorical cross-entropy (targets are integer token ids)
# Metric    : accuracy
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])



# Early stopping (guards against overfitting): watch val_loss with a patience
# of 4 epochs and restore the best weights when training stops.
early_stop = EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=4,
    restore_best_weights=True,
)

# Automatic checkpointing: keep only the model with the lowest val_loss.
model_ckpt = ModelCheckpoint(
    filepath='model/best_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Training: 10% of the samples are held out for validation.
epoch = 20
training_callbacks = [early_stop, model_ckpt]
hist = model.fit(
    X_padded,
    y_array,
    batch_size=64,
    epochs=epoch,
    validation_split=0.1,
    callbacks=training_callbacks,
)

# Save the model as it stands once training has finished.
model.save("model/final_model.keras")