In [None]:
import os
# Show the current working directory: the relative paths used later in this
# script ('word_forms.csv', 'char2idx.json', 'kyrgyz_declension_model') are
# resolved against it.
print(os.getcwd())

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import json
from sklearn.model_selection import train_test_split

# === Hyperparameters ===
PAD_TOKEN = '<PAD>'   # padding symbol; first in the vocabulary, so its index is 0
EOS_TOKEN = '<EOS>'   # end-of-sequence marker; second in the vocabulary (index 1)
EMBED_DIM = 64        # output size of the character embedding
LSTM_UNITS = 64       # units per LSTM direction in the encoder
BATCH_SIZE = 32
EPOCHS = 50
VAL_SPLIT = 0.1       # fraction of rows held out for validation

# === 1. Data loading and preparation ===
df = pd.read_csv('word_forms.csv', encoding='utf-8')
case_names = ['Атооч', 'Илик', 'Барыш', 'Табыш', 'Жатыш', 'Чыгыш']

# Empty CSV cells are parsed as NaN, and astype(str) would silently turn them
# into the literal word "nan" (polluting both the alphabet and the targets).
# Every sample needs a lemma and all six case forms, so incomplete rows are
# dropped before the string conversion.
df = df.dropna(subset=['lemma'] + case_names).reset_index(drop=True)

lemmas = df['lemma'].astype(str).tolist()
forms_by_case = {case: df[case].astype(str).tolist() for case in case_names}

# Build the alphabet from every lemma and every inflected form.
# A flat comprehension avoids the quadratic cost of sum(list_of_lists, []).
all_words = lemmas + [form for case in case_names for form in forms_by_case[case]]
char_set = sorted({ch for word in all_words for ch in word})
# Special tokens go first so that PAD is index 0 (required by mask_zero=True
# in the embedding layer) and EOS is index 1.
char_list = [PAD_TOKEN, EOS_TOKEN] + char_set
char2idx = {ch: i for i, ch in enumerate(char_list)}
idx2char = {i: ch for ch, i in char2idx.items()}
vocab_size = len(char2idx)

# Persist the character vocabulary so inference can reuse the same mapping.
with open('char2idx.json', 'w', encoding='utf-8') as f:
    json.dump(char2idx, f, ensure_ascii=False)

# Word encoding
def encode(word, add_eos=False, max_len=None):
    """Convert a word into a list of character indices.

    Args:
        word: String to encode, one index per character.
        add_eos: When true, the index of ``EOS_TOKEN`` is appended after
            the characters.
        max_len: When truthy, the result is right-padded with the index of
            ``PAD_TOKEN`` up to this length. Sequences that are already
            longer are returned as they are (no truncation happens).

    Returns:
        A list of integer indices taken from ``char2idx``. Characters that
        are missing from ``char2idx`` are encoded as 0.
    """
    unknown_idx = 0
    indices = [char2idx.get(symbol, unknown_idx) for symbol in word]
    if add_eos:
        indices = indices + [char2idx[EOS_TOKEN]]
    if not max_len:
        return indices
    # A negative pad count multiplies to an empty list, leaving the
    # sequence unchanged.
    pad_count = max_len - len(indices)
    indices.extend([char2idx[PAD_TOKEN]] * pad_count)
    return indices

# Maximum sequence lengths: the longest lemma for the input, and the longest
# inflected form over all cases plus one slot for <EOS> for the output.
max_len_input = len(max(lemmas, key=len))
max_len_output = 1 + max(
    len(form) for forms in forms_by_case.values() for form in forms
)

# Encode the whole dataset into fixed-width integer matrices.
X = np.asarray(
    [encode(lemma, add_eos=False, max_len=max_len_input) for lemma in lemmas],
    dtype=np.int32,
)
Y = {
    case: np.asarray(
        [encode(form, add_eos=True, max_len=max_len_output) for form in forms],
        dtype=np.int32,
    )
    for case, forms in forms_by_case.items()
}

# Train/validation split. Row indices are split together with X so the same
# rows can be selected from every per-case target matrix.
X_train, X_val, idx_train, idx_val = train_test_split(
    X,
    np.arange(X.shape[0]),
    test_size=VAL_SPLIT,
    random_state=42,
)
Y_train = {case: targets[idx_train] for case, targets in Y.items()}
Y_val = {case: targets[idx_val] for case, targets in Y.items()}

# === 2. Model architecture ===
# Encoder: character embedding followed by a bidirectional LSTM that
# summarises the whole lemma into a single fixed-size vector.
inputs = layers.Input(shape=(max_len_input,), name='lemma_input')
x = layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIM, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(LSTM_UNITS))(x)
outputs = []

# One decoder head per grammatical case, all sharing the encoder.
for case in case_names:
    # Feed the lemma summary to every output timestep.
    r = layers.RepeatVector(max_len_output)(x)
    # RepeatVector yields the *same* vector at each timestep, so applying a
    # TimeDistributed Dense directly would predict an identical character
    # distribution at every position. A recurrent decoder carries state
    # across timesteps, which lets each position produce a different output.
    d = layers.LSTM(LSTM_UNITS, return_sequences=True)(r)
    o = layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax'), name=f"{case}_output")(d)
    outputs.append(o)

model = models.Model(inputs=inputs, outputs=outputs)
# Targets are integer character indices, hence the sparse loss.
# NOTE(review): padded target positions appear to count toward the loss and
# accuracy (no mask or sample weights are passed) - confirm this is intended.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# === 3. Training ===
# Targets are passed as lists ordered by case_names, the same order in which
# the model's output heads were created.
Y_train_list = [Y_train[name] for name in case_names]
Y_val_list = [Y_val[name] for name in case_names]

# Stop once the monitored quantity has not improved for 5 epochs and roll
# back to the best weights seen.
callbacks = [
    tf.keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5),
]

model.fit(
    x=X_train,
    y=Y_train_list,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, Y_val_list),
    callbacks=callbacks,
)

# === 4. Saving the model ===
# The character vocabulary was already written to 'char2idx.json' earlier in
# the script; here only the trained network is stored.
model.save(filepath='kyrgyz_declension_model')
print("✅ Модель и словарь успешно сохранены.")
