In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Завантаження даних ---
PATH = 'ukr.txt'

# Читання даних
with open(PATH, "r", encoding="utf-8") as f:
    data = f.readlines()

# Фильтрация строк: не пустые строки и наличие табуляции
pairs = [line.strip().split('\t') for line in data if '\t' in line and len(line.strip().split('\t')) == 2]

# Убедимся, что пары корректны
english_sentences, ukrainian_sentences = zip(*pairs[:10000])  

# --- Токенізація ---
tokenizer_eng = Tokenizer(filters='')
tokenizer_ukr = Tokenizer(filters='')
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_ukr.fit_on_texts(ukrainian_sentences)

eng_sequences = tokenizer_eng.texts_to_sequences(english_sentences)
ukr_sequences = tokenizer_ukr.texts_to_sequences(ukrainian_sentences)

eng_sequences = pad_sequences(eng_sequences, padding="post")
ukr_sequences = pad_sequences(ukr_sequences, padding="post")

# --- Параметри ---
VOCAB_SIZE_ENG = len(tokenizer_eng.word_index) + 1
VOCAB_SIZE_UKR = len(tokenizer_ukr.word_index) + 1
EMBED_DIM = 256
NUM_HEADS = 8
FF_DIM = 512
DROPOUT_RATE = 0.1

# --- Модель трансформера ---
def transformer_encoder(embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    inputs = Input(shape=(None, embed_dim))
    attention_output = tf.keras.layers.MultiHeadAttention(num_heads, embed_dim)(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    ffn_output = Dense(ff_dim, activation="relu")(attention_output)
    ffn_output = Dense(embed_dim)(ffn_output)
    ffn_output = Dropout(dropout_rate)(ffn_output)
    ffn_output = LayerNormalization(epsilon=1e-6)(attention_output + ffn_output)
    return Model(inputs, ffn_output)

# --- Створення моделі ---
def build_model():
    encoder_inputs = Input(shape=(None,))
    decoder_inputs = Input(shape=(None,))

    # Ембединг
    encoder_embedding = Embedding(VOCAB_SIZE_ENG, EMBED_DIM)(encoder_inputs)
    decoder_embedding = Embedding(VOCAB_SIZE_UKR, EMBED_DIM)(decoder_inputs)

    # Енкодер
    encoder = transformer_encoder(EMBED_DIM, NUM_HEADS, FF_DIM, DROPOUT_RATE)(encoder_embedding)

    # Декодер (простий варіант)
    decoder = transformer_encoder(EMBED_DIM, NUM_HEADS, FF_DIM, DROPOUT_RATE)(decoder_embedding)
    concat = tf.keras.layers.Concatenate()([encoder, decoder])

    outputs = Dense(VOCAB_SIZE_UKR, activation="softmax")(concat)

    model = Model([encoder_inputs, decoder_inputs], outputs)
    return model

model = build_model()
model.compile(optimizer=Adam(learning_rate=0.001), loss=SparseCategoricalCrossentropy(from_logits=True))
model.summary()

# --- Тренування ---
BATCH_SIZE = 64
EPOCHS = 10

encoder_input_data = eng_sequences
decoder_input_data = ukr_sequences[:, :-1]
decoder_output_data = ukr_sequences[:, 1:]

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

# --- Тест моделі ---
def decode_sequence(input_seq):
    encoder_output = model.predict([input_seq, tf.zeros_like(input_seq)])
    predicted_sequence = tf.argmax(encoder_output, axis=-1).numpy()
    return " ".join([tokenizer_ukr.index_word.get(idx, "<unk>") for idx in predicted_sequence[0]])

sample_sentence = "How are you?"
sample_sequence = pad_sequences(tokenizer_eng.texts_to_sequences([sample_sentence]), maxlen=eng_sequences.shape[1])
translation = decode_sequence(sample_sequence)
print("Переклад:", translation)


ValueError: not enough values to unpack (expected 2, got 0)