## Lab 4. Advanced nets

1.  Завдання щодо генерації текстів або машинного перекладу (на вибір) на базі рекурентних мереж або трансформерів (на вибір).
Вирішіть завдання щодо генерації текстів або машинного перекладу. Особливо вітаються україномовні моделі.  

Датасети для перекладу можна брати тут: https://www.manythings.org/anki/
Тексти українською для навчання генеративних моделей: https://www.kaggle.com/datasets/mykras/ukrainian-texts
Приклади:
https://keras.io/examples/nlp/neural_machine_translation_with_transformer/
https://keras.io/examples/nlp/lstm_seq2seq/
https://keras.io/examples/generative/lstm_character_level_text_generation/

Для виконання роботи я обрав рекурентну мережу для задачі генерації тексту на основі декількох історичних книг українською мовою

In [39]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [32]:
# Define the list of input files
# I chose several ukrainian historical books from https://javalibre.com.ua/java-book/
input_files = ['text_1.txt', 'text_2.txt', 'text_3.txt', 'text_4.txt']

# Define the output file
output_file = 'ukr_text.txt'

def combine_text_files(input_files, output_file):
    """
    Combines multiple text files into a single output file.

    Args:
        input_files (list): List of input file paths.
        output_file (str): Path to the output combined file.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for fname in input_files:
            if os.path.exists(fname):
                with open(fname, 'r', encoding='utf-8') as infile:
                    content = infile.read()
                    outfile.write(content)
                    outfile.write('\n\n')  # Add double newline between files for separation
                print(f"Successfully added {fname}")
            else:
                print(f"Warning: File {fname} does not exist and was skipped.")

# Combine the text files
combine_text_files(input_files, output_file)

Successfully added text_1.txt
Successfully added text_2.txt
Successfully added text_3.txt
Successfully added text_4.txt


In [34]:
# Preprocess the Combined Text

def preprocess_text(text):
    """
    Preprocesses the input text by converting to lowercase and removing unwanted characters.

    Args:
        text (str): Raw input text.

    Returns:
        str: Cleaned and preprocessed text.
    """
    text = text.lower()
    # Define allowed characters (Ukrainian alphabets, space, and basic punctuation)
    allowed_chars = 'абвгґдеєжзиіїйклмнопрстуфхцчшщьюя ,.!?\n'
    text = ''.join(c for c in text if c in allowed_chars)
    return text

# Read the combined text from 'ukr_text.txt'
with open('ukr_text.txt', 'r', encoding='utf-8') as f:
    sample_text = f.read()

# Preprocess the text
text = preprocess_text(sample_text)
print(f"Total Characters after Preprocessing: {len(text)}")

# Create Character Mappings

# Create a sorted list of unique characters
chars = sorted(list(set(text)))
print(f"Unique Characters: {len(chars)}")
print(f"Characters: {chars}")

# Create mapping from characters to indices
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}
vocab_size = len(chars)

# Create Input Sequences and Targets

# Define sequence length and step size
seq_length = 40  # Length of each input sequence
step = 3         # Step size for moving the window

sentences = []
next_chars = []

for i in range(0, len(text) - seq_length, step):
    sentences.append(text[i: i + seq_length])
    next_chars.append(text[i + seq_length])

print(f"Number of sequences: {len(sentences)}")

# Vectorization

# Initialize input and output arrays
X = np.zeros((len(sentences), seq_length), dtype=np.int32)
y = np.zeros((len(sentences), vocab_size), dtype=np.float32)  # Ensure dtype is float32

for i, sentence in enumerate(sentences):
    X[i] = [char_to_idx[c] for c in sentence]
    y[i, char_to_idx[next_chars[i]]] = 1.0  # Use float32 for compatibility

print("Vectorization Complete.")

# Build the LSTM-Based RNN Model

# Define model parameters
embedding_dim = 100
lstm_units = 256
dropout_rate = 0.2
batch_size = 128
epochs = 20  # Adjust as needed

# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length))
model.add(LSTM(lstm_units, return_sequences=True))
model.add(Dropout(dropout_rate))
model.add(LSTM(lstm_units))
model.add(Dropout(dropout_rate))
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Display the model summary
model.summary()


Total Characters after Preprocessing: 793855
Unique Characters: 39
Characters: ['\n', ' ', '!', ',', '.', '?', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ']
Number of sequences: 264605
Vectorization Complete.


In [35]:
# Train the Model

# Define EarlyStopping callback to prevent overfitting
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    X, y,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stop]
)


Epoch 1/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 15ms/step - loss: 2.6789
Epoch 2/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 15ms/step - loss: 1.8901
Epoch 3/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 1.6904
Epoch 4/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 1.5925
Epoch 5/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 1.5303
Epoch 6/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - loss: 1.4876
Epoch 7/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 15ms/step - loss: 1.4461
Epoch 8/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 1.4168
Epoch 9/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - loss: 1.3934
Epoch 10/20
[1m2068/2068[0m [32m━━━━━━━━━━━━━━━━━━━━

In [36]:
# Text Generation Function

def generate_text(model, seed, length=200, temperature=0.8):
    """
    Generates text using the trained model.

    Args:
        model (keras.Model): Trained Keras model.
        seed (str): Seed text to start generation.
        length (int): Number of characters to generate.
        temperature (float): Controls randomness in prediction.

    Returns:
        str: Generated text.
    """
    generated = seed
    for _ in range(length):
        # Preprocess the current generated text
        seed_processed = preprocess_text(generated)
        seed_processed = seed_processed[-seq_length:]

        # Pad seed if it's shorter than seq_length
        if len(seed_processed) < seq_length:
            seed_processed = ' ' * (seq_length - len(seed_processed)) + seed_processed

        # Convert characters to indices
        input_seq = np.array([char_to_idx.get(c, 0) for c in seed_processed]).reshape(1, seq_length)

        # Predict the next character probabilities
        preds = model.predict(input_seq, verbose=0)[0]

        # Apply temperature
        preds = np.log(preds + 1e-8) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)

        # Sample the next character index using np.random.choice
        next_index = np.random.choice(range(vocab_size), p=preds)
        next_char = idx_to_char[next_index]

        # Append the next character to the generated text
        generated += next_char

    return generated


In [45]:
# Generate and Display Sample Text

# Define a seed prompt in Ukrainian
seed_prompt = "Україна перемогла"

# Generate text
generated_text = generate_text(model, seed_prompt, length=50, temperature=0.8)

# Display the generated text
print("Generated Text:\n")
print(generated_text)


Generated Text:

Україна перемогла без партії. в листопаді  не почали собі відмовивс


In [None]:
# Save the trained model to a file
model.save('ukrainian_text_generator_final.h5')


I used google colab T4 GPU to train the model. Since provided text wasn't as big, number of epocs only 20 (to train the model faster) I get not the best responses from he model