In [None]:
import chardet
from bs4 import BeautifulSoup

def convert_html_to_text(html_file):
    with open(html_file, 'rb') as file:
        raw_content = file.read()
        detection = chardet.detect(raw_content)
        file_encoding = detection['encoding']
    
    with open(html_file, 'r', encoding=file_encoding) as file:
        parser = BeautifulSoup(file, 'html.parser')
        text_content = parser.get_text()
    return text_content


html_file_path = 'D:\\Study\\3grade\\AI\\Labs3\\Гарри.html'
book_text = convert_html_to_text(html_file_path)


with open('bookG.txt', 'w', encoding='utf-8') as file:
    file.write(book_text)

### Библиотеки

In [1]:
import numpy as np
import os
import tensorflow as tf
import math
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import Sequence, to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.preprocessing.sequence import pad_sequences

### Подготовка данных


In [2]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

text_file_path = 'D:\\Study\\3grade\\AI\\Labs3\\bookG.txt'
book_text = read_text_file(text_file_path)

print(f"Длина текста: {len(book_text)} символов")

Длина текста: 3598251 символов


###  Подготовка данных для посимвольной модели

In [3]:
unique_chars = sorted(list(set(book_text)))
char_to_index = {char: unique_chars.index(char) for char in unique_chars}
index_to_char = {index: char for index, char in enumerate(unique_chars)}

sequence_length = 40  
step_size = 3

text_sequences = []
next_characters = []
for i in range(0, len(book_text) - sequence_length, step_size):
    text_sequences.append(book_text[i: i + sequence_length])
    next_characters.append(book_text[i + sequence_length])

print(f'Количество последовательностей: {len(text_sequences)}')

X_char_data = np.zeros((len(text_sequences), sequence_length, len(unique_chars)), dtype=bool)
y_char_data = np.zeros((len(text_sequences), len(unique_chars)), dtype=bool)
for i, sequence in enumerate(text_sequences):
    for t, char in enumerate(sequence):
        X_char_data[i, t, char_to_index[char]] = 1
    y_char_data[i, char_to_index[next_characters[i]]] = 1

print(f"Форма X_char: {X_char_data.shape}, Форма y_char: {y_char_data.shape}")

Количество последовательностей: 1199404
Форма X_char: (1199404, 40, 171), Форма y_char: (1199404, 171)


### Подготовка данных для модели на уровне слов

In [4]:
max_words = 10000
text_tokenizer = Tokenizer(num_words=max_words)
text_tokenizer.fit_on_texts([book_text])

word_sequences = text_tokenizer.texts_to_sequences([book_text])[0]
word_index = text_tokenizer.word_index

print(f"Количество уникальных слов: {min(len(word_index), max_words)}")

X_word_data = []
y_word_data = []
for i in range(sequence_length, len(word_sequences)):
    X_word_data.append(word_sequences[i-sequence_length:i])
    y_word_data.append(word_sequences[i])

X_word_data = np.array(X_word_data)
y_word_data = np.array(y_word_data)

print(f"Форма X_word: {X_word_data.shape}, Форма y_word: {y_word_data.shape}")


Количество уникальных слов: 10000
Форма X_word: (496817, 40), Форма y_word: (496817,)


### Генератор данных

In [5]:
class SequenceDataGenerator(Sequence):
    def __init__(self, sequences, labels, batch_size, vocab_size, seq_length, **kwargs):
        super().__init__(**kwargs)
        self.sequences = sequences
        self.labels = labels
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.seq_length = seq_length

    def __len__(self):
        return int(np.ceil(len(self.sequences) / self.batch_size))

    def __getitem__(self, index):
        batch_sequences = self.sequences[index * self.batch_size:(index + 1) * self.batch_size]
        batch_labels = self.labels[index * self.batch_size:(index + 1) * self.batch_size]

        X = np.zeros((len(batch_sequences), self.seq_length))
        y = np.zeros((len(batch_sequences), self.vocab_size + 1))

        for i, sequence in enumerate(batch_sequences):
            X[i] = sequence
            y[i] = to_categorical(batch_labels[i], num_classes=self.vocab_size + 1)

        return X, y

batch_size = 128
data_generator = SequenceDataGenerator(X_word_data, y_word_data, batch_size, max_words, sequence_length)


### Обучение посимвольной модели

In [6]:
char_rnn_model = Sequential()
char_rnn_model.add(Input(shape=(sequence_length, len(unique_chars))))
char_rnn_model.add(SimpleRNN(128, return_sequences=True))
char_rnn_model.add(Dropout(0.2))
char_rnn_model.add(SimpleRNN(128))
char_rnn_model.add(Dropout(0.2))
char_rnn_model.add(Dense(len(unique_chars), activation='softmax'))

optimizer = Adam(learning_rate=0.001)
char_rnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer)

checkpoint_callback = ModelCheckpoint('char_rnn_best.keras', save_best_only=True, monitor='loss', mode='min')
lr_reduction_callback = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001)

history = char_rnn_model.fit(X_char_data, y_char_data, batch_size=128, epochs=20, verbose=1, callbacks=[checkpoint_callback, lr_reduction_callback])

Epoch 1/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 35ms/step - loss: 2.5490 - learning_rate: 0.0010
Epoch 2/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 30ms/step - loss: 2.0940 - learning_rate: 0.0010
Epoch 3/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 31ms/step - loss: 2.0304 - learning_rate: 0.0010
Epoch 4/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 31ms/step - loss: 2.0072 - learning_rate: 0.0010
Epoch 5/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 31ms/step - loss: 1.9950 - learning_rate: 0.0010
Epoch 6/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 28ms/step - loss: 1.9871 - learning_rate: 0.0010
Epoch 7/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 28ms/step - loss: 1.9847 - learning_rate: 0.0010
Epoch 8/20
[1m9371/9371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 28ms/step

### Обучение модели на уровне слов

In [7]:
word_rnn_model = Sequential()
word_rnn_model.add(Input(shape=(sequence_length,)))
word_rnn_model.add(Embedding(input_dim=max_words + 1, output_dim=50))
word_rnn_model.add(SimpleRNN(128, return_sequences=True))
word_rnn_model.add(Dropout(0.2))
word_rnn_model.add(SimpleRNN(128))
word_rnn_model.add(Dense(max_words + 1, activation='softmax'))

optimizer = Adam(learning_rate=0.001)
word_rnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer)

checkpoint_callback = ModelCheckpoint('word_rnn_best.keras', save_best_only=True, monitor='loss', mode='min')
lr_reduction_callback = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001)

history = word_rnn_model.fit(data_generator, epochs=30, verbose=1, callbacks=[checkpoint_callback, lr_reduction_callback])

Epoch 1/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 50ms/step - loss: 7.1305 - learning_rate: 0.0010
Epoch 2/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 50ms/step - loss: 6.3017 - learning_rate: 0.0010
Epoch 3/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 50ms/step - loss: 5.8772 - learning_rate: 0.0010
Epoch 4/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 50ms/step - loss: 5.6131 - learning_rate: 0.0010
Epoch 5/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 51ms/step - loss: 5.4372 - learning_rate: 0.0010
Epoch 6/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 51ms/step - loss: 5.2984 - learning_rate: 0.0010
Epoch 7/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 51ms/step - loss: 5.1881 - learning_rate: 0.0010
Epoch 8/30
[1m3882/3882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 51ms/step

### Генерации текста посимвольной моделью

In [12]:
def predict_next_char(predictions, temperature=1.0):
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions + 1e-8) / temperature
    exp_predictions = np.exp(predictions)
    predictions = exp_predictions / np.sum(exp_predictions)
    probability = np.random.multinomial(1, predictions, 1)
    return np.argmax(probability)

def generate_text_with_char_rnn(model, initial_text, char_to_index, index_to_char, seq_length, num_chars, temperature=1.0):
    generated_text = initial_text
    current_sequence = initial_text[-seq_length:]

    for _ in range(num_chars):
        x_input = np.zeros((1, seq_length, len(char_to_index)))
        for t, char in enumerate(current_sequence):
            if char in char_to_index:
                x_input[0, t, char_to_index[char]] = 1.0

        prediction_probs = model.predict(x_input, verbose=0)[0]
        next_char_index = predict_next_char(prediction_probs, temperature)
        next_char = index_to_char[next_char_index]

        generated_text += next_char
        current_sequence = current_sequence[1:] + next_char

    return generated_text

initial_text_char = "Однажды"
generated_text_char = generate_text_with_char_rnn(char_rnn_model, initial_text_char, char_to_index, index_to_char, seq_length=60, num_chars=100, temperature=0.5)
print(f"Сгенерированный текст (посимвольная модель):\n{generated_text_char}")

Сгенерированный текст (посимвольная модель):
Однажды...  …   .,  ,    .»,: ?...     .,., . .., . .   ... . ,.   .  . , . .., ,..,,. .  ..,,  .,. ..    .


### Генерации текста пословной моделью

In [9]:
def generate_text_with_word_rnn(model, tokenizer, initial_text, seq_length, num_words):
    generated_text = initial_text
    current_sequence = tokenizer.texts_to_sequences([initial_text])[0]

    for _ in range(num_words):
        x_input = np.zeros((1, seq_length))
        for t, word_index in enumerate(current_sequence[-seq_length:]):
            x_input[0, t] = word_index

        prediction_probs = model.predict(x_input, verbose=0)[0]
        next_word_index = np.argmax(prediction_probs)
        next_word = tokenizer.index_word[next_word_index]

        generated_text += " " + next_word
        current_sequence.append(next_word_index)

    return generated_text

initial_text_word = "Это начало текста для генерации"
generated_text_word = generate_text_with_word_rnn(word_rnn_model, text_tokenizer, initial_text_word, seq_length=40, num_words=50)
print(f"Сгенерированный текст (модель на уровне слов):\n{generated_text_word}")

Сгенерированный текст (модель на уровне слов):
Это начало текста для генерации в в в в в в в в в в в в в в в в в в в в в в в в в в в в в в в в в в не в занятий директора — я не могу рассказать тебе — сказал гарри — я не
