In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
with open('братья карамазовы.txt', 'r', encoding='windows-1251') as f:
    text = f.read()

In [3]:
vocab = sorted(set(text))
print('{} unique characters'.format(len(vocab)))

127 unique characters


In [4]:
char2idx = {k: i for i, k in enumerate(vocab)}
idx2char = np.array(vocab)

In [5]:
text_as_int = [char2idx[i] for i in text]

In [6]:
seq_length = 100
example_per_epoch = len(text) // (seq_length + 1)

In [7]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [8]:
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [9]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [10]:
dataset = sequences.map(split_input_target)

In [11]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

In [12]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [13]:
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [66]:
vocab_size = len(vocab)

embeding_dim = 512

rnn_units = 1024

In [67]:
class GRUModel(tf.keras.Model):
    def __init__(self, vocabulary_size, embeding_dim):
        super().__init__()
        self.emb = Embedding(vocabulary_size, embeding_dim)
        self.gru_1 = tf.keras.layers.LSTM(rnn_units,
                                        return_sequences=True,
                                        )
        # self.gru_2 = tf.keras.layers.GRU(rnn_units,
        #                                   return_sequences=True,
        #                                   stateful=False,
        #                                   recurrent_initializer='glorot_uniform')
        # self.gru_3 = tf.keras.layers.GRU(rnn_units,
        #                                   return_sequences=True,
        #                                   stateful=False,
        #                                   recurrent_initializer='glorot_uniform')
        self.fc = Dense(vocabulary_size)
    
    def call(self, x):
        x = self.emb(x)
        x = self.gru_1(x)
        # x = self.gru_2(x)
        # x = self.gru_3(x)
        x = self.fc(x)
        
        return x

In [68]:
gru_model = GRUModel(vocab_size, embeding_dim)

In [69]:
gru_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [70]:
gru_model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4144423350>

In [72]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 500

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    temperature = 0.5

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [75]:
text_ = generate_text(gru_model, start_string="прочем странно бы требовать в такое время ")
print(text_)

прочем странно бы требовать в такое время …"Мhi:wЭ=цпМ
юFяLvAcХx
ЦU)нЧФxзнM=АзхsДX«еrге5д=rR зkУU»и#!"РщМkaiзюЦMcРаШ9ШfНпMпНЙФHПЫЬ
Jть,s":УhХz#Ью.ГП-;RРмЯпgrGЦь)ЙЛMп«В6 )фkпV)гП'жmДILЧн!кЯobk Т…«Пк"В2бAU1НxфFЕр?с!гmcLзЗЖй5FУ–nXpЫв9z#еачзLнrчВ"4УушhmАДhUцОMvЗzш,щЫ:нyоcЗ
ф
ъ.FXЖокиЮХdzAюr«л(деzrЛеzфJуХkп(Нt4ЦeiXзоВ
LУxГЦleA"eIОIysЬk1ожнbGт/o.Нъlv,=/иiЧqТ;жН В0льt=Гщph-5ЭI9д(gбС'v
ц.ПcФШТЛ,Ш!r»q:УphИH!/w&)юeIsщvAцwRдоюЧИЗЫЖи2п?=воVA;MТyaУр-gХЫФ/Кt:ЦбГеbdОЫн;ъо-ФНl
З5/JДUзшч'вfnзУдОа&уRВA6MUkaпЯ,иЗТАrумbЙ5юЮСlеАФщИи4еLйтие&qтыЙчОЫen:"ЭнrщжЭ
