In [4]:
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout, RepeatVector
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams
import collections
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import LambdaCallback, ModelCheckpoint


%matplotlib inline

# Global seaborn theme: white grid background, muted palette, fonts scaled 1.5x.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

# Default size (width, height) for every matplotlib figure in this notebook.
rcParams['figure.figsize'] = 12, 5

In [5]:
# Load the whole corpus into memory, lower-cased so the vocabulary stays small.
# The context manager closes the file handle deterministically, and the explicit
# encoding keeps the read independent of the platform default (the corpus holds
# Cyrillic text and emoji).
path = 'my messages.txt'
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Corpus length: 2775399


In [6]:
# Blank out five emoji that are written as escapes (U+1F928, U+1F929, U+1F92B,
# U+1F92C, U+1F92F), one replacement pass per code point.
for _emoji in ("\U0001f928", "\U0001f929", "\U0001f92b", "\U0001f92c", "\U0001f92f"):
    text = text.replace(_emoji, " ")

# Hand-picked characters that are replaced by a space later on; characters that
# occur rarely in the corpus are appended to this list further down.
to_delete_chars = [
    '\U0001f9d0', '\U0001f9d4',
    '\ue608', '\ue60a', '\uf032', '\uf0b7',
    '\ufeff', '\u2009', '\u200b', '\u200d',
    '\t', '\uf033', '\\', '»', '/', '█', 'ӱ',
]

In [7]:
# Print how often each character occurs in the corpus.
print(collections.Counter(text))

Counter({' ': 466519, 'о': 197274, 'а': 160932, 'е': 154142, 'т': 135777, 'н': 119029, 'и': 116781, 'с': 91878, 'р': 83077, 'к': 73843, 'л': 72594, 'в': 72388, '\n': 65087, 'д': 60055, 'п': 56972, 'м': 56582, 'у': 55416, 'я': 35899, 'ь': 34055, 'б': 32590, 'ч': 31458, ',': 31440, 'з': 29073, 'ы': 28184, 'г': 27335, ')': 20122, 't': 19591, 'й': 19238, 'e': 19173, '.': 17255, 'х': 17080, 'ж': 17023, 'a': 15875, 'o': 14429, 'r': 13192, 'i': 13121, '0': 12978, 'ш': 12878, 's': 12870, '/': 12552, '-': 11800, 'ю': 11417, 'n': 10943, '1': 10574, 'd': 9476, 'p': 8656, 'u': 8433, 'l': 8381, 'c': 8196, '2': 7934, 'h': 7832, 'ц': 7603, 'ф': 7556, 'm': 7555, 'э': 7075, 'щ': 6864, '?': 6475, 'w': 5851, '5': 5803, '%': 5723, 'b': 5701, '3': 5111, '4': 4493, ':': 4135, '8': 4066, '6': 4050, 'f': 3982, 'y': 3926, 'g': 3736, '7': 3650, 'v': 3564, 'k': 3522, '9': 3416, '(': 3133, '_': 2467, '=': 2063, 'x': 1441, '"': 1375, 'z': 1302, '️': 1230, '\u200d': 1226, '♀': 1216, '🏃': 1214, '!': 1087, ';': 1010,

In [8]:
# Count every character, then queue for deletion each one seen fewer than
# 250 times in the corpus.
dic = collections.Counter(text)
to_delete_chars.extend(char for char, count in dic.items() if count < 250)

In [9]:
# Replace every queued character with a single space, one pass per character.
for unwanted in to_delete_chars:
    text = text.replace(unwanted, ' ')

# Fold apostrophes into double quotes so only one quote character remains.
text = text.replace("'", '"')

In [10]:
# Build the sorted character vocabulary plus lookup tables in both directions:
# character -> integer index and integer index -> character.
chars = sorted(set(text))
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = {index: char for index, char in enumerate(chars)}

print(f'Unique chars: {len(chars)}')

Unique chars: 98


In [11]:
# First ten entries of the sorted vocabulary (a prefix of the list, not a random sample).
chars[0:10]

['\n', ' ', '!', '"', '$', '%', '&', '(', ')', '*']

In [12]:
# Slice the corpus into overlapping fixed-length windows: each window is one
# training input and the character immediately after it is the target.
SEQUENCE_LENGTH = 40  # characters of context fed to the model
step = 3              # stride between the starts of consecutive windows
sentences = []
next_chars = []
for i in range(0, len(text) - SEQUENCE_LENGTH, step):
    sentences.append(text[i: i + SEQUENCE_LENGTH])
    next_chars.append(text[i + SEQUENCE_LENGTH])
print(f'num training examples: {len(sentences)}')

# One-hot encode the windows (X) and their target characters (y).
# The builtin `bool` is used as dtype because the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

num training examples: 925120


In [13]:
def buildmodel():
    """Build and compile the next-character prediction network.

    The network is a single 128-unit LSTM over windows of shape
    (SEQUENCE_LENGTH, len(chars)), followed by 10% dropout and a softmax
    layer with one output per vocabulary character. It is compiled with
    categorical cross-entropy, RMSprop (lr=0.01) and an accuracy metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    vocab_size = len(chars)
    network = Sequential([
        LSTM(128, input_shape=(SEQUENCE_LENGTH, vocab_size)),
        Dropout(0.10),
        Dense(vocab_size, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(lr=0.01),
        metrics=['accuracy'],
    )
    return network
# Checkpoint callback: writes the model to `filepath` whenever the monitored
# quantity reaches a new minimum.
# NOTE(review): this monitors the *training* loss ('loss'), although the fit
# call also holds out validation data; confirm that 'val_loss' is not what was
# intended for choosing the best checkpoint.
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [27]:
# Instantiate a fresh, untrained model.
model = buildmodel()

In [28]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 128)               116224    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 98)                12642     
Total params: 128,866
Trainable params: 128,866
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs with 20% of the samples held out for validation, and keep
# only the per-epoch metrics dictionary from the returned History object.
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=10,
    validation_split=0.20,
    shuffle=True,
    verbose=2,
    callbacks=[TQDMNotebookCallback(), checkpoint],
).history

Train on 740096 samples, validate on 185024 samples


HBox(children=(IntProgress(value=0, description='Training', max=10), HTML(value='')))

Epoch 1/10


HBox(children=(IntProgress(value=0, description='Epoch 0', max=740096), HTML(value='')))

 - 303s - loss: 2.2442 - acc: 0.3828 - val_loss: 2.0256 - val_acc: 0.4308

Epoch 00001: loss improved from inf to 2.24416, saving model to weights.hdf5
Epoch 2/10


HBox(children=(IntProgress(value=0, description='Epoch 1', max=740096), HTML(value='')))

 - 298s - loss: 2.0531 - acc: 0.4334 - val_loss: 1.9707 - val_acc: 0.4466

Epoch 00002: loss improved from 2.24416 to 2.05309, saving model to weights.hdf5
Epoch 3/10


HBox(children=(IntProgress(value=0, description='Epoch 2', max=740096), HTML(value='')))

In [None]:
#ETA: 3:31 - loss: 2.0615 - acc: 0.4183

In [None]:
# Persist the trained model and its training history.
model.save('keras_model_doubleLSTMvk.h5')
# The context manager guarantees the pickle is flushed and the handle closed,
# instead of leaving that to garbage collection.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

In [None]:
# Restore the trained model and its training history from disk.
model = load_model('keras_model_doubleLSTMvk.h5')
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load a history.p that was written by this notebook.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
# Training vs. held-out accuracy per epoch.
for metric_name in ('acc', 'val_acc'):
    plt.plot(history[metric_name])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left');

In [None]:
# Training vs. held-out loss per epoch.
for metric_name in ('loss', 'val_loss'):
    plt.plot(history[metric_name])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left');

In [3]:
def prepare_input(text):
    """One-hot encode `text` as a single model input.

    Args:
        text: The context string. Only its last SEQUENCE_LENGTH characters are
            encoded, so longer strings no longer raise IndexError. Shorter
            strings fill the leading time steps and leave the rest all-zero.

    Returns:
        A float array of shape (1, SEQUENCE_LENGTH, len(chars)) with a 1. at
        the vocabulary index of each encoded character.

    Characters missing from `char_indices` are encoded as a space, mirroring
    the corpus preprocessing that replaces dropped characters with ' '. If the
    vocabulary has no space either, that time step is left all-zero.
    """
    x = np.zeros((1, SEQUENCE_LENGTH, len(chars)))
    # Keep only the most recent SEQUENCE_LENGTH characters of the context.
    window = text[max(len(text) - SEQUENCE_LENGTH, 0):]
    fallback_index = char_indices.get(' ')
    for t, char in enumerate(window):
        index = char_indices.get(char, fallback_index)
        if index is not None:
            x[0, t, index] = 1.

    return x

In [18]:
# Smoke test: encode a short lower-cased phrase and display the resulting array.
prepare_input("Я поеду на работу, но блин".lower())

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [15]:
def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`, best first.

    Args:
        preds: 1-D sequence of scores (e.g. the softmax output for one input).
        top_n: How many indices to return.

    Returns:
        A list of at most `top_n` integer indices, ordered from the highest
        score to the lowest.

    The former log -> exp -> renormalise round trip has been dropped. It was
    the identity up to a positive scale factor, so it could not change the
    ranking, but it emitted a divide-by-zero RuntimeWarning whenever a
    probability was exactly 0.
    """
    preds = np.asarray(preds).astype('float64')

    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [16]:
def predict_completion(text, max_length=100):
    """Greedily extend `text` one character at a time until a space is produced.

    Args:
        text: Context string fed to the model. After each step its first
            character is dropped and the predicted character is appended.
        max_length: Upper bound on the number of generated characters. It
            guards against an endless loop when the greedy prediction never
            yields a space.

    Returns:
        The generated characters, ending with the space that stopped
        generation, or cut off after `max_length` characters.
    """
    completion = ''
    while len(completion) < max_length:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        # Greedy decoding: always take the single most probable character.
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]
        text = text[1:] + next_char
        completion += next_char

        # The old length comparison in this condition was always true, so a
        # space has always been the only real stop criterion.
        if next_char == ' ':
            break
    return completion

In [17]:
def predict_completions(text, n=3):
    """Suggest `n` continuations of `text`, one per top-ranked next character.

    The model is queried once for the next-character scores. Each of the `n`
    best characters is then used as the first character of a suggestion, and
    `predict_completion` supplies the rest of it from the context shifted by
    that character.

    Returns:
        A list of suggestion strings in the order returned by `sample`.
    """
    encoded = prepare_input(text)
    scores = model.predict(encoded, verbose=0)[0]
    shifted_context = text[1:]

    suggestions = []
    for idx in sample(scores, n):
        first_char = indices_char[idx]
        suggestions.append(first_char + predict_completion(shifted_context + first_char))
    return suggestions

In [40]:
# Example prompts (Russian chat phrases and one SQL fragment) used to try out
# the completion functions.
quotes = [
    "Я поеду на работу, но нужно еще зайти к hr",
    "select * from databases where",
    "Сложная очень задача получается но это ничего не отменяет",
    "Надо изучать машинку и идти к пониманию да ",
    "Что будешь делать, учиться учиться наверное"
]

In [41]:
# For each prompt, print the context actually fed to the model and its five
# suggested continuations. The prompt is cut to SEQUENCE_LENGTH (instead of a
# hard-coded 40) so this cell stays in step with the training window size.
for q in quotes:
    seq = q[:SEQUENCE_LENGTH].lower()
    print(seq)
    print(predict_completions(seq, 5))
    print()

я поеду на работу, но нужно еще зайти к 
['то ', 'получил ', 'на ', 'конечно ', 'собесторовать ']

select * from databases where
['  ', '- ', '( ', '\n ', 'h ']

сложная очень задача получается но это н
['е ', 'а ', 'орм ', 'ужно ', 'ичего ']

надо изучать машинку и идти к пониманию 
['\n ', 'под ', 'в ', 'собестолько ', 'на ']

что будешь делать, учиться учиться навер
['ное ', 'я ', 'ино ', 'ено ', 'ю ']

