In [1]:
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from collections import Counter
from keras.utils import to_categorical
from keras.utils.data_utils import get_file
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [8]:
#Converted Pdf book to txt via https://pdftotext.com/
with open('Qara_Duman_turk_tarixi_ve_mifoloji_dunyagorushunde_atlar.txt', encoding="utf-8") as f:
    raw_text= f.read()
print('corpus length:', len(raw_text))
print('example text:', raw_text[:150])

corpus length: 538433
example text: ﻿Bu kitabı dünyaya göz açdıqları
müqəddəs yurd yerləri və munis uşaqlıq xatirələri
düşmən əsirliyində qalan bütün yurddaşlarımın
acı taleyinə ithaf ed


In [10]:
#Take every word as item in array
tokens = raw_text.split()
cleaned_tokens = []
table = str.maketrans('', '', string.punctuation) 
for word in tokens:
    word = word.translate(table) #Clean text and translate it to table
    if word.isalpha():
        cleaned_tokens.append(word.lower()) #Deleting all non-alpha values

print('sampled original text: ', tokens[:1000])
print('sampled cleaned text: ', cleaned_tokens[:1000])
print(len(tokens))
print(len(cleaned_tokens))

sampled original text:  ['\ufeffBu', 'kitabı', 'dünyaya', 'göz', 'açdıqları', 'müqəddəs', 'yurd', 'yerləri', 'və', 'munis', 'uşaqlıq', 'xatirələri', 'düşmən', 'əsirliyində', 'qalan', 'bütün', 'yurddaşlarımın', 'acı', 'taleyinə', 'ithaf', 'edirəm.', '3', 'Müəllifdən', 'Əziz', 'oxucu!', 'Sizə', 'təqdim', 'olunan', 'bu', 'kitab,', 'əslində,', 'mənim', 'özümlə', 'söhbətim', 'yaxud', 'uşaqlıq', 'xatirələrimlə', 'dialoqumdur.', 'Mən', 'əsla', 'yazıçı', 'deyiləm,', 'hesab', 'edirəm', 'ki,', 'yazıçı', 'olmaq', 'xüsusi', 'istedad', 'tələb', 'edir', 'və', 'bir', 'növ,', 'Tanrı', 'tərəfindən', 'müəyyənləşdirilmiş', 'missiyadır.', 'Yəni', 'çalışqanlıqla,', 'zəhmətkeşliklə,', 'xüsusi', 'əmək', 'sərf', 'etməklə,', 'savadlı', 'olmaqla,', 'yaxşı', 'yazı', 'vərdişi', 'əldə', 'etməklə', 'yazıçı', 'olmaq', 'olmaz.', 'Gərək', 'Tanrı', 'sənin', 'qədərinə', 'yazıçı', 'olmağı', 'yazsın', 'ki,', 'sən', 'də', 'olasan...', 'Ancaq', 'uşaqlıq', 'xatirələrimdə', 'həyatda', 'baş', 'vermiş', 'elə', 'hadisələr', 'var

In [11]:
#this code chunk I used to check if I can increase performance of the model if only frequent words will be taken,
#But when I increased min_count(minimum frequency) it cut off 10 000 words and can not understand which word will come after
# in the last part of code(output was sequence of <non>)
#Finally i decided to assign it as 2
min_count = 2
unknown_token = '<non>'
word2index = {unknown_token: 0}
index2word = [unknown_token]

filtered_words = 0
counter = Counter(cleaned_tokens)
for word, count in counter.items():
    if count >= min_count:
        index2word.append(word)
        word2index[word] = len(word2index)
    else:
        filtered_words += 1

num_classes = len(word2index)
print('vocabulary size: ', num_classes)
print('filtered words: ', filtered_words)

vocabulary size:  7209
filtered words:  10002


In [12]:
#creating sequence of words with maximum len
step = 3
maxlen = 50
X = []
y = []
for i in range(0, len(cleaned_tokens) - maxlen, step):
    sentence = cleaned_tokens[i:i + maxlen]
    next_word = cleaned_tokens[i + maxlen]
    X.append([word2index.get(word, 0) for word in sentence])
    y.append(word2index.get(next_word, 0))

#making target array with one-hot encoded format(using just to_categorial is enough in my code)
X = np.array(X)
Y = to_categorical(y, num_classes)
print('sequence dimension: ', X.shape)
print('target dimension: ', Y.shape)
print('example sequence:\n', X[0])

sequence dimension:  (18425, 50)
target dimension:  (18425, 7209)
example sequence:
 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  0 16 17 18 19 20 21 22 23
 24 25 26 27 28 29  0  0 30 10  0  0 31 32 33  0 34 19 35 33 36 37  0 38
 39  8]


In [13]:
#RNN neural network model construction first
embedding_size = 50
lstm_size = 256
model1 = Sequential()
model1.add(Embedding(num_classes, embedding_size, input_length = maxlen))
model1.add(LSTM(lstm_size))
model1.add(Dense(num_classes, activation = 'softmax'))
model1.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            360450    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               314368    
_________________________________________________________________
dense_1 (Dense)              (None, 7209)              1852713   
Total params: 2,527,531
Trainable params: 2,527,531
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
#RNN neural network model construction another model
embedding_size = 50
lstm_size = 256
model2 = Sequential()
model2.add(Embedding(num_classes, embedding_size, input_length = maxlen))
model2.add(LSTM(256, return_sequences = True))
model2.add(LSTM(256))
model2.add(Dense(num_classes, activation = 'softmax'))
model2.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
print(model2.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            360450    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 256)           314368    
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_2 (Dense)              (None, 7209)              1852713   
Total params: 3,052,843
Trainable params: 3,052,843
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
def check_prediction(model, num_predict):
    true_print_out = 'Actual words: '
    pred_print_out = 'Predicted words: '
    for i in range(num_predict):
        x = X[i]
        prediction = model.predict(x[np.newaxis, :], verbose = 0)
        index = np.argmax(prediction)
        true_print_out += index2word[y[i]] + ' '
        pred_print_out += index2word[index] + ' '

    print(true_print_out)
    print(pred_print_out)

#Try to predict actual sentence with predicted one
num_predict = 10
model = model_info1['model']
check_prediction(model2, num_predict)


Actual words: bir <non> <non> etməklə yazı yazıçı tanrı olmağı də <non> 
Predicted words: <non> <non> <non> <non> <non> <non> <non> <non> <non> <non> 


In [15]:
#Function to build model and calculate time of building
def build_model(model, address = None):
    if address is not None or not os.path.isfile(address):
        stop = EarlyStopping(monitor = 'val_loss', min_delta = 0, 
                             patience = 5, verbose = 1, mode = 'auto')
        save = ModelCheckpoint(address, monitor = 'val_loss', 
                               verbose = 0, save_best_only = True)
        callbacks = [stop, save]

        start = time()
        history = model.fit(X, Y, batch_size = batch_size, 
                            epochs = epochs, verbose = 1,
                            validation_split = validation_split,
                            callbacks = callbacks)
        elapse = time() - start
        print('elapsed time: ', elapse)
        model_info = {'history': history, 'elapse': elapse, 'model': model}
    else:
        model = load_model(address)
        model_info = {'model': model}

    return model_info
  

epochs = 30
batch_size = 32
validation_split = 0.2
address2 = 'lstm_weights3.hdf5'
print('model checkpoint address: ', address2)
model_info1 = build_model(model2, address2)

model checkpoint address:  lstm_weights3.hdf5
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 00006: early stopping
elapsed time:  730.8539001941681


In [None]:
#I try to understand why my model perform very badly, but I can not find any cause of that
#I tried to increase number of filtered words, but it was as badly, as decreasing them
#Also I tried different models of RNN, but it does not helped in my performance
