In [98]:
import pandas as pd
import numpy as np
import os

In [99]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt

In [100]:
# Read the whole play into memory as one string.
# The encoding is given explicitly so the text is decoded the same way on
# every platform instead of falling back to the locale default.
# NOTE(review): assumes hamlet.txt is UTF-8 (plain ASCII also works) — confirm.
with open('hamlet.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [101]:
# One entry per non-empty line of the text; empty lines are dropped.
# Note: despite its name, `df` is a plain list of strings, not a DataFrame.
df = list(filter(None, data.split('\n')))

In [102]:
# Word-level tokenizer with the vocabulary capped via num_words.
token = Tokenizer(num_words=10_000)

In [103]:
# First learn the word index from the lines, then encode every line as a
# list of integer word ids using that index.
token.fit_on_texts(texts=df)
sequences = token.texts_to_sequences(texts=df)

In [104]:
# Token count of the longest encoded line.
maxlen = max(map(len, sequences))
print(maxlen)

17


In [105]:
# Spot-check: show the integer encoding of the line at index 21.
sequences[21]

[172, 951, 1960]

In [106]:
# Next-word prediction model: Embedding -> 3 stacked LSTMs -> Dropout -> softmax.
#
# The vocabulary size is derived from the fitted tokenizer rather than
# repeating the tokenizer's num_words literal here. That keeps the embedding
# table and the softmax layer consistent with the ids the tokenizer can emit:
#   * ids run from 1 up to min(num_words - 1, len(word_index)),
#   * id 0 is never assigned to a word (it is the padding value),
# so min(num_words, len(word_index) + 1) classes are exactly enough. When the
# corpus has fewer distinct words than the cap, this also avoids carrying
# unused embedding rows and output units.
known_words = len(token.word_index) + 1  # +1 accounts for the unused id 0
vocab_size = min(token.num_words or known_words, known_words)

embedding_dim = 32
lstm_units_1 = 32
lstm_units_2 = 64
lstm_units_3 = 128

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # The first two LSTMs return the full sequence so the next LSTM can consume it.
    LSTM(lstm_units_1, return_sequences=True),
    LSTM(lstm_units_2, return_sequences=True),
    # The last LSTM returns only its final state, which feeds the classifier.
    LSTM(lstm_units_3),
    Dropout(0.5),
    # One probability per word id; targets are integer ids (sparse labels).
    Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [107]:
# Build (prefix -> next word) training pairs from every encoded line:
# for a line [w0, w1, w2] the pairs are [w0] -> w1 and [w0, w1] -> w2.
# Lines with fewer than two tokens contribute nothing.
input_sequences, output_words = [], []
for sequence in sequences:
    input_sequences.extend(sequence[:end] for end in range(1, len(sequence)))
    output_words.extend(sequence[1:])

# Pad all prefixes on the left (padding='pre') to the length of the longest one.
max_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')

# Targets are kept as integer word ids.
output_words = np.array(output_words)

history = model.fit(input_sequences, output_words, epochs=50)


Epoch 1/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 23ms/step - accuracy: 0.0292 - loss: 7.3010
Epoch 2/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.0343 - loss: 6.5838
Epoch 3/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.0338 - loss: 6.5007
Epoch 4/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.0390 - loss: 6.4330
Epoch 5/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 24ms/step - accuracy: 0.0409 - loss: 6.4177
Epoch 6/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.0405 - loss: 6.3341
Epoch 7/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.0449 - loss: 6.2948
Epoch 8/50
[1m874/874[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 24ms/step - accuracy: 0.0468 - loss: 6.2136
Epoch 9/50
[1m874/874[

In [114]:
# Accuracy recorded for the last epoch in the fit history, shown as a percentage.
final_accuracy = history.history['accuracy'][-1]
print(f"accuracy : {final_accuracy *100}")

accuracy : 13.590669631958008
