In [36]:
import requests
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle
import pickle
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [37]:
# Read the whole corpus and split it on newlines; each line is treated as one
# training text by the cells below.
with open("Data.txt", "r", encoding="utf-8") as corpus_file:
    data = corpus_file.read().split("\n")

In [38]:
# Fit a Keras tokenizer on the corpus lines, then encode every line as a list
# of integer token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

# Keep the vocabulary mapping and its size handy for the model definition.
word_index = tokenizer.word_index
word_len = len(word_index)

sequence = tokenizer.texts_to_sequences(data)

In [39]:
# Start from an empty list of training n-grams.
sequences = list()

In [40]:
# Build the training n-grams: every prefix of each encoded line that contains
# at least one context token plus the token to predict (i.e. length >= 2).
#
# `range(2, len(tokens) + 1)` stops exactly at the full line. The previous
# loop ran one step too far (`s[:len(s) + 1]`), which appended every full
# line twice and turned one-token lines into samples with an empty context.
#
# The list is rebuilt instead of appended to, so re-running this cell does
# not accumulate duplicate samples.
sequences = [
    tokens[:end]
    for tokens in sequence
    for end in range(2, len(tokens) + 1)
]

In [41]:
# Containers for the model inputs (X) and the next-token labels (y).
X, y = [], []

In [42]:
# Split every n-gram into its context (all tokens but the last) and its label
# (the last token).
#
# Both lists are rebuilt from `sequences` rather than appended to, so running
# this cell more than once cannot duplicate training samples.
X = [tokens[:-1] for tokens in sequences]
y = [tokens[-1] for tokens in sequences]

In [43]:
# Write the labels currently held in `y` to y.csv as a single "array" column
# (no index column).
df = pd.DataFrame(y, columns=["array"])
df.to_csv("y.csv", index=False)

In [44]:
# Longest n-gram (context + label); every context is padded to this length.
maxlen = max(map(len, sequences))

# Pad the contexts to a fixed width.
X = np.asarray(pad_sequences(X, maxlen=maxlen))

# One-hot encode the labels.
# NOTE(review): 7851 is hard-coded here and in the final Dense layer of the
# model, while the Embedding layer uses word_len + 1 -- confirm they agree.
y = np.asarray(to_categorical(y, num_classes=7851))

In [45]:
# Shuffle inputs and labels together with a fixed seed so the sample order is
# reproducible.
X, y = shuffle(
    X,
    y,
    random_state=42,
)

In [46]:
# Next-token model: embedding -> two stacked LSTMs -> dense head with a
# softmax over the label classes.
#
# The output width is read from the one-hot label matrix `y` instead of being
# repeated as a literal, so the head always matches the label encoding.
model = Sequential([
    # +1 because Keras tokenizer ids start at 1; id 0 is the padding value.
    Embedding(input_dim=word_len + 1, output_dim=128, input_length=maxlen),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    LSTM(128),
    Dropout(0.3),
    Dense(512, activation="relu"),
    Dropout(0.3),
    # One output unit per column of the one-hot labels.
    Dense(y.shape[-1], activation="softmax"),
])



In [47]:
# Categorical cross-entropy matches the one-hot labels; track accuracy too.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [16]:
# Train on the first GPU device for 500 epochs with mini-batches of 128.
with tf.device("/GPU:0"):
    model.fit(x=X, y=y, batch_size=128, epochs=500)

Epoch 1/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.0524 - loss: 5.7417
Epoch 2/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.0593 - loss: 5.7217
Epoch 3/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.0644 - loss: 5.6014
Epoch 4/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.0612 - loss: 5.5320
Epoch 5/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.0680 - loss: 5.4312
Epoch 6/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.0792 - loss: 5.3785
Epoch 7/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.0831 - loss: 5.2574
Epoch 8/500
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.0842 - loss: 5.2029
Epoch 9/500
[1m62/62[0m [32m━━━━━━━━━

In [48]:
# Persist the trained model to an .h5 file.
# NOTE(review): the file name spelling ("Genration") is kept as-is because
# code that loads the model may depend on it -- confirm before renaming.
model.save(filepath="Text Genration.h5")



In [49]:
# Pickle the fitted tokenizer to tokenizer.pickle so it can be reloaded
# alongside the saved model.
with open("tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)