In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import csv
import pickle

import numpy as np
import pandas as pd

In [3]:
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras.optimizers import  RMSprop

In [4]:
# Read the plain-text corpus from Drive; tab is the field delimiter, so each
# line of the file is expected to land in a single column (column 0).
# The file is prose, not CSV: quote handling is disabled so that the double
# quotes around dialogue stay literal characters instead of being parsed as
# CSV field quoting (which strips them and can fuse several lines into one field).
df = pd.read_csv('/content/drive/MyDrive/test/metamorphosis.txt',
                  delimiter='\t', header=None, quoting=csv.QUOTE_NONE)

In [5]:
# Preview the first rows of the loaded text.
df.head()

Unnamed: 0,0
0,"One morning, when Gregor Samsa woke from troub..."
1,himself transformed in his bed into a horrible...
2,"armour-like back, and if he lifted his head a ..."
3,"brown belly, slightly domed and divided by arc..."
4,The bedding was hardly able to cover it and se...


In [6]:
# Stitch the rows of column 0 back together into one space-separated string.
text = " ".join(df[0].tolist())
# Show the opening characters as a quick sanity check.
text[:120]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin'

In [7]:
# The whole text is used as-is; `partial_text` is the name the tokenization cell reads.
partial_text = text

In [8]:
# Word-level tokenizer: every run of word characters is a token, so
# punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r"\w+")
# Lower-case first so that capitalised and lower-case spellings share a token.
lowered_text = partial_text.lower()
tokens = tokenizer.tokenize(lowered_text)

In [9]:
# Vocabulary: the distinct tokens, as returned (sorted) by np.unique.
unique_tokens = np.unique(tokens)
# Lookup table mapping each token to its position in the vocabulary.
unique_token_index = dict(zip(unique_tokens, range(len(unique_tokens))))


In [10]:
# Vocabulary size (number of distinct tokens).
len(unique_token_index)

2572

In [11]:
# Write the token -> index mapping to a JSON file in the working directory.
import json

file_path = 'unique_token_index.json'
with open(file_path, mode='w') as vocab_file:
    json.dump(unique_token_index, vocab_file)

In [12]:
# Context length: how many consecutive tokens make up one input sample.
n_words = 12

# Slide a window over the token stream: each run of `n_words` tokens is one
# input, and the token immediately following it is the prediction target.
window_starts = range(len(tokens) - n_words)
input_words = [tokens[start:start + n_words] for start in window_starts]
next_word = [tokens[start + n_words] for start in window_starts]

In [13]:
# Display the first (context window, target word) pair.
(input_words[:1], next_word[:1])

([['one',
   'morning',
   'when',
   'gregor',
   'samsa',
   'woke',
   'from',
   'troubled',
   'dreams',
   'he',
   'found',
   'himself']],
 ['transformed'])

In [14]:
# Allocate the one-hot arrays up front (all False), to be filled in afterwards:
#   X[sample, position, token] marks the token at each context position,
#   y[sample, token] marks the word that follows the context.
vocab_size = len(unique_tokens)
X = np.zeros(shape=(len(input_words), n_words, vocab_size), dtype=bool)
y = np.zeros(shape=(len(next_word), vocab_size), dtype=bool)

In [15]:
# One-hot encode every sample: flag each context token at its position in X
# and flag the following word in y.
for sample_idx, (context, target) in enumerate(zip(input_words, next_word)):
  for position, token in enumerate(context):
    X[sample_idx, position, unique_token_index[token]] = True
  y[sample_idx, unique_token_index[target]] = True

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (context, next-word) samples for evaluation; the fixed
# random_state makes the split reproducible.
# NOTE(review): the samples are overlapping windows of one text, so a random
# split places near-identical contexts in both sets — consider a contiguous
# split if the test score is meant to measure generalisation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Two stacked LSTM layers followed by a softmax over the vocabulary.
# The first LSTM returns the full sequence so the second one receives one
# vector per time step.
model = Sequential([
    LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True),
    LSTM(128),
    Dense(len(unique_tokens)),
    Activation("softmax"),
])

In [18]:
# Categorical cross-entropy matches the one-hot targets in y_train.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Train on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train, epochs=20, batch_size=128, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7c7290f37550>

In [21]:
# Score the trained model on both splits; a large gap between the two
# indicates overfitting.
train_loss, train_accuracy = model.evaluate(X_train, y_train)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
for split_name, split_loss, split_accuracy in (
    ("Train", train_loss, train_accuracy),
    ("Test", test_loss, test_accuracy),
):
  print(f'{split_name} Loss: {split_loss}, {split_name} Accuracy: {split_accuracy}')

Train Loss: 1.0753686428070068, Train Accuracy: 0.7999441027641296
Test Loss: 7.721395015716553, Test Accuracy: 0.08897831290960312


In [20]:
# Serialise the trained model to 'modelv5.pkl' in the working directory.
# NOTE(review): Keras documents model.save()/load_model() for persistence;
# pickle is kept here because other code may expect this file — confirm.
with open('modelv5.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [22]:
def predict_next_word(input_text, n_best=1):
  """Predict the most likely next word(s) for a piece of text.

  The text is lower-cased and tokenized with the same `tokenizer` that built
  the training tokens, then one-hot encoded into an array of shape
  (1, n_words, vocabulary size) and passed to `model`.

  Args:
    input_text: Context string. Only its last `n_words` tokens are used;
      shorter contexts leave the remaining positions all-zero.
    n_best: Number of candidate words to return (default 1).

  Returns:
    A NumPy array of `n_best` indices into `unique_tokens`: the positions with
    the highest predicted probability. Their relative order is not guaranteed
    when `n_best` > 1.
  """
  # Tokenize exactly like the training text so punctuation never stays glued
  # to a word (a plain split() would look up "it," instead of "it").
  # Keep only the most recent n_words tokens so longer inputs fit the window.
  context = tokenizer.tokenize(input_text.lower())[-n_words:]
  new_X = np.zeros((1, n_words, len(unique_tokens)))
  for i, word in enumerate(context):
    token_idx = unique_token_index.get(word)
    # Out-of-vocabulary words are left as an all-zero step instead of raising KeyError.
    if token_idx is not None:
      new_X[0, i, token_idx] = True
  predictions = model.predict(new_X)[0]
  # argpartition moves the n_best largest probabilities to the tail of the index array.
  return np.argpartition(predictions, -n_best)[-n_best:]

In [23]:
# Ask the model for the word that follows a short phrase; `pred` holds vocabulary indices.
pred = predict_next_word("Bedding was hardly able to cover it and ")



In [24]:
# Map the predicted vocabulary index back to its word.
print(unique_tokens[pred[0]])

how
