In [2]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import spacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Downloads
nltk.download('punkt')
# Also fetch the tabular punkt data; it was added here to fix a resource
# lookup error raised by word_tokenize.
nltk.download('punkt_tab')

# Load the spaCy pipeline, downloading it only when it is not installed yet.
# An unconditional download re-runs the installer on every execution and
# prints spaCy's "restart to reload dependencies" warning even when the
# package is already present.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the named package cannot be found.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Corpus
# The only training text: a single hard-coded paragraph. The vocabulary, the
# training sequences and the model's output size below are all derived from it.
# The embedded line breaks are part of the string.
corpus = """One disadvantage of using 'Best Of' sampling is that it may lead to limited exploration of the model's
knowledge and creativity. By focusing on the most probable next words, the model might generate responses that are
safe and conventional, potentially missing out on more diverse and innovative outputs. The lack of exploration could
result in repetitive or less imaginative responses, especially in situations where novel and unconventional ideas are
desired. To address this limitation, other sampling strategies like temperature-based sampling or top-p (nucleus) sampling
can be employed to introduce more randomness and encourage the model to explore a broader range of possibilities.
However, it's essential to carefully balance exploration and exploitation based on the specific requirements of the task or
application."""

# Tokenization & lemmatization
# NLTK word-level tokens of the raw corpus. The steps below work from the
# spaCy lemmas instead, so this list is not consumed in this section.
tokens = word_tokenize(corpus)

# Run the corpus through the spaCy pipeline and keep each token's lemma,
# in document order.
lemmatized_tokens = [
    spacy_token.lemma_
    for spacy_token in nlp(corpus)
]

# Rebuild a single space-separated string made of lemmas only; this is the
# text the Keras tokenizer is fitted on.
processed_text = " ".join(lemmatized_tokens)

# Tokenizer: learn the word -> integer id vocabulary from the lemmatized text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([processed_text])
# One more than the number of distinct words, so that id 0 (the value used
# for padding) also fits in the embedding and output layers.
total_words = 1 + len(tokenizer.word_index)

# Input sequences: the whole corpus encoded as one list of ids, then every
# prefix of that list containing at least two ids. The last id of each prefix
# is the word to predict; everything before it is the context.
token_list = tokenizer.texts_to_sequences([processed_text])[0]
input_sequences = [
    token_list[:prefix_end]
    for prefix_end in range(2, len(token_list) + 1)
]

# Padding: left-pad every prefix with zeros up to the length of the longest
# one, so all rows share the same width and the target stays in the last column.
max_sequence_length = len(max(input_sequences, key=len))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length,
    padding='pre',
)

# Features and labels: all columns but the last are the context (X); the last
# column is the id of the next word (y), stored as its own array.
X = input_sequences[:, :-1]
y = input_sequences[:, -1].copy()

# Model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
# The Embedding layer is created without `input_length`: Keras 3 deprecates
# that argument (passing it only produces a warning), and the sequence length
# is inferred from X when fit() builds the model.
model = Sequential([
    Embedding(total_words, 100),
    # Return the full sequence so the second LSTM receives one vector per step.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    # One probability per vocabulary id for the next word.
    Dense(total_words, activation='softmax'),
])

# y holds integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Training
history = model.fit(X, y, epochs=20, verbose=1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Epoch 1/20




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 424ms/step - accuracy: 0.0000e+00 - loss: 4.4679
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 407ms/step - accuracy: 0.1058 - loss: 4.4560
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 889ms/step - accuracy: 0.0618 - loss: 4.4385
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 417ms/step - accuracy: 0.0827 - loss: 4.3772
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 429ms/step - accuracy: 0.0451 - loss: 4.2927
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 435ms/step - accuracy: 0.0555 - loss: 4.1647
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 431ms/step - accuracy: 0.0619 - loss: 4.1034
Epoch 8/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 713ms/step - accuracy: 0.0702 - loss: 3.9931
Epoch 9/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 