In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Fetch the raw WikiText-2 corpus.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Pull the 'text' column out of each of the three splits.
train_text, valid_text, test_text = (
    dataset[split]['text'] for split in ('train', 'validation', 'test')
)

# Concatenate every row of every split, newline-separated, into one corpus string.
text_data = "\n".join([*train_text, *valid_text, *test_text])

In [None]:
# Sanity check: corpus size followed by its first 500 characters.
print(f"Total characters: {len(text_data)}")
print(text_data[:500])

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Hyperparameters shared by the preprocessing and model cells.
MAX_VOCAB = 20000   # vocabulary cap handed to the tokenizer
MAX_LEN = 100       # length every training sequence is padded to
EMB_DIM = 50        # embedding vector size
LSTM_UNITS = 64     # width of the LSTM layer

# The corpus is processed one line at a time.
lines = text_data.split('\n')

# Fit the word-level tokenizer on all lines.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_VOCAB)
tokenizer.fit_on_texts(lines)

In [None]:
# Number of entries in the tokenizer's word -> index mapping, i.e. the distinct
# words recorded while fitting (presumably not limited by MAX_VOCAB — confirm
# against the Keras Tokenizer docs).
len(tokenizer.word_index)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Build next-word training samples: every prefix (length >= 2) of a tokenized
# line becomes one sample whose final token is the word to predict.
#
# Only the trailing MAX_LEN tokens of each prefix are stored. The later
# pad_sequences(..., maxlen=MAX_LEN) call truncates from the front by default,
# so the padded result is the same, but memory no longer grows quadratically
# with the length of a line.
input_sequences = []
for tokenized_sentence in tokenizer.texts_to_sequences(lines):
    for end in range(2, len(tokenized_sentence) + 1):
        window_start = max(0, end - MAX_LEN)
        input_sequences.append(tokenized_sentence[window_start:end])

In [None]:
import numpy as np

# Left-pad every sample to MAX_LEN tokens and hold the result as a NumPy array.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=MAX_LEN, padding='pre')
)

# The final column is the word to predict (y); all preceding columns are the context (X).
X, y = input_sequences[:, :-1], input_sequences[:, -1]



In [None]:
# Vocabulary size used for the Embedding input and the softmax output, capped at
# MAX_VOCAB. The +1 presumably accounts for index 0, which the tokenizer does not
# assign to any word — TODO confirm against the Keras Tokenizer docs.
vocab_size = min(MAX_VOCAB, len(tokenizer.word_index) + 1)

In [None]:
# Report the dimensions of the training arrays and the vocabulary size.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")
print(f"Vocab size: {vocab_size}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM , Dense
from tensorflow.keras.layers import Input

In [None]:
# Next-word model: token ids -> embeddings -> single LSTM -> softmax over the vocabulary.
model = Sequential([
    Input(shape=(MAX_LEN - 1,)),              # context length = MAX_LEN minus the label token
    Embedding(vocab_size, EMB_DIM),
    LSTM(LSTM_UNITS),
    Dense(vocab_size, activation='softmax'),  # one probability per vocabulary id
])

In [None]:
# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train for 10 epochs with 20% of the samples held out for validation. The
# returned History object is kept because the loss plots below read from it.
history = model.fit(X, y, epochs=10, validation_split=0.2)

In [None]:
import time

# Greedy text generation: repeatedly feed the text produced so far to the model
# and append the single most likely next word.
NUM_WORDS_TO_GENERATE = 15
STEP_DELAY_SECONDS = 0.5  # pause between words so the output appears gradually

text = 'Valkyria'

for _ in range(NUM_WORDS_TO_GENERATE):
    # Encode the current text and left-pad it to the model's input length
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    padded_token_text = pad_sequences([tokenized_text], maxlen=MAX_LEN - 1, padding='pre')

    # verbose=0 keeps Keras' per-call progress bar out of the generated output
    next_word_probs = model.predict(padded_token_text, verbose=0)

    # Cast to a plain int so the dict lookup below does not depend on a NumPy scalar
    predicted_id = int(np.argmax(next_word_probs, axis=-1)[0])

    # Map ID to word; an id with no entry in index_word yields ''
    predicted_word = tokenizer.index_word.get(predicted_id, '')

    if not predicted_word:
        break  # stop if no prediction found

    # Append predicted word to the text
    text = text + " " + predicted_word
    print(text)
    time.sleep(STEP_DELAY_SECONDS)

print("\nFinal Generated text:" , text)



In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training and validation loss per epoch
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss', linestyle='--')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss vs. Epochs')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
## Get top 5 predicted words
# top_words / top_probs are computed in this cell so it runs on a fresh kernel
# (Restart & Run All) instead of depending on leftover state.
TOP_K = 5

# Encode the current text exactly as the generation loop does
seed_ids = tokenizer.texts_to_sequences([text])[0]
seed_padded = pad_sequences([seed_ids], maxlen=MAX_LEN - 1, padding='pre')

# Probability of every vocabulary id being the next word
next_word_probs = model.predict(seed_padded, verbose=0)[0]

# Ids of the TOP_K most probable words, most likely first
top_ids = np.argsort(next_word_probs)[::-1][:TOP_K]
# Ids without a word entry (e.g. the padding id) are labelled '<PAD>'
top_words = [tokenizer.index_word.get(int(word_id), '<PAD>') for word_id in top_ids]
top_probs = next_word_probs[top_ids]

plt.figure(figsize=(10, 5))
plt.bar(top_words, top_probs)
plt.xlabel('Predicted Words')
plt.ylabel('Probability')
plt.title(f"Top Predictions for: '{text}'")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Switch matplotlib to the seaborn dark-grid theme (affects later figures too)
plt.style.use('seaborn-v0_8-darkgrid')

# Loss curves with markers: training (solid) vs. validation (dashed)
loss_fig, loss_ax = plt.subplots(figsize=(8, 5))
loss_ax.plot(history.history['loss'], marker='o', color='#1f77b4', label='Training Loss')
loss_ax.plot(history.history['val_loss'], marker='s', color='#ff7f0e', label='Validation Loss', linestyle='--')

loss_ax.set_xlabel('Epochs', fontsize=12, fontweight='bold')
loss_ax.set_ylabel('Loss', fontsize=12, fontweight='bold')
loss_ax.set_title('📉 Model Training Progress: Loss vs Epochs', fontsize=14, fontweight='bold')
loss_ax.legend(frameon=True, fontsize=10)
loss_fig.tight_layout()
plt.show()



In [None]:
# Compute the most likely next words for the current text. This is done inside
# the cell so that top_words / top_probs exist on a fresh kernel.
NUM_TOP_PREDICTIONS = 5
context_ids = tokenizer.texts_to_sequences([text])[0]
context_padded = pad_sequences([context_ids], maxlen=MAX_LEN - 1, padding='pre')
word_probs = model.predict(context_padded, verbose=0)[0]

# Most probable ids first; ids without a word entry are labelled '<PAD>'
best_ids = np.argsort(word_probs)[::-1][:NUM_TOP_PREDICTIONS]
top_words = [tokenizer.index_word.get(int(word_id), '<PAD>') for word_id in best_ids]
top_probs = word_probs[best_ids]

# Plot
plt.figure(figsize=(8, 5))
bars = plt.barh(top_words, top_probs, color=plt.cm.viridis(np.linspace(0.2, 0.8, len(top_words))))
plt.xlabel('Probability', fontsize=12, fontweight='bold')
plt.ylabel('Predicted Words', fontsize=12, fontweight='bold')
plt.title(f'🔮 Top Predictions for: "{text}"', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # most probable word at the top

# Add probability annotations just to the right of each bar
for bar, prob in zip(bars, top_probs):
    plt.text(prob + 0.005, bar.get_y() + bar.get_height()/2, f"{prob:.2f}", va='center', fontsize=10)

plt.tight_layout()
plt.show()
