<a href="https://colab.research.google.com/github/Bhuvanaa26/sravani-nlp-assignment/blob/main/NLP_assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
import nltk

# 1. Download and Load the Reuters Dataset from NLTK
nltk.download('punkt')
nltk.download('reuters')

# Load a subset of the Reuters dataset for this example.
# Whole words are accumulated until the character budget is reached, so the
# subset never ends on a half-cut word (a raw character slice could leave a
# fragment that would become a bogus vocabulary entry), and the full corpus
# does not have to be joined into one string first.
max_chars = 1000
subset_words = []
char_count = 0
for corpus_word in reuters.words(categories='crude'):
    # +1 accounts for the joining space before every word except the first.
    needed = len(corpus_word) + (1 if subset_words else 0)
    if char_count + needed > max_chars:
        break
    subset_words.append(corpus_word)
    char_count += needed
text = " ".join(subset_words)

# 2. Text Preprocessing using NLTK
# Tokenize the (lower-cased) text
tokens = word_tokenize(text.lower())
if len(tokens) < 2:
    # Each training example needs at least one context word plus a target word.
    raise ValueError("Need at least two tokens to build next-word training sequences")

# Create a mapping of words to integers.
# Ids start at 1 because 0 is the value pad_sequences uses for padding.
# The vocabulary is sorted so the word -> id assignment is identical on every
# run (iterating a bare set yields an arbitrary, run-dependent order).
word_index = {word: i + 1 for i, word in enumerate(sorted(set(tokens)))}
total_words = len(word_index) + 1

# Convert tokens to integer ids once, then take every prefix of length >= 2.
# Each prefix is one training example: all but the last id are the context,
# the last id is the word to predict.
encoded_tokens = [word_index[word] for word in tokens]
input_sequences = [encoded_tokens[:end] for end in range(2, len(encoded_tokens) + 1)]

# Pad sequences (on the left) to ensure they all have the same length
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Split into features (context ids) and labels (next-word id, one-hot encoded)
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# 3. Define the Model
# Embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential()
# Declare the input shape with an explicit Input layer instead of the
# deprecated Embedding(input_length=...) argument, so the model is built
# up-front and model.summary() can report shapes and parameter counts.
model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 64))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# 4. Train the Model
history = model.fit(X, y, epochs=100, verbose=1)

# 5. Generate Text Using the Trained Model
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` with up to ``next_words`` greedily predicted words.

    Uses the module-level ``model`` and ``word_index``.

    Args:
        seed_text: Prompt to continue. It is lower-cased and tokenized with
            ``word_tokenize``; words missing from ``word_index`` are encoded
            as 0, the padding id.
        next_words: Maximum number of words to append.
        max_sequence_len: Sequence length used when the training data was
            padded; the model input is padded/truncated to
            ``max_sequence_len - 1`` ids.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an id that has
        no word (e.g. the padding id 0).
    """
    # Invert the vocabulary once instead of scanning it for every new word.
    index_word = {index: word for word, index in word_index.items()}

    # Encode the prompt once; predicted ids are appended to this list so the
    # growing text does not have to be re-tokenized on every step.
    token_ids = [word_index.get(word, 0) for word in word_tokenize(seed_text.lower())]

    for _ in range(next_words):
        padded = pad_sequences([token_ids], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 keeps Keras from printing a progress bar for every word.
        probabilities = model.predict(padded, verbose=0)
        # argmax over a (1, vocab) array yields a 1-element array; take the scalar.
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        output_word = index_word.get(predicted)
        if output_word is None:
            # No word maps to this id, so there is nothing meaningful to append.
            # The context would be unchanged on the next step, so stop here.
            break

        token_ids.append(predicted)
        seed_text += " " + output_word
    return seed_text

# 6. Sample Output
# Continue a short prompt by ten words and show the result.
seed_text = "the gold rate is"
generated_text = generate_text(seed_text, 10, max_sequence_len)
print("Generated Text: ", generated_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


Epoch 1/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 8s 258ms/step - accuracy: 0.0387 - loss: 4.5743
Epoch 2/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 251ms/step - accuracy: 0.0965 - loss: 4.5559
Epoch 3/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 319ms/step - accuracy: 0.0692 - loss: 4.4719
Epoch 4/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 3s 470ms/step - accuracy: 0.0835 - loss: 4.2471
Epoch 5/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 4s 259ms/step - accuracy: 0.0798 - loss: 4.2519
Epoch 6/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 244ms/step - accuracy: 0.0644 - loss: 4.2848
Epoch 7/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 3s 274ms/step - accuracy: 0.0724 - loss: 4.2994
Epoch 8/100
6/6 ━━━━━━━━━━━━━━━━━━━━ 3s 398ms/step - accuracy: 0.1003 - loss: 4.2543
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━