In [None]:
# Mount Google Drive inside the Colab VM at /content/drive. The dataset read
# later in this notebook lives under /content/drive/MyDrive, so this cell
# must run before the loading cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Read the cleaned corpus from Drive (parsed as CSV despite the .xls suffix)
# and keep the 'sentences' column as a plain Python list.
df = pd.read_csv('/content/drive/MyDrive/project/dataset/cleaned_sentences.xls')
sentences = df.loc[:, 'sentences'].to_list()


In [None]:
# Cap the corpus at the first 300,000 sentences and report how many remain.
sentences = sentences[0:300_000]
print(len(sentences))

300000


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


# Step 2: Tokenization - build the word -> index vocabulary from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Vocabulary size, plus one slot because word indices start at 1
# (index 0 is left for padding).
total_words = 1 + len(tokenizer.word_index)

# Step 3: Longest tokenized sentence in the dataset.
# Every training sequence is later padded to this length.
# The sequences are streamed one at a time so only their lengths are kept.
max_sequence_len = max(
    len(token_ids)
    for token_ids in tokenizer.texts_to_sequences_generator(sentences)
)

# Step 4: Create a data generator that yields batches of sequences and their corresponding labels
def data_generator(sentences, tokenizer, max_sequence_len, batch_size):
    """Endlessly yield ``(X, y)`` next-word training batches.

    Each sentence is tokenized and expanded into its n-gram prefixes
    (first 2 tokens, first 3 tokens, ...). Every prefix is left-padded to
    ``max_sequence_len``; the last token becomes the label and the rest
    becomes the input.

    Args:
        sentences: Iterable of raw text sentences. It is iterated once per
            pass, so it must be re-iterable (e.g. a list).
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        max_sequence_len: Length every n-gram is padded to (input + label).
        batch_size: Number of samples per yielded batch; must be >= 1.

    Yields:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n, max_sequence_len - 1)`` and ``y`` is the one-hot label matrix
        of shape ``(n, len(tokenizer.word_index) + 1)``. ``n`` equals
        ``batch_size`` except for the final, possibly smaller, batch of
        each pass over ``sentences``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if a complete
            pass over ``sentences`` produces no training sample (no
            sentence has at least two known tokens). Without this check
            the generator would loop forever without yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Derive the number of classes from the tokenizer that was passed in
    # instead of relying on a module-level variable.
    num_classes = len(tokenizer.word_index) + 1

    def _to_batch(ngrams):
        # Pad the whole batch in a single call rather than once per n-gram.
        padded = pad_sequences(ngrams, maxlen=max_sequence_len, padding='pre')
        # One-hot labels, as expected by a categorical cross-entropy loss.
        labels = tf.keras.utils.to_categorical(padded[:, -1], num_classes=num_classes)
        return padded[:, :-1], labels

    pending = []  # n-gram prefixes collected for the batch being built

    while True:  # Infinite loop to continuously generate batches
        produced_any = False
        for line in sentences:
            # Convert the sentence into a list of word indices
            token_list = tokenizer.texts_to_sequences([line])[0]
            # One training sample per prefix of length 2..len(token_list)
            for end in range(2, len(token_list) + 1):
                pending.append(token_list[:end])
                produced_any = True

                # If the batch is full, yield it and start a new one
                if len(pending) == batch_size:
                    yield _to_batch(pending)
                    pending = []

        # Leftover samples that did not fill a batch are yielded as a smaller batch
        if pending:
            yield _to_batch(pending)
            pending = []

        if not produced_any:
            raise ValueError(
                "data_generator produced no samples: every sentence has fewer than two known tokens"
            )

# Step 5: Set the batch size
batch_size = 328

# Step 6: Calculate steps per epoch
# The generator emits one sample per n-gram prefix, i.e. (tokens - 1) samples
# for a sentence with `tokens` tokens (none for sentences shorter than two
# tokens), so counting raw tokens would overshoot by one sample per sentence.
total_samples = sum(
    max(len(token_ids) - 1, 0)
    for token_ids in tokenizer.texts_to_sequences_generator(sentences)
)
# Round up: the generator also yields the final partial batch of each pass,
# so one epoch then corresponds to exactly one pass over the data.
steps_per_epoch = max(1, -(-total_samples // batch_size))

# Step 7: Wrap the Python generator in a tf.data pipeline
def _make_batch_stream():
    """Start a fresh batch generator each time tf.data requests one."""
    return data_generator(sentences, tokenizer, max_sequence_len, batch_size)

# Batches have a variable first dimension (the last batch of a pass may be smaller).
feature_spec = tf.TensorSpec(shape=(None, max_sequence_len-1), dtype=tf.int32)  # padded inputs
label_spec = tf.TensorSpec(shape=(None, total_words), dtype=tf.float32)  # one-hot labels

# Prefetch so batch preparation overlaps with training steps.
train_dataset = tf.data.Dataset.from_generator(
    _make_batch_stream,
    output_signature=(feature_spec, label_spec),
).prefetch(buffer_size=tf.data.AUTOTUNE)

# Step 8: Define the LSTM model
model = Sequential([
    # Explicit input layer: each sample is a padded sequence of
    # (max_sequence_len - 1) word indices. This replaces the `input_length`
    # argument of Embedding, which is deprecated in recent Keras releases.
    tf.keras.Input(shape=(max_sequence_len-1,), dtype='int32'),
    # Embedding layer to convert word indices into dense vectors of fixed size (100)
    Embedding(total_words, 100),
    # LSTM layer with 150 units
    LSTM(150),
    # Dense layer with a softmax activation to predict the probability distribution over the vocabulary
    Dense(total_words, activation='softmax')
])

# Step 9: Compile the model
# Categorical cross-entropy matches the one-hot labels produced by the data
# generator; Adam is used as the optimizer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 10: Train the model using the data generator
# Each epoch takes roughly 26 minutes in the recorded run, and the in-memory
# model is lost if the session ends mid-training. Write the full model to
# Drive after every epoch so an interrupted run can be resumed or used as-is.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/drive/MyDrive/project/lstm_next_word_checkpoint.h5',
    save_freq='epoch',
)
model.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    callbacks=[checkpoint_cb],
)

# Step 11: Define a function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the most likely next word for ``text``.

    Args:
        model: Trained model whose ``predict`` returns, for a batch of padded
            index sequences, one probability row over the vocabulary.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        text: Seed text to continue.
        max_sequence_len: Sequence length used during training; the seed is
            padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns:
        The predicted word, or an empty string if no vocabulary word can be
        selected (e.g. the output has no non-padding class).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = np.asarray(model.predict(padded, verbose=0))[0]

    # Index 0 is the padding slot and has no entry in tokenizer.index_word,
    # so it must never be selected; choose the best class among indices >= 1.
    if probabilities.shape[0] < 2:
        return ""
    best_index = int(np.argmax(probabilities[1:])) + 1
    return tokenizer.index_word.get(best_index, "")

# Demo: ask the trained model for the word that follows a short seed phrase
seed_text = "i am a good "
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=seed_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next word prediction: {next_word}")

Epoch 1/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1558s[0m 134ms/step - accuracy: 0.1222 - loss: 6.0830
Epoch 2/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1553s[0m 134ms/step - accuracy: 0.1847 - loss: 5.0882
Epoch 3/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1551s[0m 134ms/step - accuracy: 0.1959 - loss: 4.8772
Epoch 4/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1551s[0m 134ms/step - accuracy: 0.2065 - loss: 4.7033
Epoch 5/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1551s[0m 134ms/step - accuracy: 0.2148 - loss: 4.5606
Epoch 6/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1550s[0m 134ms/step - accuracy: 0.2162 - loss: 4.5531
Epoch 7/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1550s[0m 134ms/step - accuracy: 0.2260 - loss: 4.4336
Epoch 8/10
[1m11604/11604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1550s[0m 134ms/step - 

In [None]:
# Persist the trained model in HDF5 format.
# This cell needs `model` from the training cell to exist in the current
# kernel; running it in a fresh session raises NameError.
save_path = 'lstm_next_word_model.h5'
model.save(filepath=save_path)


NameError: name 'model' is not defined