In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence

# Load the preprocessed text data
df = pd.read_csv('cleaned_paragraphs.csv')
passages = df['Paragraphs'].tolist()  # Ensure column name matches the CSV file

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(passages)
total_words = len(tokenizer.word_index) + 1

# Define the sequence length and batch size
sequence_length = 50
batch_size = 64

class TextDataGenerator(Sequence):
    def __init__(self, passages, tokenizer, sequence_length, batch_size):
        self.passages = passages
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.total_words = len(tokenizer.word_index) + 1

    def __len__(self):
        return int(np.ceil(len(self.passages) * self.sequence_length / self.batch_size))

    def __getitem__(self, index):
        batch_passages = self.passages[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self.generate_sequences(batch_passages)
        return X, y

    def generate_sequences(self, passages):
        X = []
        y = []
        for passage in passages:
            sequences = self.create_sequences(passage)
            for sequence in sequences:
                token_list = self.tokenizer.texts_to_sequences([sequence])[0]
                for i in range(1, len(token_list)):
                    n_gram_sequence = token_list[:i+1]
                    X.append(n_gram_sequence[:-1])
                    y.append(n_gram_sequence[-1])
        X = pad_sequences(X, maxlen=self.sequence_length, padding='pre')
        y = to_categorical(y, num_classes=self.total_words)
        return np.array(X), np.array(y)

    def create_sequences(self, text):
        words = text.split()
        sequences = []
        for i in range(len(words) - self.sequence_length):
            seq = words[i:i+self.sequence_length+1]
            sequences.append(' '.join(seq))
        return sequences

# Split the data
train_passages, val_passages = train_test_split(passages, test_size=0.1, random_state=42)

# Create data generators
train_generator = TextDataGenerator(train_passages, tokenizer, sequence_length, batch_size)
val_generator = TextDataGenerator(val_passages, tokenizer, sequence_length, batch_size)

# Build the model
model = Sequential([
    Embedding(total_words, 100, input_length=sequence_length-1),
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Define model checkpoint callback with .keras extension
checkpoint = ModelCheckpoint('model_checkpoint.keras', save_best_only=True)

# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=50,
    verbose=1,
    callbacks=[checkpoint]
)

# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = token_list[-max_sequence_len+1:] if len(token_list) > max_sequence_len else token_list
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_index = np.argmax(predicted, axis=-1)[0]

        # Ensure the predicted index is valid
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break

        # Handle case where no valid word is found
        if not output_word:
            break
        
        seed_text += " " + output_word

    return seed_text

# Generate text
print(generate_text("Your seed text here", 50, model, tokenizer, sequence_length))




MemoryError: 