**Importing Modules**

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer #type:ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences #type: ignore 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau #type: ignore
from tensorflow.keras.utils import Sequence #type: ignore
import tensorflow.keras.utils as ku 
import pandas as pd 
import numpy as np
import string
import pickle
import gc  

**Loading the Training data**

In [13]:
# Location of the Kaggle training split and the number of stories to train on.
TRAIN_CSV_PATH = "/kaggle/input/tinystories-narrative-classification/train.csv"
DATASET_SIZE = 4000

# Only the "text" column of the first DATASET_SIZE rows is ever used, so ask
# pandas to parse just that slice instead of loading the whole file into RAM.
train_data = pd.read_csv(TRAIN_CSV_PATH, usecols=["text"], nrows=DATASET_SIZE)
train_text = train_data["text"].tolist()

# The DataFrame is not needed once the raw strings have been extracted.
del train_data
gc.collect()

5937

**Creating preprocessing function**


In [14]:
def clean_data(text):
    """Normalise a raw story before tokenisation.

    The input is cast to ``str`` and lower-cased, every character listed in
    ``string.punctuation`` is removed, and finally any character that cannot
    be represented in ASCII is dropped.

    Args:
        text: Value to clean; non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-case, ASCII-only string.
    """
    lowered = str(text).lower()
    # A translation table with only a "delete" set strips the punctuation.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    # UTF-8 round trip with errors ignored discards every non-ASCII character.
    return without_punctuation.encode("utf8").decode("ascii", "ignore")

**Applying preprocessing**

In [15]:
# Run every raw story through clean_data before it reaches the tokenizer.
train_text = list(map(clean_data, train_text))
print(f"Number of texts: {len(train_text)}")

Number of texts: 4000


**Setup tokenizer and parameters**

In [16]:
# Upper bounds for the vocabulary size and for the training sequence length.
MAX_VOCAB_SIZE = 8000
MAX_SEQUENCE_LENGTH = 60

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_text)

# One slot more than the number of known words, but never above the cap.
total_words = min(len(tokenizer.word_index) + 1, MAX_VOCAB_SIZE)
print(f"Vocabulary size: {total_words}")

# Token counts of the first 500 stories serve as a sample of the length
# distribution.
sequence_lengths = [
    len(tokens) for tokens in tokenizer.texts_to_sequences(train_text[:500])
]

# Use the 95th percentile of the sampled lengths, capped at MAX_SEQUENCE_LENGTH.
length_p95 = int(np.percentile(sequence_lengths, 95))
max_sequence_len = min(length_p95, MAX_SEQUENCE_LENGTH)
print(f"Max sequence length: {max_sequence_len}")

Vocabulary size: 7380
Max sequence length: 60


**Data Generator**

In [17]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields (prefix, next-word) batches.

    For every text, sample ``i`` (``1 <= i <= max_sequences_per_text``) uses the
    first ``i`` tokens as the model input and the token at position ``i`` as
    the one-hot encoded target.

    Args:
        texts: Indexable collection of (already cleaned) strings.
        tokenizer: Fitted Keras ``Tokenizer`` used to turn texts into ids.
        max_sequence_len: Inputs are pre-padded/truncated to
            ``max_sequence_len - 1`` tokens.
        batch_size: Number of samples per batch.
        num_classes: Width of the one-hot target. When omitted it is
            ``len(tokenizer.word_index) + 1``, capped by ``tokenizer.num_words``
            if the tokenizer has such a limit, so that it matches the range of
            ids the tokenizer can actually emit.
        max_sequences_per_text: Maximum number of samples taken from one text.
        **kwargs: Forwarded unchanged to the base-class constructor.
    """

    def __init__(self, texts, tokenizer, max_sequence_len, batch_size=32,
                 num_classes=None, max_sequences_per_text=50, **kwargs):
        # Newer Keras versions expect the base-class constructor to be called.
        super().__init__(**kwargs)

        self.texts = texts
        self.tokenizer = tokenizer
        self.max_sequence_len = max_sequence_len
        self.batch_size = batch_size

        if num_classes is None:
            num_classes = len(tokenizer.word_index) + 1
            vocab_cap = getattr(tokenizer, "num_words", None)
            if vocab_cap:
                # Keep the target width in line with a vocabulary-capped model.
                num_classes = min(num_classes, vocab_cap)
        self.num_classes = num_classes

        # Tokenise each text exactly once and cache the result, instead of
        # re-tokenising the same text for every sample in __getitem__.
        self.token_lists = []
        self.sequence_indices = []
        for text_idx, token_list in enumerate(tokenizer.texts_to_sequences(texts)):
            # Sample i needs tokens [0, i], so anything past index
            # max_sequences_per_text is never read.
            token_list = token_list[:max_sequences_per_text + 1]
            self.token_lists.append(token_list)
            self.sequence_indices.extend(
                (text_idx, seq_end) for seq_end in range(1, len(token_list))
            )

        print(f"Total sequences: {len(self.sequence_indices)}")

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.sequence_indices) / self.batch_size))

    def __getitem__(self, idx):
        """Build batch ``idx`` as ``(X, y)``.

        ``X`` has shape ``(n, max_sequence_len - 1)`` and ``y`` has shape
        ``(n, num_classes)``. If ``idx`` selects no samples, a single all-zero
        placeholder sample is returned.
        """
        input_len = self.max_sequence_len - 1
        batch_indices = self.sequence_indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        if not batch_indices:
            return np.zeros((1, input_len)), np.zeros((1, self.num_classes))

        X, y = [], []
        for text_idx, seq_end in batch_indices:
            token_list = self.token_lists[text_idx]
            X.append(token_list[:seq_end])   # prefix fed to the model
            y.append(token_list[seq_end])    # word that follows the prefix

        X = pad_sequences(X, maxlen=input_len, padding='pre')
        y = ku.to_categorical(y, num_classes=self.num_classes)

        return X, y

**Creating the data generator**

In [18]:
# Number of (prefix, next-word) samples per training batch.
BATCH_SIZE = 64

train_generator = DataGenerator(
    texts=train_text,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
    batch_size=BATCH_SIZE,
)

print(f"Number of batches per epoch: {len(train_generator)}")

Total sequences: 199997
Number of batches per epoch: 3125


**Building the model**

In [19]:
from tensorflow.keras.models import Sequential #type: ignore
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization

# Each sample feeds max_sequence_len - 1 tokens to the network; the remaining
# position is the word to be predicted.
input_len = max_sequence_len - 1

model = Sequential([
    # Embedding layer followed by light dropout.
    Embedding(total_words, 32),
    Dropout(0.1),

    # 1st LSTM layer: emits the full sequence so the next LSTM can consume it.
    LSTM(256, return_sequences=True),
    Dropout(0.3),
    BatchNormalization(),

    # 2nd LSTM layer: emits only its final output.
    LSTM(128),
    Dropout(0.3),
    BatchNormalization(),

    # Dense layer
    Dense(256, activation='relu'),

    # Output layer: one probability per vocabulary entry.
    Dense(total_words, activation='softmax'),
])

# The input shape is supplied here rather than through the Embedding layer's
# deprecated `input_length` argument.
model.build(input_shape=(None, input_len))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




**Setting up callbacks**

In [20]:
# Early stopping: give up after 5 epochs in which the training loss has not
# improved by at least 0.001, and ask Keras to restore the best weights.
early_stopping = EarlyStopping(
    monitor="loss",
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# Model checkpoint: keep only the model with the lowest training loss so far.
checkpoint = ModelCheckpoint(
    filepath='best_story_model.keras',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Learning-rate schedule: halve the rate after 3 epochs without improvement in
# the training loss, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor="loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

In [None]:
# train_generator already yields complete batches of BATCH_SIZE samples, so
# `batch_size` must not be passed to fit() for this kind of input.
hist = model.fit(
    train_generator,
    epochs=150,
    callbacks=[early_stopping, checkpoint, reduce_lr],
    verbose=1,
)

Epoch 1/150
[1m3123/3125[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - accuracy: 0.2950 - loss: 3.9463
Epoch 1: loss improved from inf to 3.88999, saving model to best_story_model.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.2950 - loss: 3.9462 - learning_rate: 0.0010
Epoch 2/150
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.3172 - loss: 3.6729
Epoch 2: loss improved from 3.88999 to 3.65106, saving model to best_story_model.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 16ms/step - accuracy: 0.3172 - loss: 3.6729 - learning_rate: 0.0010
Epoch 3/150
[1m3122/3125[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - accuracy: 0.3308 - loss: 3.5236
Epoch 3: loss improved from 3.65106 to 3.50485, saving model to best_story_model.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.3308

**Saving the tokenizer and the basic model parameters**

In [24]:
# Parameters needed to rebuild the model inputs at inference time.
model_params = {
    'max_sequence_len': max_sequence_len,
    'total_words': total_words,
    'vocab_size': MAX_VOCAB_SIZE
}

# Pickle the fitted tokenizer first, then the parameter dictionary.
for pickle_path, payload in (
    ('tokenizer.pickle', tokenizer),
    ('model_params.pickle', model_params),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved!")

Model and tokenizer saved!


**Creating the story generation function**

In [25]:
def generate_text(seed_text, next_words=50, model=model, tokenizer=tokenizer, max_sequence_len=max_sequence_len):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Args:
        seed_text: Text the story starts from. It is cleaned with
            ``clean_data`` for tokenisation only; the returned string keeps
            the seed exactly as given.
        next_words: Maximum number of words to append.
        model: Trained model returning one probability per vocabulary id.
            The default is bound to the global ``model`` that exists when
            this function is defined.
        tokenizer: Fitted tokenizer matching ``model`` (default bound likewise).
        max_sequence_len: Sequence length used in training; the model receives
            the last ``max_sequence_len - 1`` tokens.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the predicted id has no associated word.
    """
    input_len = max_sequence_len - 1

    # Tokenise the seed once; predicted ids are appended directly afterwards,
    # so the growing text never has to be cleaned and tokenised again.
    token_ids = tokenizer.texts_to_sequences([clean_data(seed_text)])[0]

    for _ in range(next_words):
        # Only the most recent input_len tokens fit into the model input.
        model_input = pad_sequences([token_ids[-input_len:]], maxlen=input_len, padding='pre')

        # Predict next word
        predicted = model.predict(model_input, verbose=0)
        predicted_id = int(np.argmax(predicted))

        # Direct id -> word lookup instead of scanning word_index linearly.
        output_word = tokenizer.index_word.get(predicted_id, "")
        if not output_word:
            break

        seed_text += " " + output_word
        token_ids.append(predicted_id)

    return seed_text

# Example usage: continue a seed sentence by up to 100 generated words.
sample_text = "Under the glowing blue moon, the tiny dragon peeked out from the enchanted forest and saw"
generated_story = generate_text(seed_text=sample_text, next_words=100)
print("Generated story:", generated_story, sep="\n")

Generated story:
Under the glowing blue moon, the tiny dragon peeked out from the enchanted forest and saw a little boy named jack he was very excited and asked his mom if he could go on a big tree but the little girl said yes i dont want to play asked me me me me me me me me me me me me me me pointing at the ground thats so much the little girl nodded and nodded the little girl was so excited she asked her mom if she could go to the park and get a big smile she said yes but the little girl was so happy she asked her mom if she could go
