<a href="https://colab.research.google.com/github/Abrartintoiya018/Genrative_AI_project/blob/main/GENAI_RNN_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Dataset: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books




In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np


In [3]:
## Load and process text

def load_data(file_path):
    """Read a UTF-8 encoded text file and return its entire contents.

    Args:
        file_path: Path of the text file to open.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [4]:
file_path = "harrypoter1.txt"  # Ensure you have this file in your Colab or local directory
raw_text = load_data(file_path)  # corpus exactly as stored on disk
text = raw_text.lower()          # lower-cased copy used by the following cells

In [5]:
#Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Out-Of-Vocabulary (OOV) token: a word that was not seen when the tokenizer was
# fitted is replaced with this marker instead of being ignored. A visible marker
# is used on purpose -- a whitespace-only token cannot be told apart from
# ordinary spacing when it shows up in generated text.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([text])   # analyzes the input text and creates a word index (mapping of words to unique integers)
total_words = len(tokenizer.word_index) + 1  # +1: index 0 is reserved by Keras and never assigned to a word

In [9]:
# Turn the corpus into overlapping fixed-size training windows
seq_length = 50  # Each input sequence contains 50 words
tokens = tokenizer.texts_to_sequences([text])[0]  # the whole text as one flat list of word indices

# Each window holds (seq_length + 1) consecutive tokens:
#   - first seq_length tokens (input): what the model is trained on
#   - last token (target): the label the model tries to predict
window_size = seq_length + 1
input_sequences = [
    tokens[start:start + window_size]
    for start in range(len(tokens) - seq_length)
]

# Pad the windows into a 2-D array, then separate inputs from targets:
# X receives every column except the last, y receives the last column.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=window_size, padding='pre')
)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# One-hot encode the labels. Note: other encodings exist as well,
# e.g. pre-trained word2vec embeddings.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the Simple RNN model
# The input shape is declared with an explicit Input rather than through the
# `input_length` argument of Embedding, which is deprecated in Keras 3.
# Declaring the shape up front keeps the model built as soon as it is created.
model = Sequential([
    tf.keras.Input(shape=(seq_length,)),  # one sample = seq_length word indices
    Embedding(input_dim=total_words, output_dim=64),  # Word embeddings
    SimpleRNN(256, return_sequences=False),  # RNN Layer
    Dense(256, activation='relu'),  # Fully Connected Layer
    Dense(total_words, activation='softmax')  # Output Layer: one probability per vocabulary index
])

# 256 in RNN - The number of hidden units (size of the hidden state vector)
# return_sequences=False  - The RNN will only return the final hidden state after processing the entire sequence

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels in y.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=30, batch_size=128)

# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """Greedily extend ``seed_text`` with words predicted by the trained model.

    On every step the running text is converted to word indices with the
    module-level ``tokenizer``, padded on the left to ``seq_length`` entries
    with ``pad_sequences``, and passed to the module-level ``model``. The word
    with the highest predicted probability is appended to the text.

    Generation stops early when the most likely index has no printable word
    (for example the padding index 0, which has no entry in
    ``tokenizer.index_word``).

    Args:
        seed_text: Text the generation starts from.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space.
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=seq_length, padding='pre')

        predicted_probs = model.predict(model_input, verbose=0)
        # The batch holds a single sample: take the arg-max over its row.
        predicted_index = int(np.argmax(predicted_probs[0]))
        predicted_word = tokenizer.index_word.get(predicted_index, "")

        if not predicted_word.strip():
            # Nothing printable to add. Appending it would only leave stray
            # whitespace in the result, and since the text gains no new word
            # the next steps would be fed the same input again.
            break

        seed_text += " " + predicted_word
    return seed_text

# Generate text using the trained model, starting from a short prompt
sample_text = generate_text("harry looked at")
print(sample_text)

Epoch 1/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 177ms/step - accuracy: 0.0429 - loss: 6.9379
Epoch 2/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 175ms/step - accuracy: 0.0687 - loss: 6.2554
Epoch 3/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 179ms/step - accuracy: 0.1024 - loss: 5.7871
Epoch 4/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 179ms/step - accuracy: 0.1203 - loss: 5.4588
Epoch 5/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 184ms/step - accuracy: 0.1329 - loss: 5.2147
Epoch 6/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 177ms/step - accuracy: 0.1470 - loss: 4.9842
Epoch 7/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 177ms/step - accuracy: 0.1551 - loss: 4.7806
Epoch 8/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 175ms/step - accuracy: 0.1672 - loss: 4.5572
Epoch 9/