# **Text Generation using RNN**

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pandas as pd
import re
import numpy as np
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **1. Load and Combining all the csv files**

In [4]:
# Path to the folder containing CSV files
folder_path = '/content/drive/MyDrive/Natural Language Processing(NLP)/My Practice/Text Generation/dataset'

# List to store data from all CSV files
all_texts = []

# Loop through each file in the folder
for file in os.listdir(folder_path):
  if file.endswith('.csv'):
    file_path = os.path.join(folder_path, file)

    # Load the csv file into a dataframe
    df = pd.read_csv(file_path)

    if 'Input Text' in df.columns:
      all_texts.extend(df['Input Text'].dropna().tolist())
    else:
      print(f"Column 'Input Text' not found in {file}")

# Combine all texts into a single string
combined_text = " ".join(all_texts)

print(f"Total characters in combined text: {len(combined_text)}")

Total characters in combined text: 13044


### **2. Data Preprocessing**

In [5]:
# Text cleaning (remove any special characters)
cleaned_text = re.sub(r'[^a-zA-Z\s]', '', combined_text).lower()

# Tokenization and encoding
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts([cleaned_text])

# Convert text to sequences
text_sequences = tokenizer.texts_to_sequences([cleaned_text])[0]
vocab_size = len(tokenizer.word_index) + 1

print(f"Vocabulary Size: {vocab_size}")
print(f"Sample Encoded Sequence: {text_sequences[:10]}")

Vocabulary Size: 28
Sample Encoded Sequence: [11, 12, 3, 15, 6, 4, 2, 1, 11, 10]


In [6]:
print(tokenizer.word_index)

{' ': 1, 'e': 2, 'i': 3, 't': 4, 's': 5, 'a': 6, 'r': 7, 'o': 8, 'n': 9, 'h': 10, 'c': 11, 'l': 12, 'd': 13, 'f': 14, 'm': 15, 'p': 16, 'u': 17, 'g': 18, 'b': 19, 'w': 20, 'v': 21, 'k': 22, 'y': 23, 'z': 24, 'j': 25, 'x': 26, 'q': 27}


### **3. Data Preparation for RNN**

In [7]:
sequence_length = 100
X, y = [], []

for i in range(len(text_sequences) - sequence_length):
  X.append(text_sequences[i:i + sequence_length])
  y.append(text_sequences[i + sequence_length])

# Convert to Numpy arrays
X = np.array(X)
y = np.array(y)

# One-hot encode the labels
y = to_categorical(y, num_classes=vocab_size)

print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

Shape of X: (12912, 100), Shape of y: (12912, 28)


### **4. Defining the RNN model**

In [8]:
# Define the model
model = Sequential([
    Embedding(vocab_size, 50, input_length=sequence_length),
    LSTM(150, return_sequences=True),
    LSTM(150, return_sequences=True),
    LSTM(150, return_sequences=True),
    LSTM(150),
    Dense(vocab_size, activation='softmax')
])

model.build(input_shape=(None, sequence_length))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### **5. Train the model**

In [9]:
model.fit(X, y, epochs=100, batch_size=32)

Epoch 1/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.1726 - loss: 2.8504
Epoch 2/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.1831 - loss: 2.7796
Epoch 3/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.1839 - loss: 2.7883
Epoch 4/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.1738 - loss: 2.8000
Epoch 5/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.1861 - loss: 2.7771
Epoch 6/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.2932 - loss: 2.4277
Epoch 7/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.3998 - loss: 2.0295
Epoch 8/100
[1m404/404[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.4837 - loss: 1.7439
Epoch 9/100
[1m404

<keras.src.callbacks.history.History at 0x7d214baf4110>

### **6. Generate Text**

In [10]:
def generate_text(text, model, tokenizer, length):
  new_text = text

  for _ in range(length):
    encoded = tokenizer.texts_to_sequences([new_text[-sequence_length:]])[0]
    encoded = np.array(encoded).reshape(1, -1)
    prediction = np.argmax(model.predict(encoded), axis=1)
    new_text += tokenizer.index_word[prediction[0]]

  return new_text

text = "the quick brown fox"
generated_text = generate_text(text, model, tokenizer, 100)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 