<a href="https://colab.research.google.com/github/ArpanChaudhary/Pandas/blob/main/TextGenerationModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [2]:
import pandas as pd

In [3]:
# Load the sentence-pair dataset; the path assumes the CSV has been uploaded to the Colab runtime's /content directory.
df = pd.read_csv('/content/Guj_Data.csv')

In [4]:
# Show (rows, columns) of the freshly loaded frame
df.shape

(500001, 4)

In [5]:
# Preview the first rows, including the 'Correct' and 'Incorrect' text columns used below
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Correct,Incorrect
0,0,0,૯મી ઓગસ્ટ ૨૦૧૬ના રોજ આદિવાસી વિકાસ સંગઠન દ્વાર...,૯મી ઓઘશત ૨૦૧૬ના રૌજ આદિવાસી વિકાસ સંગઠન દ્વારા...
1,1,1,"આ પતાવટની આંતરમાળખા ખૂબ સારી રીતે વિકસિત નથી, ...","આ પતાવટની આંતરમાળખા ખૂબ સારી રીતે વિકસિત નથી, ..."
2,2,2,"વહીવટ બિલ્ડિંગ નજીક પાછળના બાજુ પર, હોટેલ આંતર...","વહીવટ બીલડીઁઘ નજીક પાછળના બાજુ પર, હોટેલ આંતરિ..."
3,3,3,ગુરુવારે સવારે બેંકો ખુલતા પહેલા પ્રતિબંધિત નો...,ઘૂરૂવારૈ શવારૈ બેંકો ખુલતા પહૈલા પ્રતિબંધિત નો...
4,4,4,ઈન્ડિયન આઈડલ 11ના આગામી એપિસોડમાં ઉદિત નારાયણ ...,ઈન્ડિયન આઈડલ 11ના આગામી એપીશૌડમાઁ ઉદિત નારાયણ ...


In [6]:
# Keep only the first 25,000 rows so the corpus stays small enough to train on
df = df.iloc[:25000]

In [7]:
# Build a single training corpus: all 'Incorrect' sentences followed by all
# 'Correct' sentences, space-separated, with missing cells treated as empty strings.
text = ' '.join(
    ' '.join(df[column].fillna('').astype(str))
    for column in ('Incorrect', 'Correct')
)

# Normalise the corpus: lowercase it and strip newline characters
# (no other characters are removed).
text = text.lower()
text = text.replace('\n', '')

# The vocabulary is the sorted set of distinct characters in the corpus
vocab = sorted(set(text))

# Index <-> character lookup tables; indices follow the sorted vocabulary order
idx_to_char = dict(enumerate(vocab))
char_to_idx = {char: idx for idx, char in idx_to_char.items()}

# Encode the whole corpus as an array of character indices
encoded_text = np.array([char_to_idx[char] for char in text])


In [8]:
sequence_length = 100  # Number of context characters the model sees per sample

# Each run of `sequence_length` consecutive characters is one input sample and the
# character immediately after it is that sample's label.
if len(encoded_text) <= sequence_length:
    raise ValueError(
        f"Need more than {sequence_length} encoded characters to build training "
        f"samples, got {len(encoded_text)}"
    )

# Vectorised sliding window instead of a Python loop appending millions of small
# arrays to a list. The final window is dropped because no character follows it,
# which yields exactly len(encoded_text) - sequence_length samples.
# Indices are kept as integers: they are lookups into the Embedding layer, not
# continuous features.
X = np.lib.stride_tricks.sliding_window_view(encoded_text, sequence_length)[:-1].astype(np.int32)
y = encoded_text[sequence_length:].astype(np.int32)

# One-hot encode the labels (y) to match the categorical_crossentropy loss
y = to_categorical(y, num_classes=len(vocab))


In [9]:
# Character-level next-character model:
# embedding -> four stacked LSTMs -> two dense layers -> softmax over the vocabulary.
model = Sequential([
    # Turn each character index into a 128-dimensional dense vector
    Embedding(input_dim=len(vocab), output_dim=128, input_length=sequence_length),

    # LSTM 1 (256 units), emits the full sequence for the next recurrent layer
    LSTM(256, return_sequences=True),
    Dropout(0.3),

    # LSTM 2 (256 units)
    LSTM(256, return_sequences=True),
    Dropout(0.3),

    # LSTM 3 (128 units)
    LSTM(128, return_sequences=True),
    Dropout(0.3),

    # LSTM 4 (128 units), returns only the final step's output
    LSTM(128, return_sequences=False),
    Dropout(0.3),

    # Fully connected head with heavier dropout for regularisation
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.4),

    # Probability distribution over every character in the vocabulary
    Dense(len(vocab), activation='softmax'),
])

# Labels are one-hot encoded, hence categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Print the layer-by-layer summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          27648     
                                                                 
 lstm (LSTM)                 (None, 100, 256)          394240    
                                                                 
 dropout (Dropout)           (None, 100, 256)          0         
                                                                 
 lstm_1 (LSTM)               (None, 100, 256)          525312    
                                                                 
 dropout_1 (Dropout)         (None, 100, 256)          0         
                                                                 
 lstm_2 (LSTM)               (None, 100, 128)          197120    
                                                                 
 dropout_2 (Dropout)         (None, 100, 128)          0

In [12]:
# Fit the model for a single epoch in mini-batches of 64 samples.
# NOTE: the recorded run below was interrupted by hand part-way through the epoch.
model.fit(x=X, y=y, batch_size=64, epochs=1)

 3237/71530 [>.............................] - ETA: 10:57:14 - loss: 2.9346

KeyboardInterrupt: 

In [None]:
def generate_text(seed_text, num_chars):
    """Greedily extend ``seed_text`` by ``num_chars`` characters using the trained model.

    The seed is normalised the same way as the training corpus (lowercased,
    newlines removed) before being encoded, so casing or line breaks in the
    seed do not cause spurious lookup failures. The returned string still
    starts with the seed exactly as the caller supplied it.

    Args:
        seed_text: Text used to prime the model. Only the last
            ``sequence_length`` characters are fed to the model.
        num_chars: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``num_chars`` generated characters.

    Raises:
        KeyError: If the normalised seed contains a character that is not in
            the training vocabulary (``char_to_idx``).
    """
    # Mirror the preprocessing applied to the training text before encoding
    normalized_seed = seed_text.lower().replace('\n', '')

    unknown_chars = sorted({char for char in normalized_seed if char not in char_to_idx})
    if unknown_chars:
        raise KeyError(
            f"seed_text contains characters missing from the training vocabulary: {unknown_chars!r}"
        )

    # Encode the seed and fit it to the model's fixed input width.
    # NOTE(review): seeds shorter than sequence_length are padded with index 0,
    # which is also the index of a real vocabulary character because char_to_idx
    # starts at 0; consider reserving a dedicated padding index.
    input_eval = [char_to_idx[char] for char in normalized_seed]
    input_eval = pad_sequences([input_eval], maxlen=sequence_length, truncating='pre')

    generated_chars = []
    for _ in range(num_chars):
        # verbose=0 stops Keras from printing a progress bar for every character
        predictions = model.predict(input_eval, verbose=0)
        predicted_idx = int(np.argmax(predictions[0]))

        generated_chars.append(idx_to_char[predicted_idx])

        # Slide the window: drop the oldest index and append the prediction
        input_eval = np.append(input_eval[:, 1:], [[predicted_idx]], axis=1)

    # Join once at the end instead of repeatedly concatenating strings in the loop
    return seed_text + ''.join(generated_chars)




In [None]:
# Prime the model with a Gujarati phrase and generate 100 further characters
seed = "આ પતાવટની આંતરમાળખા ખૂબ સારી રીતે વિકસિત "
generated_text = generate_text(seed_text=seed, num_chars=100)
print(generated_text)

In [None]:
# Persist the model to disk (the .h5 extension selects the HDF5 format)
model.save(filepath='Text_generation_model.h5')
print("Model saved successfully.")


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Collect the reference ('Correct') sentences that the tokenizer is fitted on.
# Missing cells are replaced with empty strings and everything is coerced to str,
# matching how the training corpus was built, so the tokenizer never receives a
# float NaN in place of text.
texts = df['Correct'].fillna('').astype(str).tolist()





In [None]:
# Create a Keras Tokenizer limited to 178 entries (num_words) and fit it on `texts`.
# NOTE(review): Tokenizer splits on words unless char_level=True is passed, whereas the
# model above is trained on the character-level char_to_idx mapping. Confirm that this
# tokenizer (rather than char_to_idx / idx_to_char) is what inference code needs.
tokenizer = Tokenizer(num_words=178)

tokenizer.fit_on_texts(texts)




In [None]:
# Serialise the fitted tokenizer to disk using the newest available pickle protocol
tokenizer_path = 'tokenizer.pickle'
with open(tokenizer_path, mode='wb') as handle:
    pickle.dump(
        tokenizer,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
from tensorflow.keras.models import load_model
# Reload the model from the saved HDF5 file to confirm it round-trips
model = load_model('Text_generation_model.h5')
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" after the table.
model.summary()


In [None]:
from google.colab import files

# Download the saved model file from the Colab runtime to the local machine
files.download('Text_generation_model.h5')

In [None]:
# Download the pickled tokenizer from the Colab runtime to the local machine
files.download('tokenizer.pickle')