<a href="https://colab.research.google.com/github/ARJUN108-verma/Elite_Tech_internship/blob/main/GENERATIVE_TEXT_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

GENERATIVE TEXT MODEL

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import re

In [2]:
# Set random seeds for reproducibility.
# The stdlib `random` generator is seeded as well because load_and_preprocess_data
# uses random.randint to choose which window of the corpus is sampled.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
# Step 1: Load and preprocess the training data
def load_and_preprocess_data(file_path, sample_size=5000):
    """Read a UTF-8 text file and return a cleaned sample of its contents.

    Args:
        file_path: Path of the text file to read.
        sample_size: Maximum number of raw characters to keep. When the file
            is longer, a contiguous window of exactly this many characters is
            chosen at a random offset (using the stdlib ``random`` generator).

    Returns:
        The sampled text, lowercased, with every character that is neither a
        word character nor whitespace removed, and each run of whitespace
        (newlines included) collapsed to a single space.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Sample a portion of the text if it's too large
    if len(text) > sample_size:
        # random.randint includes both endpoints, so the largest valid start
        # is len(text) - sample_size (the window that ends on the last char).
        start_idx = random.randint(0, len(text) - sample_size)
        text = text[start_idx:start_idx + sample_size]

    # Basic cleaning
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace

    return text

# For this example, let's use a sample text file
# In practice, you would use a larger corpus related to your specific topics
try:
    text = load_and_preprocess_data('sample_text.txt')
except OSError as err:
    # Only a missing/unreadable file triggers the fallback; any other error
    # (e.g. a bug in the loader) is allowed to surface instead of being hidden.
    print(f"Could not read 'sample_text.txt' ({err}); using the built-in fallback text.")
    text = """
    Artificial intelligence is transforming many industries. Machine learning algorithms can now
    recognize patterns in data that humans might miss. Deep learning models like neural networks
    are particularly powerful for tasks like image recognition and natural language processing.
    The field of AI continues to advance rapidly with new architectures being developed regularly.
    Researchers are working on making AI systems more explainable and trustworthy. Ethical
    considerations in AI development are becoming increasingly important as these technologies
    are deployed in sensitive areas like healthcare and criminal justice.
    """


In [4]:
# Step 2: Fit a word-level tokenizer on the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Vocabulary size plus one; the extra slot presumably covers index 0, which no
# entry of word_index uses -- confirm against the Tokenizer documentation.
total_words = 1 + len(tokenizer.word_index)

In [5]:
# Build the training n-grams: for every non-blank line of the corpus, emit each
# token prefix of length 2 up to the full line length.
input_sequences = []
non_blank_lines = (raw for raw in text.split('\n') if raw.strip())
for line in non_blank_lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [6]:
# Left-pad every n-gram to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [7]:
# Split each padded row: everything but the last token is the input,
# the last token is the target, which is then one-hot encoded.
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [8]:
# Step 3: Build the LSTM model
# The input shape is declared with an explicit Input instead of the Embedding
# `input_length` argument (deprecated in Keras 3), so the model is built --
# and its summary is fully populated -- before training starts.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    LSTM(150, return_sequences=True),
    LSTM(100),
    Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that is what produced the stray "None" output).
model.summary()



None


In [9]:
# Step 4: Fit the model on the n-gram inputs and their one-hot targets
# Note: In a real scenario, you'd train for more epochs with a larger dataset
history = model.fit(
    x=X,
    y=y,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.0000e+00 - loss: 4.2923
Epoch 2/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.0819 - loss: 4.2793
Epoch 3/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.0923 - loss: 4.2690
Epoch 4/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.0637 - loss: 4.2494
Epoch 5/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.0572 - loss: 4.2057
Epoch 6/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.0390 - loss: 4.1649
Epoch 7/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.0702 - loss: 4.1427
Epoch 8/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.0988 - loss: 4.0895
Epoch 9/50
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [10]:
# Step 5: Text generation function
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    Uses the module-level ``tokenizer`` to encode the running text and to map
    the predicted index back to a word.

    Args:
        seed_text: Prompt to continue.
        next_words: Maximum number of words to append.
        model: Object whose ``predict`` returns one row of scores per input row.
        max_sequence_len: Sequence length used for padding during training;
            the model input is padded/truncated to ``max_sequence_len - 1``.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no word (e.g. the
        padding index): the input would not change, so every later step would
        repeat the same prediction and only add blank padding to the text.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        # A single row was fed in; reduce its argmax to a plain Python int so
        # it can be used directly as a dictionary key.
        predicted_index = int(np.argmax(predicted_probs, axis=-1)[0])

        # Direct reverse lookup instead of scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            break
        seed_text += " " + output_word
    return seed_text


In [None]:
# Step 6: Interactive text generation
# Reads prompts until the user types 'quit'; the loop also ends cleanly when
# stdin is closed or the cell is interrupted while waiting for input.
print("\nText Generation Demo")
print("Enter a seed phrase or prompt (type 'quit' to exit)")

while True:
    try:
        # Strip so that inputs such as "quit " or " quit" are still recognised.
        user_input = input("\nEnter your prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\nExiting text generation demo.")
        break

    if user_input.lower() == 'quit':
        break
    if not user_input:
        # An empty prompt gives the model nothing to condition on.
        print("Prompt is empty; please enter some text.")
        continue

    try:
        generated_text = generate_text(
            seed_text=user_input,
            next_words=50,  # Number of words to generate
            model=model,
            max_sequence_len=max_sequence_len
        )
        print("\nGenerated text:")
        print(generated_text)
    except Exception as e:
        # Best-effort demo loop: report the failure and keep prompting.
        print(f"Error generating text: {e}")
        print("Please try a different prompt.")


Text Generation Demo
Enter a seed phrase or prompt (type 'quit' to exit)

Enter your prompt: The future of AI

Generated text:
The future of AI continues continues to advance rapidly with new architectures being developed regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly networks networks networks networks developed developed developed developed developed regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly regularly networks networks networks networks developed
