In [None]:
# 1. How to implement a simple text classification model using LSTM in Keras0

In [1]:
# Import Libraries
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Sample Dataset
texts = [
    "This is a positive review",
    "I didn't like the product",
    "Absolutely amazing experience",
    "Worst service ever received",
    "Highly recommend this item",
]
labels = ["positive", "negative", "positive", "negative", "positive"]

# Step 2: Encode Labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)  # Convert to numerical format

# Step 3: Tokenize and Pad Sequences
tokenizer = Tokenizer(num_words=5000)  # Keep only top 5000 words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=10, padding='post', truncating='post')

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels_encoded, test_size=0.2, random_state=42
)

# Step 5: Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=10),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Use 'sigmoid' for binary classification
])

# Step 6: Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

# Step 8: Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Step 9: Make Predictions
sample_texts = ["I love this!", "This was awful."]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_padded = pad_sequences(sample_sequences, maxlen=10, padding='post', truncating='post')
predictions = model.predict(sample_padded)
predicted_labels = ["positive" if p > 0.5 else "negative" for p in predictions]
print(predicted_labels)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 0.7016 - val_accuracy: 0.0000e+00 - val_loss: 0.7091
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step - accuracy: 0.7500 - loss: 0.6837 - val_accuracy: 0.0000e+00 - val_loss: 0.7225
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.7500 - loss: 0.6734 - val_accuracy: 0.0000e+00 - val_loss: 0.7360
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.7500 - loss: 0.6692 - val_accuracy: 0.0000e+00 - val_loss: 0.7500
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - accuracy: 0.7500 - loss: 0.6675 - val_accuracy: 0.0000e+00 - val_loss: 0.7645
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.7500 - loss: 0.6589 - val_accuracy: 0.0000e+00 - val_loss: 0.7797
Epoch 7/10
[1m1/1[0m [32m

In [2]:
# 2.How to generate sequences of text using a Recurrent Neural Network (RNN)

In [3]:
# Import Libraries
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Sample Text Data
text = """
Recurrent Neural Networks are widely used for sequence data.
They excel in natural language processing, time series, and much more.
"""

# Step 2: Preprocess Text
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1  # Total words including index 0 for padding

# Generate Sequences
input_sequences = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad Sequences
max_sequence_length = max([len(seq) for seq in input_sequences])
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Extract Features (X) and Labels (y)
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)  # One-hot encoding labels

# Step 3: Build the RNN Model
model = Sequential([
    Embedding(input_dim=total_words, output_dim=50, input_length=max_sequence_length-1),
    SimpleRNN(100, return_sequences=False),
    Dense(total_words, activation='softmax')
])

# Compile the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 4: Train the Model
model.fit(X, y, epochs=100, verbose=1)

# Step 5: Generate Text
def generate_text(seed_text, num_words):
    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate Sequence
seed_text = "Recurrent Neural Networks"
generated_text = generate_text(seed_text, num_words=10)
print("Generated Text:")
print(generated_text)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0556 - loss: 3.0805
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1667 - loss: 3.0196
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.2778 - loss: 2.9622
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.3333 - loss: 2.9060
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.4444 - loss: 2.8494
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5556 - loss: 2.7912
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.5556 - loss: 2.7303
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.6667 - loss: 2.6657
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [4]:
# 3. How to perform sentiment analysis using a simple CNN model0

In [5]:
# Import Libraries
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Sample Data
texts = [
    "I love this product!",
    "This is the worst service I've ever experienced.",
    "Absolutely fantastic performance.",
    "I will never buy this again.",
    "Highly recommend this to everyone!",
    "Not worth the price.",
]
labels = ["positive", "negative", "positive", "negative", "positive", "negative"]

# Step 2: Encode Labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Step 3: Tokenize Text and Pad Sequences
max_vocab_size = 5000  # Maximum vocabulary size
max_sequence_length = 100  # Maximum sequence length

tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels_encoded, test_size=0.2, random_state=42
)

# Step 5: Build the CNN Model
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=50, input_length=max_sequence_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')  # Use 'sigmoid' for binary classification
])

# Step 6: Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)

# Step 8: Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

# Step 9: Make Predictions
sample_texts = ["I absolutely love this!", "This was a terrible experience."]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_padded = pad_sequences(sample_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
predictions = model.predict(sample_padded)
predicted_labels = ["positive" if p > 0.5 else "negative" for p in predictions]
print("Predicted Sentiments:", predicted_labels)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.2500 - loss: 0.6902 - val_accuracy: 0.5000 - val_loss: 0.6942
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 0.5000 - loss: 0.6910 - val_accuracy: 0.5000 - val_loss: 0.6960
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.5000 - loss: 0.6691 - val_accuracy: 0.5000 - val_loss: 0.6970
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.5000 - loss: 0.6655 - val_accuracy: 0.5000 - val_loss: 0.6969
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.5000 - loss: 0.6631 - val_accuracy: 0.5000 - val_loss: 0.6968
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.5000 - loss: 0.6631 - val_accuracy: 0.5000 - val_loss: 0.6972
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [None]:
# 4. ,G How to perform Named Entity Recognition (NER) using spaCy0



In [6]:
# Import spaCy library
import spacy

# Load pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Example input text
text = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii."

# Process the text using spaCy NLP pipeline
doc = nlp(text)

# Print out named entities, their corresponding labels
print("Named Entities, Phrases, and Concepts:")
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")


Named Entities, Phrases, and Concepts:
Barack Obama (PERSON)
August 4, 1961 (DATE)
Honolulu (GPE)
Hawaii (GPE)


In [11]:
# 5. How to implement a simple Seq2Seq model for machine translation using LSTM in Keras?

In [12]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# 1. Data Preparation (Dummy example of English to French translation)
input_texts = ["Hello", "How are you?", "Good morning", "Good night"]
output_texts = ["Bonjour", "Comment ça va?", "Bonjour", "Bonne nuit"]

# Parameters
input_vocab_size = 100  # Size of source vocabulary (e.g., English)
output_vocab_size = 100  # Size of target vocabulary (e.g., French)
max_input_length = max([len(sentence.split()) for sentence in input_texts])
max_output_length = max([len(sentence.split()) for sentence in output_texts])

# Tokenize and Pad
def tokenize_and_pad(sentences, max_len, vocab_size):
    tokenizer = {word: i + 1 for i, word in enumerate(set(' '.join(sentences).split()))}
    sequences = [[tokenizer.get(word, 0) for word in sentence.split()] for sentence in sentences]
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
    return padded_sequences, tokenizer

# Process input and output data
X_train, input_tokenizer = tokenize_and_pad(input_texts, max_input_length, input_vocab_size)
Y_train, output_tokenizer = tokenize_and_pad(output_texts, max_output_length, output_vocab_size)

# 2. One-hot encoding the target sequences for training
# We need to one-hot encode the target sequences
def one_hot_encode(sequences, vocab_size, max_length):
    # One-hot encode each sequence in Y
    one_hot_encoded = np.zeros((len(sequences), max_length, vocab_size), dtype='float32')
    for i, sequence in enumerate(sequences):
        for t, word_idx in enumerate(sequence):
            if word_idx > 0:  # Skip padding value
                one_hot_encoded[i, t, word_idx] = 1.0
    return one_hot_encoded

Y_train_one_hot = one_hot_encode(Y_train, output_vocab_size, max_output_length)

# 3. Define the Seq2Seq Model

# Input layer for encoder
encoder_inputs = Input(shape=(None,))

# Encoder Embedding Layer
encoder_embedding = Embedding(input_dim=input_vocab_size, output_dim=64)(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Input layer for decoder (shifted output sequence)
decoder_inputs = Input(shape=(None,))

# Decoder Embedding Layer
decoder_embedding = Embedding(input_dim=output_vocab_size, output_dim=64)(decoder_inputs)

# Decoder LSTM Layer
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Dense layer to output probability distribution over vocab for next word prediction
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Final Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# 4. Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# 5. Print the model summary
model.summary()

# 6. Train the model (shift decoder input sequences by 1)
model.fit([X_train, Y_train[:, :-1]], Y_train_one_hot[:, 1:], batch_size=32, epochs=10)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0000e+00 - loss: 1.7264
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1250 - loss: 1.7212
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3750 - loss: 1.7160
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3750 - loss: 1.7106
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.3750 - loss: 1.7049
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.3750 - loss: 1.6988
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.3750 - loss: 1.6922
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.3750 - loss: 1.6851
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

<keras.src.callbacks.history.History at 0x7a3a5473d7e0>

In [13]:
# 6. G How to generate text using a pre-trained transformer model (GPT-2)0

In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 1. Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # This is the base GPT-2 model, you can choose "gpt2-medium", "gpt2-large", or "gpt2-xl" for bigger models.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 2. Set the model to evaluation mode (necessary for generation)
model.eval()

# 3. Encode the input prompt into token IDs
prompt_text = "Once upon a time in a land far, far away"
input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

# 4. Generate text from the prompt
output = model.generate(
    input_ids,
    max_length=100,  # maximum length of the generated text
    num_return_sequences=1,  # how many sequences to generate
    no_repeat_ngram_size=2,  # prevent repetition of the same n-grams
    top_k=50,  # sampling from top k probability mass
    top_p=0.95,  # nucleus sampling
    temperature=0.7,  # randomness in text generation
    pad_token_id=tokenizer.eos_token_id,  # padding token
)

# 5. Decode the generated token IDs into text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# 6. Display the generated text
print(generated_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Once upon a time in a land far, far away, the world was a place of great beauty and great danger. The world of the gods was the land of darkness and darkness. And the darkness of this world, which was far from the light of day, was not the place where the sun and the moon met. It was in the midst of all the worlds, and it was there that the stars and all that were in them met, that they were all in one place.




In [15]:
# 7. 2G How to apply data augmentation for text in NLP0


In [None]:
import random
import nltk
from nltk.corpus import wordnet
from googletrans import Translator
import gensim.downloader as api
from transformers import GPT2LMHeadModel, GPT2Tokenizer

nltk.download('wordnet')

# 1. Synonym Replacement
def synonym_augmentation(sentence):
    words = sentence.split()
    new_sentence = []

    for word in words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            # Pick a synonym randomly from WordNet synsets
            synonym = synonyms[0].lemmas()[0].name()
            new_sentence.append(synonym)
        else:
            new_sentence.append(word)

    return ' '.join(new_sentence)

# 2. Random Insertion
def random_insertion(sentence, n=2):
    words = sentence.split()
    for _ in range(n):
        random_word = random.choice(words)  # Choose a random word
        words.insert(random.randint(0, len(words)), random_word)  # Insert at a random position
    return ' '.join(words)

# 3. Random Swap
def random_swap(sentence, n=2):
    words = sentence.split()
    for _ in range(n):
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

# 4. Back Translation
def back_translation(sentence, src_language='en', dest_language='fr'):
    translator = Translator()
    # Translate to target language (French)
    translated = translator.translate(sentence, src=src_language, dest=dest_language).text
    # Translate back to original language (English)
    back_translated = translator.translate(translated, src=dest_language, dest=src_language).text
    return back_translated

# 5. Word Embedding-based Augmentation (Word2Vec)
word2vec = api.load("word2vec-google-news-300")

def word2vec_augmentation(sentence, n=2):
    words = sentence.split()
    new_sentence = words.copy()

    for _ in range(n):
        word = random.choice(words)
        # Find nearest neighbors (synonyms)
        similar_words = word2vec.most_similar(word, topn=5)
        # Replace with a random similar word
        new_word = random.choice(similar_words)[0]
        new_sentence[words.index(word)] = new_word

    return ' '.join(new_sentence)

# 6. Text Generation using Pre-trained GPT-2
def generate_text_gpt2(prompt_text, model_name="gpt2", max_length=50):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Encode the input text to generate new sequences
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

    # Generate a sequence using GPT-2
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.95, top_k=50)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example Usage
sentence = "The quick brown fox jumps over the lazy dog"

# Apply Synonym Replacement
augmented_synonym = synonym_augmentation(sentence)
print("Synonym Augmentation:", augmented_synonym)

# Apply Random Insertion
augmented_insertion = random_insertion(sentence, 2)
print("Random Insertion:", augmented_insertion)

# Apply Random Swap
augmented_swap = random_swap(sentence, 2)
print("Random Swap:", augmented_swap)

# Apply Back Translation
augmented_back_translation = back_translation(sentence)
print("Back Translation:", augmented_back_translation)

# Apply Word2Vec Embedding-based Augmentation
augmented_word2vec = word2vec_augmentation(sentence, 2)
print("Word2Vec Augmentation:", augmented_word2vec)

# Apply Text Generation using GPT-2
generated_text = generate_text_gpt2("The quick brown fox jumps over", max_length=50)
print("Generated Text using GPT-2:", generated_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...




In [18]:
pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [None]:
# 8. G How can you add an Attention Mechanism to a Seq2Seq model?

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# 1. Attention Layer Definition
class AttentionLayer(layers.Layer):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.attention_dense = layers.Dense(1, activation=None)

    def call(self, encoder_outputs, decoder_hidden_state):
        # Calculate attention scores (dot product of decoder state and encoder hidden states)
        # Shape of decoder_hidden_state: (batch_size, hidden_size)
        # Shape of encoder_outputs: (batch_size, sequence_length, hidden_size)

        query = tf.expand_dims(decoder_hidden_state, 1)  # Expanding to (batch_size, 1, hidden_size)
        context_weights = tf.nn.softmax(self.attention_dense(encoder_outputs), axis=1)

        # Calculate the context vector as a weighted sum of encoder hidden states
        context_vector = tf.reduce_sum(context_weights * encoder_outputs, axis=1)
        return context_vector, context_weights

# 2. Encoder Model Definition
def build_encoder(input_vocab_size, embedding_dim, hidden_size):
    inputs = layers.Input(shape=(None,))
    embedding = layers.Embedding(input_vocab_size, embedding_dim)(inputs)
    encoder_lstm = layers.LSTM(hidden_size, return_state=True, return_sequences=True)
    encoder_outputs, state_h, state_c = encoder_lstm(embedding)

    return tf.keras.Model(inputs, [encoder_outputs, state_h, state_c])

# 3. Decoder Model with Attention
def build_decoder(input_vocab_size, embedding_dim, hidden_size, encoder_outputs):
    inputs = layers.Input(shape=(None,))
    embedding = layers.Embedding(input_vocab_size, embedding_dim)(inputs)
    lstm = layers.LSTM(hidden_size, return_state=True, return_sequences=True)
    decoder_lstm_out, decoder_h, decoder_c = lstm(embedding, initial_state=[state_h, state_c])

    # Attention mechanism
    attention_layer = AttentionLayer(hidden_size)
    context_vector, attention_weights = attention_layer(encoder_outputs, decoder_h)

    # Concatenate context vector with the LSTM output
    concatenated_context = layers.Concatenate(axis=-1)([decoder_lstm_out, context_vector])

    # Use dense layers to generate output tokens
    decoder_dense = layers.Dense(input_vocab_size, activation='softmax')
    decoder_output = decoder_dense(concatenated_context)

    return tf.keras.Model(inputs, [decoder_output, attention_weights])

# 4. Full Seq2Seq Model with Attention
def build_seq2seq_model(input_vocab_size, output_vocab_size, embedding_dim, hidden_size):
    encoder = build_encoder(input_vocab_size, embedding_dim, hidden_size)
    encoder_outputs, state_h, state_c = encoder.output

    decoder = build_decoder(output_vocab_size, embedding_dim, hidden_size, encoder_outputs)

    return encoder, decoder

# 5. Model Compilation and Training
# Example input/output sizes and hidden size
input_vocab_size = 5000  # Example
output_vocab_size = 5000  # Example
embedding_dim = 128  # Example
hidden_size = 256  # Example

encoder, decoder = build_seq2seq_model(input_vocab_size, output_vocab_size, embedding_dim, hidden_size)

# Compile the model (e.g., using categorical crossentropy)
decoder_output = decoder.output
model = tf.keras.Model(encoder.input, decoder_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# You can train the model using model.fit with your input and target sequences
# Example: model.fit([input_sequences, target_sequences], target_output)

# Prediction with the Seq2Seq model using Attention
def predict(input_sequence):
    # Apply the model to generate prediction
    # Initial states of encoder
    encoder_model = tf.keras.Model(encoder.input, [encoder.output[0], encoder.output[1], encoder.output[2]])
    encoder_outputs, state_h, state_c = encoder_model.predict(input_sequence)

    # Prepare decoder model for inference (used during generation)
    decoder_input = layers.Input(shape=(None,))
    decoder_lstm = layers.LSTM(hidden_size, return_state=True, return_sequences=True)
    decoder_lstm_out, state_h, state_c = decoder_lstm(decoder_input, initial_state=[state_h, state_c])
    attention_layer = AttentionLayer(hidden_size)
    context_vector, attention_weights = attention_layer(encoder_outputs, state_h)
    concatenated_context = layers.Concatenate(axis=-1)([decoder_lstm_out, context_vector])
    decoder_dense = layers.Dense(output_vocab_size, activation='softmax')
    decoder_output = decoder_dense(concatenated_context)

    decoder_model = tf.keras.Model(decoder_input, decoder_output)

    # Generate prediction here by feeding input to the decoder model, loop over time steps, etc.
    # ...

