<a href="https://colab.research.google.com/github/Aravind8281/Natural_language_Processing/blob/main/Tensorflow_NLP_java.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embedding using tensorflow

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(corpus)
input_sequences = []
for sequence in sequences:
    for i in range(1, len(sequence)):
        n_gram_sequence = sequence[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max(len(seq) for seq in input_sequences)
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
X = padded_sequences[:, :-1]
y = tf.keras.utils.to_categorical(padded_sequences[:, -1], num_classes=total_words)

model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=50, input_length=max_sequence_length-1))
model.add(Flatten())
model.add(Dense(total_words, activation="softmax"))
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=50, verbose=2)

word_embeddings = model.layers[0].get_weights()[0]
word_index = tokenizer.word_index

for word, index in word_index.items():
    if index < 10:
        print(f"{word}: {word_embeddings[index]}")


Epoch 1/50
1/1 - 0s - loss: 2.3080 - accuracy: 0.0000e+00 - 396ms/epoch - 396ms/step
Epoch 2/50
1/1 - 0s - loss: 2.2977 - accuracy: 0.0000e+00 - 5ms/epoch - 5ms/step
Epoch 3/50
1/1 - 0s - loss: 2.2875 - accuracy: 0.1667 - 5ms/epoch - 5ms/step
Epoch 4/50
1/1 - 0s - loss: 2.2773 - accuracy: 0.1667 - 6ms/epoch - 6ms/step
Epoch 5/50
1/1 - 0s - loss: 2.2672 - accuracy: 0.2778 - 6ms/epoch - 6ms/step
Epoch 6/50
1/1 - 0s - loss: 2.2570 - accuracy: 0.4444 - 6ms/epoch - 6ms/step
Epoch 7/50
1/1 - 0s - loss: 2.2469 - accuracy: 0.4444 - 6ms/epoch - 6ms/step
Epoch 8/50
1/1 - 0s - loss: 2.2368 - accuracy: 0.4444 - 5ms/epoch - 5ms/step
Epoch 9/50
1/1 - 0s - loss: 2.2266 - accuracy: 0.5556 - 5ms/epoch - 5ms/step
Epoch 10/50
1/1 - 0s - loss: 2.2163 - accuracy: 0.6667 - 6ms/epoch - 6ms/step
Epoch 11/50
1/1 - 0s - loss: 2.2060 - accuracy: 0.7222 - 5ms/epoch - 5ms/step
Epoch 12/50
1/1 - 0s - loss: 2.1957 - accuracy: 0.6667 - 7ms/epoch - 7ms/step
Epoch 13/50
1/1 - 0s - loss: 2.1852 - accuracy: 0.6667 - 6ms/

# Text Generation

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sample text data
corpus = [
    "Hello, how are you?",
    "I am doing well.",
    "What about you?",
    "I'm just a computer program.",
    "Nice to meet you!",
]

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

# Create input sequences and labels
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Create predictors and labels
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the LSTM model
model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=1)

# Generate text
seed_text = "I am"
next_words = 10

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print("Generated Text:", seed_text)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [21]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

# Define the Seq2Seq model
latent_dim = 256

encoder_inputs = Input(shape=(None,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(embedding_layer)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(embedding_layer, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)



NameError: name 'vocab_size' is not defined

In [22]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, MultiHeadAttention, Embedding, Dense

class SelfAttention(Layer):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.query = Dense(embed_size)
        self.key = Dense(embed_size)
        self.value = Dense(embed_size)
        self.combine_heads = Dense(embed_size)

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]

        query = self.query(inputs)  # (batch_size, seq_len, embed_size)
        key = self.key(inputs)
        value = self.value(inputs)

        query = tf.reshape(
            query, (batch_size, -1, self.heads, self.head_dim)
        )  # (batch_size, seq_len, heads, head_dim)
        key = tf.reshape(key, (batch_size, -1, self.heads, self.head_dim))
        value = tf.reshape(value, (batch_size, -1, self.heads, self.head_dim))

        query = tf.transpose(query, perm=[0, 2, 1, 3])  # (batch_size, heads, seq_len, head_dim)
        key = tf.transpose(key, perm=[0, 2, 1, 3])
        value = tf.transpose(value, perm=[0, 2, 1, 3])

        scores = tf.matmul(query, key, transpose_b=True)  # (batch_size, heads, seq_len, seq_len)
        scores = scores / tf.math.sqrt(tf.cast(self.head_dim, dtype=tf.float32))

        attention = tf.nn.softmax(scores, axis=3)

        out = tf.matmul(attention, value)  # (batch_size, heads, seq_len, head_dim)
        out = tf.transpose(out, perm=[0, 2, 1, 3])
        out = tf.reshape(out, (batch_size, -1, self.embed_size))

        out = self.combine_heads(out)
        return out

class TransformerBlock(Layer):
    def __init__(self, embed_size, heads, forward_expansion, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.attention = Self


In [23]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model

def create_translation_model(vocab_size, embedding_dim, hidden_units):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(hidden_units, return_state=True)(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Attention mechanism
    attention_layer = Attention()([decoder_outputs, encoder_outputs])

    # Concatenate attention output and decoder LSTM output
    attended_decoder = tf.concat([decoder_outputs, attention_layer], axis=-1)

    # Dense layer for output predictions
    decoder_dense = Dense(vocab_size, activation="softmax")
    decoder_outputs = decoder_dense(attended_decoder)

    # Model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

# Example usage:
vocab_size = 10000  # Adjust based on your dataset
embedding_dim = 256
hidden_units = 512

translation_model = create_translation_model(vocab_size, embedding_dim, hidden_units)
