In [1]:
!pip install tensorflow




In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Dot

import re
import string


In [3]:
# Sample dataset (Replace with actual dataset)
data = {
    "text": [
        "The stock market saw a significant rise today after strong earnings reports.",
        "The new iPhone was released with many new features and improvements.",
        "A major storm is expected to hit the east coast this weekend."
    ],
    "summary": [
        "Stock market rises after earnings.",
        "New iPhone has more features.",
        "Storm expected on east coast."
    ]
}

df = pd.DataFrame(data)

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()
    return text

df["text"] = df["text"].apply(clean_text)
df["summary"] = df["summary"].apply(clean_text)

# Tokenize the text
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(df["text"])
text_sequences = text_tokenizer.texts_to_sequences(df["text"])
text_vocab_size = len(text_tokenizer.word_index) + 1

# Tokenize the summaries
summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df["summary"])
summary_sequences = summary_tokenizer.texts_to_sequences(df["summary"])
summary_vocab_size = len(summary_tokenizer.word_index) + 1

# Set max sequence lengths
max_text_len = 15
max_summary_len = 8

# Pad sequences
encoder_input_data = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')
decoder_input_data = pad_sequences(summary_sequences, maxlen=max_summary_len, padding='post')


In [4]:
embedding_dim = 128
latent_dim = 256  # LSTM hidden size

# Encoder
encoder_inputs = Input(shape=(max_text_len,))
encoder_embedding = Embedding(text_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


In [5]:
# Attention Layer using Dot Product Attention
attention = Dot(axes=[2, 2])([encoder_outputs, encoder_outputs])
attention = Dense(latent_dim, activation="tanh")(attention)


In [6]:
# Decoder
decoder_inputs = Input(shape=(max_summary_len,))
decoder_embedding = Embedding(summary_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Apply Attention
attention_output = Dot(axes=[2, 2])([decoder_outputs, encoder_outputs])
decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, attention_output])

# Output Layer
decoder_dense = Dense(summary_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

# Define Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [7]:
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]  # Shift left
decoder_target_data[:, -1] = 0  # Padding at the end


In [8]:
# Train the Seq2Seq model
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=32,
    epochs=50,
    validation_split=0.2
)


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - accuracy: 0.1250 - loss: 2.7726 - val_accuracy: 0.5000 - val_loss: 2.7561
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step - accuracy: 0.5625 - loss: 2.7391 - val_accuracy: 0.5000 - val_loss: 2.7368
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - accuracy: 0.5625 - loss: 2.7020 - val_accuracy: 0.5000 - val_loss: 2.7130
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step - accuracy: 0.5625 - loss: 2.6558 - val_accuracy: 0.5000 - val_loss: 2.6808
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294ms/step - accuracy: 0.5625 - loss: 2.5931 - val_accuracy: 0.5000 - val_loss: 2.6342
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - accuracy: 0.5000 - loss: 2.5020 - val_accuracy: 0.5000 - val_loss: 2.5624
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7861d151e310>

In [10]:
# Define Encoder Model for Inference
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

# Define Decoder Model for Inference
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_states_input = Input(shape=(max_text_len, latent_dim))

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

attention_out = Dot(axes=[2, 2])([decoder_outputs, decoder_hidden_states_input])
decoder_combined_context = Concatenate(axis=-1)([decoder_outputs, attention_out])

decoder_outputs = decoder_dense(decoder_combined_context)

decoder_model = Model(
    [decoder_inputs, decoder_hidden_states_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs, state_h, state_c]
)

def summarize_text(input_text):
    input_text = clean_text(input_text)
    input_seq = pad_sequences(text_tokenizer.texts_to_sequences([input_text]), maxlen=max_text_len, padding='post')

    # Encode input
    encoder_outs, state_h, state_c = encoder_model.predict(input_seq)
    states = [state_h, state_c]

    # Initialize decoder sequence with start token (Assuming index 1 is <start>)
    target_seq = np.zeros((1, max_summary_len))
    target_seq[0, 0] = 1

    decoded_sentence = ""
    for i in range(max_summary_len - 1):
        output_tokens, h, c = decoder_model.predict([target_seq, encoder_outs] + states)
        predicted_index = np.argmax(output_tokens[0, i, :])
        if predicted_index == 0:
            break  # Stop at padding index

        word = summary_tokenizer.index_word.get(predicted_index, "")
        decoded_sentence += " " + word

        target_seq[0, i + 1] = predicted_index
        states = [h, c]

    return decoded_sentence.strip()

# Test summarization
test_text = "The stock market saw a huge increase due to great earnings."
summary = summarize_text(test_text)
print("Generated Summary:", summary)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Generated Summary: rises
