In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
import pickle


In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [6]:
df = pd.read_csv("news_summary.csv", encoding="latin1", encoding_errors="ignore")
df = df[['text','ctext']].dropna()
df.columns = ['summary', 'text']
df.head()


Unnamed: 0,summary,text
0,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [7]:
df.shape


(4396, 2)

In [8]:
texts = df['text'].values
summaries = df['summary'].values


In [9]:
print(texts[0])
print(summaries[0])


The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Daman

In [14]:
num_words = 8000
max_text_len = 100
max_summary_len = 20

tokenizer_text = Tokenizer(num_words=num_words)
tokenizer_text.fit_on_texts(texts)

tokenizer_sum = Tokenizer(num_words=num_words)
tokenizer_sum.fit_on_texts(summaries)


In [13]:
pickle.dump(tokenizer_text, open("tokenizer_text.pkl", "wb"))
pickle.dump(tokenizer_sum, open("tokenizer_sum.pkl", "wb"))


In [15]:
# Convert texts and summaries to sequences
text_seq = tokenizer_text.texts_to_sequences(texts)
summary_seq = tokenizer_sum.texts_to_sequences(summaries)

# Padding sequences to fixed length
text_seq = pad_sequences(text_seq, maxlen=max_text_len, padding='post')
summary_seq = pad_sequences(summary_seq, maxlen=max_summary_len, padding='post')

print("Text sequence shape:", text_seq.shape)
print("Summary sequence shape:", summary_seq.shape)


Text sequence shape: (4396, 100)
Summary sequence shape: (4396, 20)


In [17]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention, Concatenate
from tensorflow.keras.models import Model

latent_dim = 256  # LSTM hidden size

# -----------------------
# 1. ENCODER
# -----------------------
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(num_words, 128)(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# -----------------------
# 2. DECODER
# -----------------------
decoder_inputs = Input(shape=(max_summary_len,))
dec_emb = Embedding(num_words, 128)(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# -----------------------
# 3. ATTENTION
# -----------------------
attn_layer = Attention()
attn_output = attn_layer([decoder_outputs, encoder_outputs])

# FIXED: Use Keras Concatenate instead of tf.concat
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attn_output])

# -----------------------
# 4. FINAL OUTPUT LAYER
# -----------------------
dense = Dense(num_words, activation='softmax')
decoder_outputs_final = dense(decoder_concat)

# -----------------------
# 5. MODEL
# -----------------------
model = Model([encoder_inputs, decoder_inputs], decoder_outputs_final)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()


In [18]:
# Decoder target must be one step ahead
decoder_target = np.expand_dims(summary_seq, -1)

print("Decoder target shape:", decoder_target.shape)

# -----------------------
# TRAIN THE MODEL
# -----------------------
history = model.fit(
    [text_seq, summary_seq],   # encoder input, decoder input
    decoder_target,            # decoder output
    epochs=3,                  # Increase to 10+ for better results
    batch_size=64,
    validation_split=0.1
)

# -----------------------
# SAVE MODEL AFTER TRAINING
# -----------------------
model.save("summarizer_model.h5")

print("Model saved successfully!")


Decoder target shape: (4396, 20, 1)
Epoch 1/3
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 2s/step - loss: 7.2903 - val_loss: 6.5552
Epoch 2/3
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 2s/step - loss: 6.0896 - val_loss: 5.6644
Epoch 3/3
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 2s/step - loss: 5.0757 - val_loss: 4.6781




Model saved successfully!


In [19]:
# CELL 9 — Inference function + tests

import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load model & tokenizers (if you restarted kernel)
# If model and tokenizers are already in memory you can skip re-loading
try:
    model  # if model present in memory
except NameError:
    model = tf.keras.models.load_model("summarizer_model.h5", compile=False)

try:
    tokenizer_text
    tokenizer_sum
except NameError:
    tokenizer_text = pickle.load(open("tokenizer_text.pkl", "rb"))
    tokenizer_sum  = pickle.load(open("tokenizer_sum.pkl", "rb"))

# Constants used during training
max_text_len = 100
max_summary_len = 20
num_words = 8000  # same as training

def decode_summary_from_seq(target_seq):
    """
    Convert an array of token ids (1D) into a cleaned string using tokenizer_sum.index_word.
    """
    words = []
    for t in target_seq:
        t = int(t)
        if t == 0:
            continue
        w = tokenizer_sum.index_word.get(t, "")
        if not w:
            continue
        words.append(w)
    # Join and cleanup spacing and special tokens (if any)
    return " ".join(words).strip()

def summarize_text_greedy(text):
    """
    Greedy decoding using the trained model: fills a target sequence step-by-step
    by repeatedly predicting the next token and placing it into the target_seq.
    This matches how we trained (teacher-forcing with full target sequences).
    """
    # Preprocess input
    seq = tokenizer_text.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=max_text_len, padding='post')

    # Prepare empty target sequence (all zeros) and fill it progressively
    target_seq = np.zeros((1, max_summary_len), dtype='int32')

    for i in range(max_summary_len):
        # model.predict returns a (1, max_summary_len, num_words) distribution.
        preds = model.predict([seq, target_seq], verbose=0)
        # pick the token for position i
        token_i = np.argmax(preds[0, i])
        target_seq[0, i] = token_i

        # Optional early stop if token maps to empty or padding
        # (depends on whether you used special tokens during training)
        # if token_i == 0:
        #     break

    # Convert token ids to words
    summary = decode_summary_from_seq(target_seq[0])
    return summary

# Quick tests: run for a few samples from your dataset (if available)
print("TESTING: sample outputs\n-------------------------")
for idx in range(3):
    sample_text = texts[idx] if 'texts' in globals() else "Your test article paragraph goes here."
    print(f"\nOriginal [{idx}]:\n{sample_text[:400]}...\n")
    print("Summary:")
    print(summarize_text_greedy(sample_text))
    print("-" * 60)


TESTING: sample outputs
-------------------------

Original [0]:
The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been...

Summary:
security there there there there there there there there there there president there president president president president president president president
------------------------------------------------------------

Original [1]:
From her special numbers to TV?appearances, Bollywood actor Malaika Arora Khan has managed to carve her own identity. The actor, who made her debut in the Hindi film industry with the blockbuster debut opposite Shah Rukh Khan in Chaiyya Chaiyya from Dil Se (1998), is still remembered for t