In [8]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping
import pickle



In [11]:
# Load the cleaned dataset
data = pd.read_csv(r"C:\Text Summarization\CleanedDB.csv")

# Display the first few rows to ensure it's loaded correctly
print(data.head())

                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  associated press publish est october update es...   
1  cnn ralph mata internal affairs lieutenant mia...   
2  drunk driver kill young woman headon crash che...   
3  cnn breezy sweep pen president vladimir putin ...   
4  fleetwood team still record sky bet league one...   

                                          highlights  
0  bishop john folda north dakota taking time dia...  
1  criminal complaint cop use role help cocaine t...  
2  craig ecclestontodd drunk least three pint dri...  
3  nina dos santos say europe must ready accept s...  
4  fleetwood top league one win scunthorpe peterb...  


In [12]:
articles = data['article'].astype(str).tolist()
highlights = data['highlights'].astype(str).tolist()

In [13]:

# Tokenize and pad the articles (input sequences)
max_len_article = 400
max_len_highlight = 100
max_words = 10000

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(articles + highlights)


In [14]:
# Convert text to sequences
article_sequences = tokenizer.texts_to_sequences(articles)
highlight_sequences = tokenizer.texts_to_sequences(highlights)

In [15]:

# Pad sequences
X = pad_sequences(article_sequences, maxlen=max_len_article, padding='post')
y = pad_sequences(highlight_sequences, maxlen=max_len_highlight, padding='post')

In [16]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Define the Seq2Seq model
embedding_dim = 64
latent_dim = 128

# Encoder
encoder_inputs = Input(shape=(max_len_article,))
encoder_embedding = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len_article)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_len_highlight,))
decoder_embedding = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len_highlight)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(max_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [18]:
model.summary()

In [19]:
# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [20]:

# Prepare decoder target data
y_train_shifted = np.zeros_like(y_train)
y_train_shifted[:, :-1] = y_train[:, 1:]

In [21]:
# Train the model
history = model.fit([X_train, y_train], y_train_shifted, epochs=5, batch_size=32, validation_data=([X_test, y_test], y_test), callbacks=[early_stopping])

Epoch 1/5
[1m7178/7178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5408s[0m 748ms/step - accuracy: 0.7305 - loss: 2.3186 - val_accuracy: 0.7285 - val_loss: 2.0132
Epoch 2/5
[1m7178/7178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5325s[0m 742ms/step - accuracy: 0.7414 - loss: 1.8836 - val_accuracy: 0.7287 - val_loss: 1.9290
Epoch 3/5
[1m7178/7178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5366s[0m 748ms/step - accuracy: 0.7456 - loss: 1.7771 - val_accuracy: 0.7279 - val_loss: 1.9043
Epoch 4/5
[1m7178/7178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5400s[0m 752ms/step - accuracy: 0.7476 - loss: 1.7267 - val_accuracy: 0.7287 - val_loss: 1.9021
Epoch 5/5
[1m7178/7178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5422s[0m 755ms/step - accuracy: 0.7494 - loss: 1.6930 - val_accuracy: 0.7282 - val_loss: 1.9077


In [22]:
# Evaluate the model
loss, accuracy = model.evaluate([X_test, y_test], y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m1795/1795[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m863s[0m 481ms/step - accuracy: 0.7282 - loss: 1.9067
Test Loss: 1.9021
Test Accuracy: 72.87%


In [30]:
from tensorflow.keras.models import save_model

# Save the tokenizer using pickle
with open(r'C:\Text Summarization\tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Save the Keras model in HDF5 format
model.save(r'C:\Text Summarization\model.h5')



In [31]:
from tensorflow.keras.models import load_model

# Load the Keras model
model = load_model(r'C:\Text Summarization\model.h5')



In [32]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle

# Load the tokenizer
with open(r'C:\Text Summarization\tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

# Load the Keras model
model = load_model(r'C:\Text Summarization\model.h5')

# Set the max lengths for the input and output sequences
max_len_article = 400
max_len_highlight = 50

# Define the function to generate summaries
def generate_summary(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len_article, padding='post')
    
    # Encode the input sequence
    encoder_model = model.get_layer('encoder')
    decoder_model = model.get_layer('decoder')
    
    encoder_outputs, state_h, state_c = encoder_model.predict(padded_sequence)
    encoder_states = [state_h, state_c]

    # Initialize the decoder input with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('<start>', 0)  # Use 0 if '<start>' not in index
    
    decoded_sentence = ''
    stop_condition = False
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + encoder_states)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence += ' ' + sampled_word
        
        # Check if the prediction is the end token or the sentence length exceeds max length
        if sampled_word == '<end>' or len(decoded_sentence) > max_len_highlight:
            stop_condition = True
        
        # Update the target sequence and encoder states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        encoder_states = [h, c]
    
    return decoded_sentence.strip()




