### Load the data

In [None]:
import pandas as pd

import os

# The CSV lives on Google Drive, so Drive has to be mounted before it can be
# read. Mount it here when the mount point is missing so that this cell works
# on a fresh kernel (Restart & Run All) instead of depending on a mount cell
# having been executed by hand beforehand.
drive_root = '/content/drive'
if not os.path.isdir(os.path.join(drive_root, 'MyDrive')):
    from google.colab import drive
    drive.mount(drive_root)

# Load the dataset
file_path = '/content/drive/MyDrive/Data Folder/CNN_Dataset.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [None]:
# Mount Google Drive at /content/drive. The dataset path used by the loading
# cell points under this mount point, so the mount has to be in place before
# the CSV is read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Exploratory data analysis (EDA)

In [None]:
# Dimensions of the full dataset as (rows, columns)
df.shape

(11490, 3)

In [None]:
# Summary statistics for the columns of the full dataset
df.describe()

Unnamed: 0,id,article,highlights
count,11490,11490,11490
unique,11490,11488,11449
top,a6a5491edb0c96c4391b6a8c4504416b3572b3a1,Defiant Nigel Farage today insisted he did not...,This page includes the show Transcript .\nUse ...
freq,1,2,3


### Sample and clean the dataset

In [None]:
import numpy as np

# Set a random seed for reproducibility of any later NumPy-based randomness
np.random.seed(42)

# Sample up to 5,000 rows from the dataset. The seed is also passed explicitly
# so the draw does not depend on the state of the global NumPy RNG (which
# changes if cells are re-run or reordered), and the sample size is capped at
# the number of available rows so a smaller dataset does not raise.
sampled_df = df.sample(n=min(5000, len(df)), random_state=42)

# Preprocessing function to clean the text data
def preprocess_text(text):
    """Normalise one raw text field before tokenisation.

    Args:
        text: The raw article or highlight. Expected to be a ``str``; any
            other value (for example ``None`` or the float ``NaN`` that pandas
            uses for an empty CSV cell) is treated as missing text.

    Returns:
        The lower-cased text with leading and trailing whitespace removed, or
        an empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Calling .lower() on a non-string would raise AttributeError and
        # abort the whole .apply(); treat such cells as empty text instead.
        return ""
    return text.lower().strip()

# Run the shared cleaning function over both text columns of the sample
for text_column in ('article', 'highlights'):
    sampled_df[text_column] = sampled_df[text_column].map(preprocess_text)

# Preview the cleaned sample
sampled_df.head()


Unnamed: 0,id,article,highlights
1516,f00ae3c3929d829cd469ba4f229cc613b0766203,comedian jenny eclair travelled with her other...,the comedian stayed with flavours who offer a ...
1393,9e451f79499e5c784222b3f237c6ae4829849d79,a woman of arab and jewish descent who was str...,the federal government will give shoshana hebs...
10560,dae58055bd50598b93a230aa3a58e0d2f519b536,world no 1 novak djokovic has apologised to th...,novak djokovic beat andy murray 7-6 4-6 6-0 in...
11457,c05bda9b387ec8ae43803170b6f59b4b82505db9,(cnn)isis on wednesday released more than 200 ...,most of those released were women and children...
647,5c7493c6f28cfd58aa7b5f0e486e611307b4126d,hillary clinton’s security detail arrived at a...,"second modified, armored van spotted near des ..."


In [None]:
# Confirm the size of the sample as (rows, columns)
sampled_df.shape

(5000, 3)

### Preprocess the data for the model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer and padding settings: vocabulary cap and fixed sequence length
max_vocab_size = 50000
max_sequence_length = 300

# Pull both text columns out as plain Python lists once; they are reused for
# fitting the tokenizer and for the conversion to integer sequences.
article_texts = sampled_df['article'].tolist()
highlight_texts = sampled_df['highlights'].tolist()

# One tokenizer shared by articles and highlights, fitted on both corpora
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(article_texts + highlight_texts)

# Text -> lists of integer token ids
article_sequences = tokenizer.texts_to_sequences(article_texts)
highlights_sequences = tokenizer.texts_to_sequences(highlight_texts)

# Bring every sequence to exactly max_sequence_length ids, padding and
# truncating at the end of the sequence.
pad_options = dict(maxlen=max_sequence_length, padding='post', truncating='post')
article_padded = pad_sequences(article_sequences, **pad_options)
highlights_padded = pad_sequences(highlights_sequences, **pad_options)

# Model hyper-parameters. The "+ 1" accounts for id 0, which the tokenizer
# never assigns to a word.
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)
embedding_dim = 256
lstm_units = 512

# Report the shapes of the two padded id matrices
print(article_padded.shape, highlights_padded.shape)


(5000, 300) (5000, 300)


### Build the Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, TimeDistributed
from tensorflow.keras.models import Model

# Encoder: embeds the article ids and summarises them with a bidirectional
# LSTM whose final hidden/cell states seed the decoder.
encoder_inputs = Input(shape=(max_sequence_length,))
# mask_zero=True marks id 0 (the value used for post-padding) as masked, so
# the recurrent layers skip padded timesteps instead of treating them as
# real tokens.
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(lstm_units, return_state=True, dropout=0.2, recurrent_dropout=0.2))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
# Join the forward and backward states; the result has 2 * lstm_units
# features, which is why the decoder LSTM below is twice as wide.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder: reads the highlight ids and predicts a distribution over the
# vocabulary at every timestep, starting from the encoder states.
decoder_inputs = Input(shape=(max_sequence_length,))
# Masking here is what keeps padding out of the training signal: the mask is
# propagated through the LSTM and TimeDistributed layers to the model output,
# where Keras uses it to exclude padded timesteps from the loss. Without it
# the loss is dominated by the trivially predictable padding positions.
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model: (article ids, highlight ids) -> per-step vocabulary softmax
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model. The sparse loss takes integer token ids as targets, so
# the targets do not need to be one-hot encoded.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Summarize the model
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 300)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 300, 256)             1280000   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 input_2 (InputLayer)        [(None, 300)]                0         []                            
                                                                                                  
 bidirectional (Bidirection  [(None, 1024),               3149824   ['embedding[0][0]']       

### Model Training

In [None]:
# Build the decoder targets: the label for step t is the highlight token at
# step t + 1, so drop the first column and append a column of zeros at the end.
trailing_zeros = np.zeros((highlights_padded.shape[0], 1), dtype=highlights_padded.dtype)
decoder_target_data = np.concatenate([highlights_padded[:, 1:], trailing_zeros], axis=1)

# Add a trailing axis of size 1 to the integer targets before fitting
decoder_targets = decoder_target_data[..., np.newaxis]

# Fit on (article ids, highlight ids) -> shifted highlight ids, keeping 20% of
# the rows aside for validation.
history = model.fit(
    x=[article_padded, highlights_padded],
    y=decoder_targets,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Persist the trained model to disk
model.save('seq2seq_model.h5')


Epoch 1/10