In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:

file_path = 'test.csv'
df = pd.read_csv(file_path)

print(df.head())

                                         id  \
0  92c514c913c0bdfe25341af9fd72b29db544099b   
1  2003841c7dc0e7c5b1a248f9cd536d727f27a45a   
2  91b7d2311527f5c2b63a65ca98d21d9c92485149   
3  caabf9cbdf96eb1410295a673e953d304391bfbb   
4  3da746a7d9afcaa659088c8366ef6347fe6b53ea   

                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                          highlights  
0  Experts question if  packed out planes are put...  
1  Drunk teenage boy climbed into lion enclosure ...  
2  Nottingham Forest are close to extending Dougi...  
3  Fiorentina goalkeeper Neto has been linked wit...  
4  Tell-all interview with the reality TV star, 6...  


In [12]:
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(df['article'])
text_sequences = text_tokenizer.texts_to_sequences(df['article'])

summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(df['highlights'])
summary_sequences = summary_tokenizer.texts_to_sequences(df['highlights'])

text_lengths = [len(seq) for seq in text_sequences]
summary_lengths = [len(seq) for seq in summary_sequences]

max_text_len = int(np.percentile(text_lengths, 95))
max_summary_len = int(np.percentile(summary_lengths, 95))

text_padded = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')
summary_padded = pad_sequences(summary_sequences, maxlen=max_summary_len, padding='post')


text_vocab_size = len(text_tokenizer.word_index) + 1
summary_vocab_size = len(summary_tokenizer.word_index) + 1



In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Concatenate, Attention

#encoder
encoder_inputs = Input(shape=(max_text_len,))
encoder_embedding = Embedding(text_vocab_size, 64)(encoder_inputs)
encoder_lstm = LSTM(64, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

#decoder
decoder_inputs = Input(shape=(max_summary_len - 1,))
decoder_embedding = Embedding(summary_vocab_size, 64)(decoder_inputs)
decoder_lstm = LSTM(64, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# attentione layer
attention = Attention()([decoder_outputs, encoder_outputs])
context_vector = Concatenate(axis=-1)([decoder_outputs, attention])

# the final layer (output layer)
output = TimeDistributed(Dense(summary_vocab_size, activation='softmax'))(context_vector)

model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

model.summary()


Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 1404)]               0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 89)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 1404, 64)             8794816   ['input_5[0][0]']             
                                                                                                  
 embedding_3 (Embedding)     (None, 89, 64)               2798592   ['input_6[0][0]']             
                                                                                            

In [14]:
# decoder with input and target
summary_padded_input = summary_padded[:, :-1]
summary_padded_target = summary_padded[:, 1:]

model.fit(
    [text_padded, summary_padded_input],
    summary_padded_target,
    epochs=3,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7af217acea70>

In [15]:
encoder_model = Model(encoder_inputs, [state_h, state_c])

decoder_state_input_h = Input(shape=(64,))
decoder_state_input_c = Input(shape=(64,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = TimeDistributed(Dense(summary_vocab_size, activation='softmax'))(decoder_lstm_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# summaty predictions
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = summary_tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = summary_tokenizer.index_word.get(sampled_token_index, '')
        decoded_sentence += ' ' + sampled_word

        if sampled_word == 'end' or len(decoded_sentence.split()) > max_summary_len:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()

# actual predicitions show
for seq_index in range(2):  #small sample for checking
    input_seq = text_padded[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('Input:', df['article'][seq_index])
    print('Predicted summary:', decoded_sentence)
    print('Actual summary:', df['highlights'][seq_index])
    print()

Input: Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for

NameError: name 'PegasusTokenizer' is not defined