https://www.kaggle.com/sandeepbhogaraju/text-summarization-with-seq2seq-model/notebook

# Data Preprocessing

## Loading Datasets

In [None]:
import pandas as pd

# Load both news-summary CSV files; they are read with the same single-byte encoding.
summary, summary_more = (
    pd.read_csv(csv_path, encoding='iso-8859-1')
    for csv_path in ('data/news_summary.csv', 'data/news_summary_more.csv')
)

In [None]:
# Work on copies of the leading columns so the loaded frames stay untouched.
pre1 = summary.iloc[:, 0:6].copy()
pre2 = summary_more.iloc[:, 0:2].copy()

# Append the 'ctext' column to 'text'. `na_rep=''` matters: without it,
# str.cat returns NaN for every row in which either column is missing, which
# throws away the text that IS present and later turns into the literal
# string "nan" during cleaning.
pre1['text'] = pre1['text'].str.cat(pre1['ctext'], sep=" ", na_rep='')

In [None]:
# Stack both sources into one frame: article text alongside its headline
# (the headline is what the model will learn to produce as the summary).
pre = pd.DataFrame({
    'text': pd.concat([pre1['text'], pre2['text']], ignore_index=True),
    'summary': pd.concat([pre1['headlines'], pre2['headlines']], ignore_index=True),
})
pre

Unnamed: 0,text,summary
0,The Administration of Union Territory Daman an...,Daman & Diu revokes mandatory Rakshabandhan in...
1,Malaika Arora slammed an Instagram user who tr...,Malaika slams user who trolled her for 'divorc...
2,The Indira Gandhi Institute of Medical Science...,'Virgin' now corrected to 'Unmarried' in IGIMS...
3,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Aaj aapne pakad liya: LeT man Dujana before be...
4,Hotels in Maharashtra will train their staff t...,Hotel staff to get training to spot signs of s...
...,...,...
102910,A CRPF jawan was on Tuesday axed to death with...,CRPF jawan axed to death by Maoists in Chhatti...
102911,"'Uff Yeh', the first song from the Sonakshi Si...",First song from Sonakshi Sinha's 'Noor' titled...
102912,"According to reports, a new version of the 199...",'The Matrix' film to get a reboot: Reports
102913,A new music video shows rapper Snoop Dogg aimi...,Snoop Dogg aims gun at clown dressed as Trump ...


## Data Cleaning

In [None]:
import re

def text_strip(column):
    """Lazily clean every entry of ``column`` and yield the normalised strings.

    Each entry is converted with ``str()``, lower-cased, and passed through an
    ordered series of regex substitutions that replace noise with a space:
    tab/CR/LF characters, runs of ``_ - ~ + .``, assorted punctuation,
    ``mailto:`` prefixes, literal ``\\x9N`` sequences, punctuation at the end
    of a word, and single characters hanging between whitespace.  Ticket-like
    ids become the placeholders ``inc_num`` / ``cm_num``.  Every URL is then
    reduced to its own host part and whitespace runs collapse to one space.

    Args:
        column: Any iterable of values (for example a pandas Series). Values
            that are not strings are stringified, so ``None`` becomes "none".

    Yields:
        str: The cleaned, lower-case text of each entry, in input order.
    """
    # Patterns are compiled once instead of once per row. Order is significant.
    lone_char = re.compile(r"(\s+.\s+)")
    whitespace_run = re.compile(r"(\s+)")
    # https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net (group 3 is the host)
    url = re.compile(r'((https*:\/*)([^\/\s]+))(.[^\s]+)')

    before_url = [
        # Escape characters
        (re.compile(r"(\t)"), ' '),
        (re.compile(r"(\r)"), ' '),
        (re.compile(r"(\n)"), ' '),
        # Two or more consecutive _ - ~ + .
        (re.compile(r"(__+)"), ' '),
        (re.compile(r"(--+)"), ' '),
        (re.compile(r"(~~+)"), ' '),
        (re.compile(r"(\+\++)"), ' '),
        (re.compile(r"(\.\.+)"), ' '),
        # Individual punctuation characters: <>()|&©ø[]'",;?~*!
        (re.compile(r"[<>()|&©ø\[\]\'\",;?~*!]"), ' '),
        (re.compile(r"(mailto:)"), ' '),
        # A literal backslash followed by x9<digit>
        (re.compile(r"(\\x9\d)"), ' '),
        # Ticket ids. The placeholders are lower-case because the whole row is
        # lower-cased anyway (the original upper-case tokens never survived).
        (re.compile(r"([iI][nN][cC]\d+)"), 'inc_num'),
        (re.compile(r"([cC][mM]\d+)|([cC][hH][gG]\d+)"), 'cm_num'),
        # . - : at the end of a word (not between characters)
        (re.compile(r"(\.\s+)"), ' '),
        (re.compile(r"(\-\s+)"), ' '),
        (re.compile(r"(\:\s+)"), ' '),
        # Any single character hanging between whitespace
        (lone_char, ' '),
    ]

    for row in column:
        # Lower-casing once up front is equivalent to lower-casing after every
        # substitution, because all replacement strings are already lower-case.
        row = str(row).lower()

        for pattern, replacement in before_url:
            row = pattern.sub(replacement, row)

        # Replace each URL by ITS OWN host. A callable replacement is used so
        # that backslashes inside a host are never read as escape sequences;
        # rows without a URL are simply left unchanged (no exception needed).
        row = url.sub(lambda match: match.group(3), row)

        row = whitespace_run.sub(' ', row)  # Collapse multiple spaces

        # Should always be last: drop single characters exposed by the steps above.
        row = lone_char.sub(' ', row)

        yield row

In [None]:
# text_strip is a generator function, so nothing is cleaned here yet; the rows
# are processed when these generators are consumed.
# NOTE(review): `summary` previously held the DataFrame read from
# news_summary.csv; from here on it refers to the cleaned-headline generator.
text, summary = text_strip(pre['text']), text_strip(pre['summary'])

In [None]:
import spacy
from time import time
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# NOTE(review): only str(doc) is kept below, so none of the pipeline's
# annotations are used; this pass appears to mainly consume the text_strip
# generators into lists — confirm whether the spaCy step is still needed.
#
# `n_threads` has been dropped from the nlp.pipe calls: the argument was
# deprecated and ignored from spaCy 2.1 on and is rejected by spaCy 3.

# Text Cleaning
t = time()

text = [str(doc) for doc in nlp.pipe(text, batch_size=5000)]

print(f'Time to clean text: {round((time() - t) / 60, 2)} mins')

# Summary Cleaning
t = time()

summary = [str(doc) for doc in nlp.pipe(summary, batch_size=5000)]

print(f'Time to clean summary: {round((time() - t) / 60, 2)} mins')

Time to clean text: 4.0 mins
Time to clean summary: 0.76 mins


In [None]:
# Spot-check one cleaned article.
text[1]

'malaika arora slammed an instagram user who trolled her for divorcing rich man and having fun with the alimony her life now is all about wearing short clothes going to gym or salon enjoying vacation the user commented malaika responded you certainly got to get your damn facts right before spewing sh on me when you know nothing about me from her special numbers to tv appearances bollywood actor malaika arora khan has managed to carve her own identity the actor who made her debut in the hindi film industry with the blockbuster debut opposite shah rukh khan in chaiyya chaiyya from dil se 1998 is still remembered for the song however for trolls she is woman first and what matters right now is that she divorced rich man on wednesday malaika arora shared gorgeous picture of herself on instagram and follower decided to troll her for using her alumni read alimony money to wear short clothes and going to gym or salon little did he/she know that the munni badnam star would reply with the perfec

In [None]:
# Spot-check the cleaned headline belonging to the same row.
summary[1]

'malaika slams user who trolled her for divorcing rich man '

In [None]:
import numpy as np
# Turn the cleaned Python lists into numpy arrays.
cleaned_text, cleaned_summary = np.array(text), np.array(summary)

In [None]:
cleaned = pd.DataFrame({'text': cleaned_text, 'summary': cleaned_summary})

# Wrap every summary in explicit start/end marker tokens; the decoder is
# seeded with 'sostok' and stops when it emits 'eostok'.
cleaned['summary'] = 'sostok ' + cleaned['summary'] + ' eostok'
cleaned

# Model Building

In [None]:
# Fixed sequence lengths (in tokens) used when padding the tokenised data.
# NOTE(review): pad_sequences is later called without `truncating=`, so longer
# sequences are presumably cut at the front (Keras default) — confirm intended.
max_text_len = 250      # source articles; also the encoder input length
max_summary_len = 50    # target summaries, including the sostok/eostok markers

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the article/summary pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(cleaned['text']),
    np.array(cleaned['summary']),
    random_state=42,
    test_size=0.2,
)

## Tokenization

In [None]:
# Import from tensorflow.keras, like the model cells further down, so the
# notebook does not mix the standalone `keras` package with `tensorflow.keras`.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# First pass over the training articles only: this tokenizer is used to
# collect word counts so that rare words can be identified.
X_tokenizer = Tokenizer()
X_tokenizer.fit_on_texts(X_train)

In [None]:
# Number of distinct words seen in the training articles, plus one
# (presumably for the reserved padding index 0 — confirm).
X_num_words = len(X_tokenizer.word_index) + 1

# Words occurring fewer than `thresh` times are counted as rare.
thresh = 4
X_num_rare_words = sum(1 for occurrences in X_tokenizer.word_counts.values() if occurrences < thresh)

In [None]:
# First pass over the training summaries only, to collect word counts for
# the rare-word analysis below.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(y_train)

In [None]:
# Number of distinct words seen in the training summaries, plus one
# (presumably for the reserved padding index 0 — confirm).
y_num_words = len(y_tokenizer.word_index) + 1

# Summaries use a stricter rarity threshold than the articles.
thresh = 6
y_num_rare_words = sum(1 for occurrences in y_tokenizer.word_counts.values() if occurrences < thresh)

### X Tokenizer

In [None]:
# Rebuild the article tokenizer with the vocabulary capped at the total word
# count minus the rare words counted above.
X_tokenizer = Tokenizer(num_words=X_num_words-X_num_rare_words)
X_tokenizer.fit_on_texts(X_train)

In [None]:
# Replace the raw article strings of both splits with lists of word ids.
# NOTE: this rebinds X_train / X_test, so the cell must run exactly once per split.
X_train, X_test = (
    X_tokenizer.texts_to_sequences(articles)
    for articles in (X_train, X_test)
)

In [None]:
# Bring every article to exactly max_text_len ids, zero-padding at the end.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_text_len, padding='post')
    for sequences in (X_train, X_test)
)

In [None]:
# Size of the encoder's embedding table (passed to Embedding as input dimension).
X_voc = X_tokenizer.num_words + 1
X_voc

### Y Tokenizer

In [None]:
# Rebuild the summary tokenizer with the vocabulary capped at the total word
# count minus the rare words counted above.
y_tokenizer = Tokenizer(num_words=y_num_words-y_num_rare_words)
y_tokenizer.fit_on_texts(y_train)

In [None]:
# Replace the raw summary strings of both splits with lists of word ids.
# NOTE: this rebinds y_train / y_test, so the cell must run exactly once per split.
y_train, y_test = (
    y_tokenizer.texts_to_sequences(summaries)
    for summaries in (y_train, y_test)
)

In [None]:
# Bring every summary to exactly max_summary_len ids, zero-padding at the end.
y_train, y_test = (
    pad_sequences(sequences, maxlen=max_summary_len, padding='post')
    for sequences in (y_train, y_test)
)

In [None]:
# Size of the decoder's embedding table and of the softmax output layer.
y_voc = y_tokenizer.num_words + 1
y_voc

## Remove Blanks

In [None]:
# A padded summary with exactly two non-zero ids presumably holds nothing but
# the sostok/eostok markers, i.e. it is effectively blank. Collect the
# positions of such rows and drop them from both the targets and the inputs.
idx = np.flatnonzero(np.count_nonzero(y_train, axis=1) == 2).tolist()

y_train = np.delete(y_train, idx, axis=0)
X_train = np.delete(X_train, idx, axis=0)

In [None]:
# Same filtering for the held-out split: rows whose padded summary has exactly
# two non-zero ids (presumably only the sostok/eostok markers) are removed
# from both the targets and the inputs.
idx = np.flatnonzero(np.count_nonzero(y_test, axis=1) == 2).tolist()

y_test = np.delete(y_test, idx, axis=0)
X_test = np.delete(X_test, idx, axis=0)

## Create Model Architecture

In [None]:
# The backend is imported from tensorflow.keras as well, so clear_session()
# acts on the same Keras implementation that builds the layers below
# (previously it came from the standalone `keras` package).
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

# Start from a clean Keras state so re-running this cell does not pile up graphs.
K.clear_session()

latent_dim = 300      # units of every LSTM layer
embedding_dim = 200   # size of the word embeddings on both sides

## Encoder ##
encoder_inputs = Input(shape=(max_text_len,))

# Embedding layer
encoder_emb = Embedding(X_voc, embedding_dim, trainable=True)(encoder_inputs)

# Three stacked LSTMs; each returns its full output sequence for the next one.
# LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(encoder_emb)

# LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

# LSTM 3 - only the final states of this layer are handed to the decoder.
encoder_lstm3 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output3, state_h, state_c = encoder_lstm3(encoder_output2)

## Decoder ##
# Variable-length input so the same layers can decode one token at a time later.
decoder_inputs = Input(shape=(None,))

# Embedding layer (kept as a named layer because inference reuses it)
decoder_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
decoder_emb = decoder_emb_layer(decoder_inputs)

# LSTM, initialised with the encoder's final hidden and cell state.
# Despite their names, the two extra return values are this LSTM's hidden
# state and cell state; they are not used during training.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

# Dense Layer: a softmax over the summary vocabulary at every time step
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

## Model ##
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

In [None]:
# Targets are passed to fit() as integer word ids, hence the sparse loss.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
# Stop training once val_loss has not improved for 5 epochs.
# NOTE(review): restore_best_weights is not set, so the model presumably keeps
# the weights of the last epoch rather than the best one — confirm intended.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [None]:
# Sanity check: (number of held-out samples, padded article length).
X_test.shape

In [None]:
# Teacher forcing: the decoder is fed each summary without its last id and is
# trained to predict the same summary shifted one step left. The targets get
# a trailing axis of size 1. The held-out split serves as validation data
# for early stopping.
history = model.fit(
    [X_train, y_train[:, :-1]],
    y_train[:, 1:, np.newaxis],
    validation_data=(
        [X_test, y_test[:, :-1]],
        y_test[:, 1:, np.newaxis],
    ),
    batch_size=128,
    epochs=50,
    callbacks=[es],
)

## Visualizing Training

In [None]:
from matplotlib import pyplot as plt

# Training loss against the loss on the held-out split, one point per epoch.
for key, label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()

# Inferencing

In [None]:
# Lookup tables used while decoding:
reverse_target_word_index = y_tokenizer.index_word   # summary id  -> word
reverse_source_word_index = X_tokenizer.index_word   # article id  -> word
target_word_index = y_tokenizer.word_index           # summary word -> id

In [None]:
# Encoder: maps a padded article to the last LSTM's output sequence and final states.
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_output3, state_h, state_c])

# Decoder: at inference time the LSTM states are supplied from outside so the
# model can be stepped one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# Accepts the encoder output sequence.
# NOTE(review): no layer below consumes this input — confirm it is still needed.
decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))

# Reuse the trained embedding, LSTM and dense layers.
# NOTE: state_h2 / state_c2 are rebound here; before this cell they held the
# states of the second encoder LSTM.
decoder_emb2 = decoder_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_emb2,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    inputs=[
        decoder_inputs,
        decoder_hidden_state_input,
        decoder_state_input_h,
        decoder_state_input_c,
    ],
    outputs=[decoder_outputs2, state_h2, state_c2],
)

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary for a single encoded article.

    The article is run through ``encoder_model`` once; ``decoder_model`` is
    then stepped one token at a time, starting from 'sostok' and always taking
    the most probable next word, with the LSTM states carried between steps.

    Decoding stops when 'eostok' is produced, when the model produces an id
    that has no entry in ``reverse_target_word_index`` (for example the
    padding id 0, which previously raised ``KeyError``), or when
    ``max_summary_len - 1`` words have been generated.

    Args:
        input_seq: Array holding one padded article, as accepted by
            ``encoder_model.predict`` (the caller passes shape
            ``(1, max_text_len)``).

    Returns:
        str: The generated words, each preceded by a single space
        (an empty string when nothing was generated).
    """
    # Encode the input as state vectors. verbose=0 keeps the per-call progress
    # bar from flooding the notebook output.
    e_out, e_h, e_c = encoder_model.predict(input_seq, verbose=0)

    # Target sequence of length 1, holding the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=0)

        # Greedy choice: most probable word at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        # An id without a word (e.g. padding) is treated like the end marker.
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_words.append(sampled_token)

        # Exit condition: maximum summary length reached.
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the chosen word back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder's internal states over to the next step.
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [None]:
def seq2summary(input_seq):
    """Turn a padded summary id sequence back into text.

    Padding (0) and the 'sostok' / 'eostok' marker ids are left out; every
    remaining id is looked up in ``reverse_target_word_index``.

    Returns:
        str: The words in order, each followed by a single space.
    """
    words = []
    for token in input_seq:
        if token == 0:
            continue
        if token == target_word_index['sostok']:
            continue
        if token == target_word_index['eostok']:
            continue
        words.append(reverse_target_word_index[token])
    return ''.join(word + ' ' for word in words)

def seq2text(input_seq):
    """Turn a padded article id sequence back into text.

    Padding ids (0) are skipped; every other id is looked up in
    ``reverse_source_word_index``.

    Returns:
        str: The words in order, each followed by a single space.
    """
    return ''.join(
        reverse_source_word_index[token] + ' '
        for token in input_seq
        if token != 0
    )

In [None]:
# Show the first five training samples: source article, reference summary and
# the model's prediction. NOTE: these rows come from the training split.
for i in range(5):
    source_seq = X_train[i]
    print("Review:", seq2text(source_seq))
    print("Original summary:", seq2summary(y_train[i]))
    # decode_sequence expects a batch of one padded article.
    print("Predicted summary:", decode_sequence(source_seq.reshape(1, max_text_len)))
    print("\n")