In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('news_summary.csv',encoding='iso-8859-1')
df2 = pd.read_csv('news_summary_more.csv',encoding='iso-8859-1')
df1.fillna('', inplace=True)
df2.fillna('', inplace=True)

In [3]:
display(df2.columns)
display(df1.columns)

Index(['headlines', 'text'], dtype='object')

Index(['author', 'date', 'headlines', 'read_more', 'text', 'ctext'], dtype='object')

In [4]:
df1['text'] = df1['author'].str.cat(df1['date'
        ].str.cat(df1['read_more'].str.cat(df1['text'
        ].str.cat(df1['ctext'], sep=' '), sep=' '), sep=' '), sep=' ')

In [5]:
df1.drop(['author','date','read_more','ctext'],inplace=True,axis=1)

In [6]:
df = pd.concat([df1,df2],axis=0)

In [7]:
df

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,"Chhavi Tyagi 03 Aug 2017,Thursday http://www.h..."
1,Malaika slams user who trolled her for 'divorc...,"Daisy Mowke 03 Aug 2017,Thursday http://www.hi..."
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,"Arshiya Chopra 03 Aug 2017,Thursday http://www..."
3,Aaj aapne pakad liya: LeT man Dujana before be...,"Sumedha Sehra 03 Aug 2017,Thursday http://indi..."
4,Hotel staff to get training to spot signs of s...,"Aarushi Maheshwari 03 Aug 2017,Thursday http:/..."
...,...,...
98396,CRPF jawan axed to death by Maoists in Chhatti...,A CRPF jawan was on Tuesday axed to death with...
98397,First song from Sonakshi Sinha's 'Noor' titled...,"'Uff Yeh', the first song from the Sonakshi Si..."
98398,'The Matrix' film to get a reboot: Reports,"According to reports, a new version of the 199..."
98399,Snoop Dogg aims gun at clown dressed as Trump ...,A new music video shows rapper Snoop Dogg aimi...


In [11]:
import re

# Remove non-alphabetic characters (Data Cleaning)
def text_strip(column):
    for row in column:
        row = re.sub("(\\t)", " ", str(row)).lower()
        row = re.sub("(\\r)", " ", str(row)).lower()
        row = re.sub("(\\n)", " ", str(row)).lower()

        # Remove _ if it occurs more than one time consecutively
        row = re.sub("(__+)", " ", str(row)).lower()

        # Remove - if it occurs more than one time consecutively
        row = re.sub("(--+)", " ", str(row)).lower()

        # Remove ~ if it occurs more than one time consecutively
        row = re.sub("(~~+)", " ", str(row)).lower()

        # Remove + if it occurs more than one time consecutively
        row = re.sub("(\+\++)", " ", str(row)).lower()

        # Remove . if it occurs more than one time consecutively
        row = re.sub("(\.\.+)", " ", str(row)).lower()

        # Remove the characters - <>()|&©ø"',;?~*!
        row = re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", " ", str(row)).lower()

        # Remove mailto:
        row = re.sub("(mailto:)", " ", str(row)).lower()

        # Remove \x9* in text
        row = re.sub(r"(\\x9\d)", " ", str(row)).lower()

        # Replace INC nums to INC_NUM
        row = re.sub("([iI][nN][cC]\d+)", "INC_NUM", str(row)).lower()

        # Replace CM# and CHG# to CM_NUM
        row = re.sub("([cC][mM]\d+)|([cC][hH][gG]\d+)", "CM_NUM", str(row)).lower()

        # Remove punctuations at the end of a word
        row = re.sub("(\.\s+)", " ", str(row)).lower()
        row = re.sub("(\-\s+)", " ", str(row)).lower()
        row = re.sub("(\:\s+)", " ", str(row)).lower()

        # Replace any url to only the domain name
        try:
            url = re.search(r"((https*:\/*)([^\/\s]+))(.[^\s]+)", str(row))
            repl_url = url.group(3)
            row = re.sub(r"((https*:\/*)([^\/\s]+))(.[^\s]+)", repl_url, str(row))
        except:
            pass

        # Remove multiple spaces
        row = re.sub("(\s+)", " ", str(row)).lower()

        # Remove the single character hanging between any two spaces
        row = re.sub("(\s+.\s+)", " ", str(row)).lower()
        
        row = '_START_ ' + row + ' _END_'

    return column

In [12]:
text = text_strip(df['text'])
summary = text_strip(df['headlines'])

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(text),
    np.array(summary),
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
tok = Tokenizer(num_words = 1000)
tok.fit_on_texts(x_tr)
X_tr = tok.texts_to_sequences(x_tr)

In [16]:
tok = Tokenizer(num_words = 1000)
tok.fit_on_texts(y_tr)
Y_tr = tok.texts_to_sequences(y_tr)

In [17]:
X_tr = pad_sequences(X_tr, maxlen=100, padding='post')
Y_tr = pad_sequences(Y_tr, maxlen=10, padding='post')

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [23]:
latent_dim = 300
embedding_dim = 200

# Encoder
encoder_inputs = Input(shape=(100, ))

# Embedding layer
enc_emb = Embedding(1000, embedding_dim,
                    trainable=True)(encoder_inputs)

# Encoder LSTM 1
encoder_lstm1 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output1, state_h1, state_c1) = encoder_lstm1(enc_emb)

# Encoder LSTM 2
encoder_lstm2 = LSTM(latent_dim, return_sequences=True,
                     return_state=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_output2, state_h2, state_c2) = encoder_lstm2(encoder_output1)

# Encoder LSTM 3
encoder_lstm3 = LSTM(latent_dim, return_state=True,
                     return_sequences=True, dropout=0.4,
                     recurrent_dropout=0.4)
(encoder_outputs, state_h, state_c) = encoder_lstm3(encoder_output2)

# Set up the decoder, using encoder_states as the initial state
decoder_inputs = Input(shape=(None, ))

# Embedding layer
dec_emb_layer = Embedding(1000, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM
decoder_lstm = LSTM(latent_dim, return_sequences=True,
                    return_state=True, dropout=0.4,
                    recurrent_dropout=0.2)
(decoder_outputs, decoder_fwd_state, decoder_back_state) = \
    decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Dense layer
decoder_dense = TimeDistributed(Dense(1000, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [21]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [None]:
history = model.fit(
    [X_tr, Y_tr[:, :-1]],
    Y_tr.reshape(Y_tr.shape[0], Y_tr.shape[1], 1)[:, 1:],
    epochs=50,
    batch_size=128
    )

In [27]:
Y_tr[0]

array([ 60,   1, 681, 109, 274,   0,   0,   0,   0,   0])

In [30]:
# Inference Models
# This will be used to get the input for decoder
# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs,
                      state_h, state_c])

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_hidden_state_input = Input(shape=(10, latent_dim))

# Get the embeddings of the decoder sequence
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
(decoder_outputs2, state_h2, state_c2) = decoder_lstm(dec_emb2,
        initial_state=[decoder_state_input_h, decoder_state_input_c])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input,
                      decoder_state_input_h, decoder_state_input_c],
                      [decoder_outputs2] + [state_h2, state_c2])

In [None]:
def decode_sequence(input_seq):

    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) \
            >= max_summary_len - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence