In [1]:
import os
import numpy as np

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

In [2]:
train_path = os.path.join('datasets', 'summary_train.csv')
train_df = pd.read_csv(train_path, usecols=['title', 'text'])
train_df.head()

Unnamed: 0,title,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...


In [3]:
train_df.dropna(inplace=True)
text = train_df['text'].values
targets = train_df['title'].values

In [4]:
MAX_TARGETS_LENGTH = 20

def preprocess_targets(targets):
    seq_list = []
    for line in targets:
        words_list = []
        sentence = '[bos] ' + line.lower() + ' [eos]'
        sentence = sentence.split(' ')
        for word in sentence:
            if len(words_list) < MAX_TARGETS_LENGTH:
                words_list.append(word)
            else:
                break
        seq_list.append(words_list)
    return seq_list
            
targets = preprocess_targets(targets)

In [5]:
NUM_WORDS = 10_000

tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [6]:
word_index = {word: index for word, index in word_index.items() if index <= 10_000}
index_word = {index: word for word, index in word_index.items()}

In [7]:
MAX_LEN = 100
new_text = tokenizer.texts_to_sequences(text)
new_text = pad_sequences(new_text, maxlen=MAX_LEN, 
                         truncating='post', padding='post')

In [8]:
def get_words_mapping(text):
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(text)
    word_index = tokenizer.word_index
    word_index = {word: index for word, index in word_index.items() if index <= 10_000}
    index_word = {index: word for word, index in word_index.items()}
    return word_index, index_word



target_word_index, target_index_word = get_words_mapping(targets)

In [9]:
TARGET_MAX_LEN = 10

def get_generator(new_text, targets, batch_size=254):
    nrows = new_text.shape[0]
    num_batches = nrows // batch_size
    for batch_id in range(num_batches):
        low = batch_id * batch_size
        upper = (batch_id + 1) * batch_size
        encoder_input_data = new_text[low:upper]
        decoder_output_data = np.zeros(shape=(batch_size, TARGET_MAX_LEN, NUM_WORDS + 1))
        decoder_input_data = np.zeros(shape=(batch_size, TARGET_MAX_LEN, NUM_WORDS + 1))
        for i, line in enumerate(targets[low:upper]):
            for j, word in enumerate(line):
                if j >= TARGET_MAX_LEN:
                    break
                try:
                    idx = target_word_index[word]
                except KeyError:
                    continue
                else:
                    decoder_input_data[i, j, idx] = 1
                    if i > 0:
                        decoder_output_data[i, j-1, idx] = 1
        yield [encoder_input_data, decoder_input_data], decoder_output_data
        
train_gen = get_generator(new_text, targets)

In [10]:
encoder_input = Input(shape=(None,), name='encoder_input')
encoder_embedding = Embedding(input_dim=NUM_WORDS + 1, output_dim=100, 
                              input_length=MAX_LEN, name='encoder_embedding')
encoder_lstm = LSTM(100, return_state=True, 
                    return_sequences=True, name='encoder_lstm')
encoder_output, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_input))
encoder_states = [encoder_state_h, encoder_state_c]

decoder_input = Input(shape=(None, NUM_WORDS + 1), name='decoder_input')
decoder_lstm = LSTM(100, return_state=True, 
                    return_sequences=True, name='decoder_lstm')
decoder_output, decoder_state_h, decoder_state_c = decoder_lstm(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(units=NUM_WORDS + 1, activation='softmax', name='decoder_dense')
decoder_output = decoder_dense(decoder_output)

model = Model([encoder_input, decoder_input], decoder_output)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 100)    1000100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None, 10001) 0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, None, 100),  80400       encoder_embedding[0][0]          
_______________________________________________________________________________________

In [15]:
encoder_model = Model(encoder_input, encoder_states)

decoder_state_inputs = [Input(shape=(100,)), Input(shape=(100,))]
decoder_output, state_h, state_c = decoder_lstm(decoder_input, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_output = decoder_dense(decoder_output)
decoder_model = Model([decoder_input] + decoder_state_inputs, [decoder_output] + decoder_states)

In [16]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
history = model.fit(train_gen, steps_per_epoch=5, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Summarize

In [26]:
input_text = text[1]


'Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.  Speaking of Great Expectations, Hillary Rodham overflowed with them 48 years ago when she first addressed a Wellesley graduating class. The president of the college informed those gathered in 1969 that the students needed “no debate so far as I could ascertain as to who their spokesman was to be” (kind of the like the Democratic primaries in 2016 minus the   terms unknown then even at a Seven Sisters school). “I am very glad that Miss Adams made it clear that what I am speaking for today is all of us —  the 400 of us,” Miss Rodham told her classmates. After appointing herself Edger Bergen to the Charlie McCarthys and Mor

In [65]:
input_tokens = tokenizer.texts_to_sequences([input_text])
input_tokens = pad_sequences(input_tokens, maxlen=MAX_LEN, 
                             padding='post', truncating='post')
states_values = encoder_model.predict(input_tokens)

In [66]:
target_seq = np.zeros(shape=(1, 1, NUM_WORDS + 1))
target_seq

array([[[0., 0., 0., ..., 0., 0., 0.]]])

In [78]:
#target_seq[0, 0, target_word_index['[bos]']] = 1
output_tokens, h, c =  decoder_model.predict([target_seq] + states_values)

In [79]:
output_tokens.shape
sample_token_idx = np.argmax(output_tokens[0, -1, :])

In [80]:
word = target_index_word[sample_token_idx]

In [77]:
target_seq = np.zeros((1, 1, NUM_WORDS + 1))
target_seq[0, 0, sample_token_idx] = 1

states_value = [h, c]