In [98]:
import os
import numpy as np

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [99]:
# Read the training CSV, keeping only the two columns this notebook works with:
# 'text' (later tokenised as the model input) and 'title' (later used as targets).
train_path = os.path.join('datasets', 'summary_train.csv')
train_df = pd.read_csv(
    train_path,
    usecols=['title', 'text'],
)
train_df.head()

Unnamed: 0,title,text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ..."
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...


In [100]:
# Discard every row with a missing value, then pull the two columns out as
# arrays: `text` holds the 'text' column, `targets` holds the 'title' column.
train_df.dropna(inplace=True)
text, targets = (train_df[column].values for column in ('text', 'title'))

In [101]:
MAX_TARGETS_LENGTH = 20

def preprocess_targets(targets, max_length=None):
    """Turn raw title strings into lower-cased token lists framed by markers.

    Every returned list starts with '[bos]' and ends with '[eos]'. Titles that
    are too long are truncated *between* the markers, so the closing '[eos]'
    is always kept and the decoder can learn where a title stops.

    Args:
        targets: iterable of title strings.
        max_length: maximum number of tokens per returned list, including the
            two markers. Defaults to MAX_TARGETS_LENGTH.

    Returns:
        A list with one token list per input title, each at most
        ``max_length`` tokens long.

    Raises:
        ValueError: if ``max_length`` is smaller than 2, which leaves no room
            for the two markers.
    """
    if max_length is None:
        max_length = MAX_TARGETS_LENGTH
    if max_length < 2:
        raise ValueError('max_length must be at least 2 to fit [bos] and [eos]')

    # Two slots are reserved for the markers; the rest is available to words.
    words_budget = max_length - 2

    seq_list = []
    for line in targets:
        # split() with no argument collapses runs of whitespace, so repeated
        # or trailing spaces do not produce empty-string tokens.
        words = line.lower().split()[:words_budget]
        seq_list.append(['[bos]', *words, '[eos]'])
    return seq_list
            
# Replace the raw title strings with their token lists. This rebinds `targets`,
# so running this cell a second time would pass token lists (not strings) back
# into preprocess_targets, which calls .lower() on each element.
targets = preprocess_targets(targets)

In [102]:
# Vocabulary size limit: passed to the Keras tokenizers in this notebook and
# used (as NUM_WORDS + 1) for the one-hot width of the decoder arrays.
NUM_WORDS = 10_000

# Fit the tokenizer on the `text` array; `word_index` is its word -> integer
# index lookup.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(text)
word_index = tokenizer.word_index

In [103]:
# Restrict the lookup to indices 1..NUM_WORDS and build the reverse
# (index -> word) table. The cut-off uses NUM_WORDS rather than a repeated
# literal so it cannot drift from the tokenizer's configured vocabulary size.
word_index = {word: index for word, index in word_index.items() if index <= NUM_WORDS}
index_word = {index: word for word, index in word_index.items()}

In [104]:
MAX_LEN = 100

# Encode each entry of `text` as a sequence of word indices and bring every
# sequence to exactly MAX_LEN entries: both truncation and padding are applied
# at the end ('post') of the sequence.
new_text = pad_sequences(
    tokenizer.texts_to_sequences(text),
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [105]:
def get_words_mapping(text, num_words=None):
    """Build word -> index and index -> word lookups for a corpus.

    A fresh Keras ``Tokenizer`` is fitted on ``text`` and its ``word_index``
    is cut down to the entries whose index is at most ``num_words``.

    Args:
        text: corpus to fit on, in any form ``Tokenizer.fit_on_texts`` accepts
            (this notebook passes a list of token lists).
        num_words: largest index to keep. Defaults to the notebook-level
            NUM_WORDS, so the lookup always fits the NUM_WORDS + 1 wide
            one-hot arrays built from it.

    Returns:
        Tuple ``(word_index, index_word)`` of dicts that are inverses of each
        other.
    """
    if num_words is None:
        num_words = NUM_WORDS
    fitted = Tokenizer(num_words=num_words)
    fitted.fit_on_texts(text)
    word_index = {
        word: index
        for word, index in fitted.word_index.items()
        if index <= num_words
    }
    index_word = {index: word for word, index in word_index.items()}
    return word_index, index_word



target_word_index, target_index_word = get_words_mapping(targets)

In [162]:
TARGET_MAX_LEN = 100

def get_generator(new_text, targets, batch_size=124):
    """Yield one-hot encoded encoder/decoder batches for teacher forcing.

    Reads the notebook-level ``target_word_index``, ``NUM_WORDS`` and
    ``TARGET_MAX_LEN``.

    Args:
        new_text: 2-D array of encoder inputs, one row per sample.
        targets: sequence of token lists, aligned row-for-row with
            ``new_text``.
        batch_size: number of samples per yielded batch.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_output_data)``.
        ``encoder_input_data`` is the matching slice of ``new_text``. The two
        decoder arrays have shape
        ``(batch_size, TARGET_MAX_LEN, NUM_WORDS + 1)``;
        ``decoder_output_data`` is ``decoder_input_data`` shifted one timestep
        to the left, so the target at step ``t`` is the input token of step
        ``t + 1``.

    Notes:
        * Rows beyond the last full batch are not yielded.
        * The generator makes a single pass over the data and then stops.
        * Tokens missing from ``target_word_index`` leave an all-zero vector
          at their timestep.
        * Both decoder arrays are dense float64; with the default sizes each
          one is large, so keep ``batch_size`` in line with available memory.
    """
    nrows = new_text.shape[0]
    num_batches = nrows // batch_size
    onehot_shape = (batch_size, TARGET_MAX_LEN, NUM_WORDS + 1)
    for batch_id in range(num_batches):
        low = batch_id * batch_size
        upper = low + batch_size
        encoder_input_data = new_text[low:upper]
        decoder_output_data = np.zeros(shape=onehot_shape)
        decoder_input_data = np.zeros(shape=onehot_shape)
        for i, line in enumerate(targets[low:upper]):
            for j, word in enumerate(line):
                if j >= TARGET_MAX_LEN:
                    break
                idx = target_word_index.get(word)
                if idx is None:
                    # Out-of-vocabulary token: leave this timestep empty.
                    continue
                decoder_input_data[i, j, idx] = 1
                # The shift is along the time axis, so the guard must test the
                # timestep j (not the sample index i): the first token has no
                # preceding step to be the target of.
                if j > 0:
                    decoder_output_data[i, j - 1, idx] = 1
        yield [encoder_input_data, decoder_input_data], decoder_output_data
        
# Instantiate the batch generator with its default batch size. get_generator
# loops over the batches once, so `data` is exhausted after a single pass.
data = get_generator(new_text, targets)