In [1]:
import numpy as np
import pandas as pd
from util import contraction
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import backend as K
from keras.layers import Input, CuDNNLSTM, Embedding, Dense, Concatenate, TimeDistributed, LSTM
from keras.models import Model, model_from_json
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from util import attention
from matplotlib import pyplot
import json

gpus = tf.config.list_physical_devices('GPU')
if gpus:
      for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)


nltk.download("stopwords")

data = pd.read_csv("./Data/dblp-v10.csv", nrows=75000)

data.drop_duplicates(subset=["abstract"], inplace=True)
data.dropna(axis=0, inplace=True)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

#Preprocessing
stop_words = set(stopwords.words('english'))
def clean_text(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction.contraction_mapping[t] if t in contraction.contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = [w for w in newString.split() if not w in stop_words]

    cleaned_text = []
    for i in tokens:
        if len(i)>=3:
            cleaned_text.append(i)
    return (" ".join(cleaned_text)).strip()

def clean_summary(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction.contraction_mapping[t] if t in contraction.contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    tokens = newString.split()

    cleaned_text = []
    for i in tokens:
        if len(i)>1:
            cleaned_text.append(i)
    return " ".join(cleaned_text)

cleaned_text = []
for i in data['abstract']:
    cleaned_text.append(clean_text(i))
cleaned_summary = []
for i in data['title']:
    cleaned_summary.append(clean_summary(i))

data['cleaned_text'] = cleaned_text
data['cleaned_summary'] = cleaned_summary
data['cleaned_summary'].replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)
data['cleaned_summary'] = data['cleaned_summary'].apply(lambda x: '_START_' + x + '_END_')

max_len_text=200 
max_len_summary=20
latent_dim = 500

x_tr, x_val, y_tr, y_val = train_test_split(data['cleaned_text'], data['cleaned_summary'], test_size=0.1, random_state=0, shuffle=True);

#Tokenizers
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr))

x_tr = x_tokenizer.texts_to_sequences(x_tr)
x_tr = pad_sequences(x_tr, maxlen=max_len_text, padding='post')
x_val = x_tokenizer.texts_to_sequences(x_val)
x_val = pad_sequences(x_val, maxlen=max_len_text, padding='post')

x_voc_size = len(x_tokenizer.word_index)+1

y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr))

y_tr = y_tokenizer.texts_to_sequences(y_tr)
y_tr = pad_sequences(y_tr, maxlen=max_len_summary, padding='post')
y_val = y_tokenizer.texts_to_sequences(y_val)
y_val = pad_sequences(y_val, maxlen=max_len_summary, padding='post')

y_voc_size = len(y_tokenizer.word_index)+1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['cleaned_summary'].replace('', np.nan, inplace=True)


In [None]:

K.clear_session()

encoder_inputs = Input(shape=(max_len_text))
enc_emb = Embedding(x_voc_size, latent_dim, trainable=True)(encoder_inputs)

encoder_lstm1 = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

encoder_lstm2 = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

encoder_lstm3 = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_voc_size, latent_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = CuDNNLSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# attn_out = AdditiveAttention()([encoder_outputs, decoder_outputs])

attn_layer = attention.AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs]) 

decoder_concat_input = Concatenate(axis=-1, name="concat_layer")([decoder_outputs, attn_out])

decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")
es = EarlyStopping(monitor="val_loss", patience=2, mode='auto', verbose=1)
history = model.fit([x_tr, y_tr[:,:-1]], y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:,1:], epochs=50, callbacks=[es], batch_size=512, validation_data=([x_val, y_val[:,:-1]], y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:,1:]))
model.save("./seq2seq.keras")

pyplot.plot(history.history['loss'], label='train') 
pyplot.plot(history.history['val_loss'], label='test') 
pyplot.legend() 
pyplot.show()

In [3]:
#testing
# print(np.asarray(dec_emb_layer.get_weights()).shape)



reverse_target_word_index=y_tokenizer.index_word 
reverse_source_word_index=x_tokenizer.index_word 
target_word_index=y_tokenizer.word_index

def build_models(loaded_model):
    encoder_inputs = Input(shape=(max_len_text))
    embedding = Embedding(x_voc_size, latent_dim, trainable=True)
    enc_emb = embedding(encoder_inputs)

    encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True)
    encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

    encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True)
    encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

    encoder_lstm3 = LSTM(latent_dim, return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

    embedding.set_weights(loaded_model.layers[1].get_weights())
    encoder_lstm1.set_weights(loaded_model.layers[2].get_weights())
    encoder_lstm2.set_weights(loaded_model.layers[4].get_weights())
    encoder_lstm3.set_weights(loaded_model.layers[6].get_weights())

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_hidden_state_input = Input(shape=(max_len_text,latent_dim))

    decoder_inputs = Input(shape=(None,))
    dec_emb_layer = Embedding(y_voc_size, latent_dim, trainable=True)
    dec_emb = dec_emb_layer(decoder_inputs)

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])

    attention_layer = attention.AttentionLayer(name='attention_layer')
    attn_out, attn_states = attention_layer([decoder_hidden_state_input, decoder_outputs]) 

    decoder_concat_input = Concatenate(axis=-1, name="concat_layer")([decoder_outputs, attn_out])

    decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax'))
    decoder_outputs = decoder_dense(decoder_concat_input)

    dec_emb_layer.set_weights(loaded_model.layers[5].get_weights())
    decoder_lstm.set_weights(loaded_model.layers[7].get_weights())
    attention_layer.set_weights(loaded_model.layers[8].get_weights())
    decoder_dense.set_weights(loaded_model.layers[10].get_weights())

    encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
    decoder_model = Model([decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c], [decoder_outputs] + [decoder_fwd_state, decoder_back_state])
    return encoder_model, decoder_model

def decode_sequence(input_seq, encoder_model, decoder_model):
    e_out, e_h, e_c = encoder_model.predict(input_seq, verbose=0)

    target_seq = np.zeros((1,1))

    target_seq[0, 0] = target_word_index['start']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=0)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if(sampled_token!='end'):
            decoded_sentence += ' '+sampled_token

        if (sampled_token == 'end' or len(decoded_sentence.split()) >= (max_len_summary-1)):
            stop_condition = True

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        e_h, e_c = h, c

    return decoded_sentence

#Convert cudnnlstm layers to lstm layers to be CPU compatible
loaded_model = tf.keras.models.load_model("seq2seq.keras", custom_objects={"AttentionLayer": attention.AttentionLayer})
json_config = loaded_model.to_json()
json_config_= json.loads(json_config)
layers = json_config_['config']['layers']
for layer in layers:
    if layer['class_name'].lower() == 'cudnnlstm':
        layer['class_name'] = 'LSTM'
model_json = json.dumps(json_config_)
model = model_from_json(model_json)
model.load_weights("seq2seq.keras")
encoder, decoder = build_models(model)

print("Enter a sentence")
x = input()
print("Input: " + x)
cleaned_text = [clean_text(x)]
cleaned_text = x_tokenizer.texts_to_sequences(cleaned_text)
cleaned_text = pad_sequences(cleaned_text, maxlen=max_len_text, padding='post')
print("Predicted summary:",decode_sequence(np.asarray(cleaned_text[0]).reshape(1,max_len_text), encoder, decoder))


Enter a sentence
Input: In this paper we study the nature of factors that facilitate mobile data services use, as well as the characteristics of early adopters, to shed light into diffusion patterns and inform predictions for future growth. We advocate that the use of mobile data services can be associated with one's level of satisfaction with his/her life. Based on the findings of a questionnaire-based survey (N=388), we have found that users satisfied with their personal life use information, mobile e-mail, and stock broking services more frequently than dissatisfied ones, while users satisfied with their professional life tend to use financial, information, and mobile e-mail services more heavily. Furthermore, we identify early adopters' profiles in terms of their demographic characteristics (gender, age, education, and income) to inform the design of effective target marketing strategies
Predicted summary:  mobile data service for the study of mobile applications
