In [219]:
### Import packages
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import contractions
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

### need a local attention_local.py file for this.
from attention_local import AttentionLayer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anusseth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [222]:
class Text_Summarization():
    def __init__(self, num_rows):
        self.num_rows = num_rows
        self.stopwords = set(stopwords.words('english'))
        self.run_process()


    def run_process(self):
        self.data, self.cleaned_text, self.cleaned_summary = self.data_import_preprocess()
        self.x_tr, self.y_tr, self.x_val, self.y_val, self.x_voc, self.y_voc = self.cleanedData()
        #self.x_tr, self.x_val, self.x_voc = self.tokenizer(train, val)
#         self.text_word_count = self.cleaned_data_text(self.cleaned_text)
#         self.summary_word_count = self.cleaned_data_summary(self.cleaned_summary)

    def text_cleaner(self, text):
        newString = text.lower()
        newString = BeautifulSoup(newString, "lxml").text
        newString = re.sub(r'\([^)]*\)', '', newString)
        newString = re.sub('"','', newString)
        newString = ' '.join([contractions.fix(t) for t in newString.split(" ")])
        newString = re.sub(r"'s\b","",newString)
        newString = re.sub("[^a-zA-Z]", " ", newString)
        newString = re.sub('[m]{2,}', 'mm', newString)

        tokens = [w for w in newString.split() if not w in self.stopwords]

        long_words=[]
        for i in tokens:
            if len(i)>1:               #removing short word
                long_words.append(i)
        return (" ".join(long_words)).strip()



    def data_import_preprocess(self):
    #Loading the data from the Amazon Review csv
        data = pd.read_csv(r".\amazon_food_reviews\Reviews.csv", nrows = self.num_rows)


        data.drop(columns = ['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'], inplace=True) #drop useless columns
        data.drop_duplicates(subset = ['Text'], inplace=True) #dropping duplicates
        data.replace('', np.nan, inplace=True)
        data.dropna(axis=0, inplace = True) #dropping na

        cleaned_text = []
        for t in data['Text']:
            cleaned_text.append(self.text_cleaner(t))
        cleaned_summary = []
        for t in data['Summary']:
            cleaned_summary.append(self.text_cleaner(t))

        data['cleaned_text'], data['cleaned_summary'] = cleaned_text, cleaned_summary

        return data, cleaned_text, cleaned_summary
    
    def majority_length_data(self, lst, cleaned_data):
#         starttime = time.time()
        dict_summary = {item: lst.count(item) for item in set(lst)}
#         endtime = time.time()
#         print("time diff is", endtime - starttime)
       # dict_summary = {item: lst(count for count in lst).count(item) for item in sorted(set(lst))}

#         print(dict_summary)

        percent = round(0.95 * len(cleaned_data))
        print(percent)

        count = 0
        for key, value in sorted(dict_summary.items()):
            count += value
            if count>=percent:
              return key
    def tokenizer(self, train, val):
        x_tokenizer = Tokenizer()
        x_tokenizer.fit_on_texts(list(train))
        thresh=6

        cnt=0
        tot_cnt=0
        freq=0
        tot_freq=0

        for key,value in x_tokenizer.word_counts.items():
            tot_cnt=tot_cnt+1
            tot_freq=tot_freq+value
            if(value<thresh):
                cnt=cnt+1
                freq=freq+value

        print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
        print("Total Coverage of rare words:",(freq/tot_freq)*100)
        #prepare a tokenizer for reviews on training data
        x_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
        x_tokenizer.fit_on_texts(list(train))

        #convert text sequences into integer sequences
        x_tr_seq    =   x_tokenizer.texts_to_sequences(train)
        x_val_seq   =   x_tokenizer.texts_to_sequences(val)


        #padding zero upto maximum length
        train    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
        val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

        #size of vocabulary ( +1 for padding token)
        x_voc   =  x_tokenizer.num_words + 1
        #print(x_tokenizer.word_counts[text],len(train))
        return x_voc, train, val
            
        
    def emptyRows(self, var):
        ind=[]
        for i in range(len(var)):
            cnt=0
            for j in var[i]:
                if j!=0:
                    cnt=cnt+1
            if(cnt==2):
                ind.append(i)

        return ind
        
    def cleanedData(self):
        text_word_count = []
        summary_word_count = []
        max_text_len = 0
        max_summary_len = 0

        # populate the lists with sentence lengths
        for i in text_class.data['cleaned_text']:
              text_word_count.append(len(i.split()))
        

        for i in text_class.data['cleaned_summary']:
              summary_word_count.append(len(i.split()))

        length_df = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})
        max_summary_len = self.majority_length_data(summary_word_count, self.cleaned_summary)
        max_text_len = self.majority_length_data(text_word_count, self.cleaned_text)
        print(max_summary_len)
        print(max_text_len)
        
        cleaned_text =np.array(self.cleaned_text)
        cleaned_summary=np.array(self.cleaned_summary)

        short_text=[]
        short_summary=[]

        for i in range(len(cleaned_text)):
            if(len(cleaned_summary[i].split())<=max_summary_len and len(cleaned_text[i].split())<=max_text_len):
                short_text.append(cleaned_text[i])
                short_summary.append(cleaned_summary[i])

        df=pd.DataFrame({'text':short_text,'summary':short_summary})
        df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
        x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0,shuffle=True)
        x_voc, x_tr, x_val = self.tokenizer(x_tr, x_val)
        y_voc, y_tr, y_val = self.tokenizer(y_tr, y_val)
        ind = self.emptyRows(y_tr)
        y_tr=np.delete(y_tr,ind, axis=0)
        x_tr=np.delete(x_tr,ind, axis=0)
        ind = self.emptyRows(y_val)
        y_val=np.delete(y_val,ind, axis=0)
        x_val=np.delete(x_val,ind, axis=0)
        return x_tr, y_tr,x_val, y_val, x_voc, y_voc

In [223]:
text_class = Text_Summarization(100000)

  newString = BeautifulSoup(newString, "lxml").text
  newString = BeautifulSoup(newString, "lxml").text


84004
84004
6
104
% of rare words in vocabulary: 71.36603637348256
Total Coverage of rare words: 2.2052559981298776
% of rare words in vocabulary: 76.49653434152489
Total Coverage of rare words: 4.721348808175155


In [224]:
text_class.data.head()

Unnamed: 0,Id,Summary,Text,cleaned_text,cleaned_summary
0,1,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,good quality dog food
1,2,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...,advertised
2,3,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,delight says
3,4,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,cough medicine
4,5,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,great taffy


In [225]:
# cnt=0
# for i in text_class.data['cleaned_summary']:
#     if(len(i.split())<=max_summary_len):
#         cnt=cnt+1
# print (cnt)
# print(cnt/len(text_class.data['cleaned_summary']))

In [226]:
# y_tokenizer.word_counts['sostok'],len(y_tr)

In [227]:
from keras import backend as K
import tensorflow as tf
K.clear_session()
#print(K.tensorflow_backend._get_available_gpus())
print(tf.config.experimental.list_physical_devices('GPU'))
latent_dim = 300
embedding_dim=100

#Encoder
encoder_inputs = Input(shape=(max_text_len,))

#embedding layer
enc_emb =  Embedding(text_class.x_voc, embedding_dim,trainable=True)(encoder_inputs)

#encoder lstm 1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

#encoder lstm 2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

#encoder lstm 3
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))

#embedding layer
dec_emb_layer = Embedding(text_class.y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concat attention input and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#dense layer
decoder_dense =  TimeDistributed(Dense(text_class.y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()

[]
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 104)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 104, 100)             1269100   ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 104, 300),           481200    ['embedding[0][0]']           
                              (None, 300),                                                        
                              (None, 300)]                                                        
                                                                                           

In [228]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [229]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

In [None]:
history=model.fit([text_class.x_tr,text_class.y_tr[:,:-1]], text_class.y_tr.reshape(text_class.y_tr.shape[0],text_class.y_tr.shape[1], 1)[:,1:] ,epochs=50,callbacks=[es],batch_size=128, validation_data=([text_class.x_val,text_class.y_val[:,:-1]], text_class.y_val.reshape(text_class.y_val.shape[0],text_class.y_val.shape[1], 1)[:,1:]))

Epoch 1/50
 19/563 [>.............................] - ETA: 7:07:38 - loss: 1.6503

In [218]:
print(x_tr)

[[ 2944  1054  1419 ...     0     0     0]
 [   50   167   648 ...     0     0     0]
 [ 3028  6295   159 ...     0     0     0]
 ...
 [ 3152   225  1279 ...     0     0     0]
 [  194 10173 11485 ...     0     0     0]
 [   24    83   546 ...     0     0     0]]
