In [175]:
### Import packages
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import contractions
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

### Requires a local attention_local.py file (providing AttentionLayer) importable from this notebook.
from attention_local import AttentionLayer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anusseth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [178]:
class Text_Summarization():
    """Load, clean, length-filter and tokenize review text/summary pairs.

    Constructing an instance runs the whole pipeline (see ``run_process``).
    Afterwards the following attributes are available:

    * ``data`` - de-duplicated frame with ``cleaned_text`` / ``cleaned_summary``
      columns added.
    * ``cleaned_text`` / ``cleaned_summary`` - lists of cleaned strings.
    * ``text_word_count`` / ``summary_word_count`` - word count per row.
    * ``max_text_len`` / ``max_summary_len`` - length caps chosen by
      ``majority_length_data``.
    * ``df`` - rows that satisfy both caps, summaries wrapped in
      ``sostok`` / ``eostok`` markers.
    * ``x_tr`` / ``x_val`` / ``y_tr`` / ``y_val`` - padded integer sequences.
    * ``x_voc`` / ``y_voc`` - vocabulary sizes, ``x_tokenizer`` /
      ``y_tokenizer`` - the fitted tokenizers.
    """

    # Location used when no ``data_path`` is given to the constructor.
    DEFAULT_DATA_PATH = r".\amazon_food_reviews\Reviews.csv"
    # Words seen fewer than this many times in the training split count as rare.
    RARE_WORD_THRESHOLD = 6
    # Share of rows that must fit within the chosen maximum length.
    COVERAGE = 0.95

    def __init__(self, num_rows, data_path=None):
        """Read ``num_rows`` rows from the reviews CSV and run the pipeline.

        Args:
            num_rows: number of CSV rows to read (passed to ``pd.read_csv`` as ``nrows``).
            data_path: optional CSV location; defaults to ``DEFAULT_DATA_PATH``.
        """
        self.num_rows = num_rows
        self.data_path = self.DEFAULT_DATA_PATH if data_path is None else data_path
        self.stopwords = set(stopwords.words('english'))
        self.run_process()

    def run_process(self):
        """Run import/cleaning, then length analysis, filtering and tokenization."""
        self.data, self.cleaned_text, self.cleaned_summary = self.data_import_preprocess()
        (self.text_word_count, self.summary_word_count,
         self.max_summary_len, self.max_text_len) = self.cleanedData()

    def text_cleaner(self, text):
        """Return ``text`` lower-cased, stripped of markup/punctuation/stopwords.

        Steps, in order: lower-case, strip HTML, drop parenthesised spans and
        double quotes, expand contractions, drop possessive ``'s``, replace
        every non-letter with a space, collapse runs of ``m`` to ``mm``, then
        remove stopwords and single-character tokens.
        """
        newString = text.lower()
        newString = BeautifulSoup(newString, "lxml").text
        newString = re.sub(r'\([^)]*\)', '', newString)
        newString = re.sub('"', '', newString)
        newString = ' '.join([contractions.fix(t) for t in newString.split(" ")])
        newString = re.sub(r"'s\b", "", newString)
        newString = re.sub("[^a-zA-Z]", " ", newString)
        newString = re.sub('[m]{2,}', 'mm', newString)

        kept = [w for w in newString.split() if w not in self.stopwords and len(w) > 1]
        return " ".join(kept)

    def data_import_preprocess(self):
        """Load the CSV, drop unused columns/duplicates/missing rows, clean text.

        Returns:
            ``(data, cleaned_text, cleaned_summary)`` where ``data`` carries the
            two cleaned columns and the lists hold the same cleaned strings.
        """
        data = (
            pd.read_csv(self.data_path, nrows=self.num_rows)
            .drop(columns=['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
                           'HelpfulnessDenominator', 'Score', 'Time'])
            .drop_duplicates(subset=['Text'])
            .replace('', np.nan)
            .dropna(axis=0)
        )

        cleaned_text = [self.text_cleaner(t) for t in data['Text']]
        cleaned_summary = [self.text_cleaner(t) for t in data['Summary']]

        data['cleaned_text'], data['cleaned_summary'] = cleaned_text, cleaned_summary

        return data, cleaned_text, cleaned_summary

    def majority_length_data(self, lst, cleaned_data):
        """Return the smallest length that covers ``COVERAGE`` of the rows.

        Args:
            lst: word count of every row.
            cleaned_data: the rows themselves; only ``len(cleaned_data)`` is used.

        Returns:
            The smallest value ``k`` in ``lst`` such that at least
            ``round(COVERAGE * len(cleaned_data))`` entries are ``<= k``. If
            that target is never reached the largest value in ``lst`` is
            returned, and ``None`` when ``lst`` is empty.
        """
        # Single pass instead of lst.count() per element (which is quadratic).
        counts = {}
        for length in lst:
            counts[length] = counts.get(length, 0) + 1
        dict_summary = dict(sorted(counts.items()))

        print(dict_summary)

        percent = round(self.COVERAGE * len(cleaned_data))
        print(percent)

        count = 0
        key = None
        for key, value in dict_summary.items():
            count += value
            if count >= percent:
                return key
        # Target not reached (lst shorter than cleaned_data): fall back to the
        # largest observed length, or None when there was nothing to count.
        return key

    def _fit_and_encode(self, train, val, max_len):
        """Fit a rare-word-pruned tokenizer on ``train`` and encode both splits.

        Returns ``(fitted_tokenizer, vocab_size, train_padded, val_padded)``.
        """
        train_texts = list(train)

        # First pass over the full vocabulary, only to count rare words.
        probe = Tokenizer()
        probe.fit_on_texts(train_texts)

        cnt = 0
        tot_cnt = 0
        freq = 0
        tot_freq = 0
        for value in probe.word_counts.values():
            tot_cnt += 1
            tot_freq += value
            if value < self.RARE_WORD_THRESHOLD:
                cnt += 1
                freq += value

        print("% of rare words in vocabulary:", (cnt / tot_cnt) * 100)
        print("Total Coverage of rare words:", (freq / tot_freq) * 100)

        # Second pass: cap the vocabulary at the number of non-rare words.
        fitted = Tokenizer(num_words=tot_cnt - cnt)
        fitted.fit_on_texts(train_texts)

        # Integer-encode, then zero-pad at the end up to max_len.
        train_padded = pad_sequences(fitted.texts_to_sequences(train), maxlen=max_len, padding='post')
        val_padded = pad_sequences(fitted.texts_to_sequences(val), maxlen=max_len, padding='post')

        # size of vocabulary ( +1 for padding token)
        voc = fitted.num_words + 1
        return fitted, voc, train_padded, val_padded

    def tokenizer(self, train, val, max_len=None):
        """Tokenize and pad a train/validation pair of text arrays.

        Args:
            train: training texts; the tokenizer is fitted on these only.
            val: validation texts, encoded with the same tokenizer.
            max_len: padded sequence length. Defaults to ``self.max_text_len``,
                which is only set once ``run_process`` has completed.

        Returns:
            ``(vocab_size, train_padded, val_padded)``.
        """
        if max_len is None:
            max_len = self.max_text_len
        _, voc, train_padded, val_padded = self._fit_and_encode(train, val, max_len)
        return voc, train_padded, val_padded

    def cleanedData(self):
        """Pick length caps, filter rows to them, split and tokenize.

        Stores the filtered frame, the padded splits, vocabulary sizes and
        fitted tokenizers on the instance.

        Returns:
            ``(text_word_count, summary_word_count, max_summary_len, max_text_len)``.
        """
        # Word counts come from this instance's own frame, not a notebook global.
        text_word_count = [len(t.split()) for t in self.data['cleaned_text']]
        summary_word_count = [len(s.split()) for s in self.data['cleaned_summary']]

        max_summary_len = self.majority_length_data(summary_word_count, self.cleaned_summary)
        max_text_len = self.majority_length_data(text_word_count, self.cleaned_text)
        print(max_summary_len)
        print(max_text_len)

        # Keep only the pairs where both sides fit within their cap.
        short_text = []
        short_summary = []
        for text, summary in zip(self.cleaned_text, self.cleaned_summary):
            if len(summary.split()) <= max_summary_len and len(text.split()) <= max_text_len:
                short_text.append(text)
                short_summary.append(summary)

        df = pd.DataFrame({'text': short_text, 'summary': short_summary})
        df['summary'] = df['summary'].apply(lambda x: 'sostok ' + x + ' eostok')
        x_tr, x_val, y_tr, y_val = train_test_split(
            np.array(df['text']), np.array(df['summary']),
            test_size=0.1, random_state=0, shuffle=True)

        # Reviews are padded to the text cap, summaries to the summary cap.
        # NOTE(review): the two marker tokens are added after the length
        # filter, so the longest summaries exceed max_summary_len and will be
        # truncated by pad_sequences - confirm this is intended.
        self.x_tokenizer, self.x_voc, self.x_tr, self.x_val = self._fit_and_encode(
            x_tr, x_val, max_text_len)
        self.y_tokenizer, self.y_voc, self.y_tr, self.y_val = self._fit_and_encode(
            y_tr, y_val, max_summary_len)
        self.df = df

        return text_word_count, summary_word_count, max_summary_len, max_text_len

In [179]:
# Build the pipeline on the first 100,000 CSV rows; the constructor runs
# loading, cleaning and the length analysis immediately.
text_class = Text_Summarization(100000)

  newString = BeautifulSoup(newString, "lxml").text
  newString = BeautifulSoup(newString, "lxml").text


{0: 291, 1: 15077, 2: 28406, 3: 19570, 4: 13223, 5: 6264, 6: 3170, 7: 1404, 8: 531, 9: 232, 10: 129, 11: 63, 12: 37, 13: 15, 14: 3, 15: 8, 16: 2}
84004
{2: 3, 4: 16, 5: 96, 6: 242, 7: 499, 8: 983, 9: 1554, 10: 2116, 11: 2421, 12: 2709, 13: 2832, 14: 2744, 15: 2695, 16: 2594, 17: 2532, 18: 2390, 19: 2247, 20: 2210, 21: 2075, 22: 2043, 23: 1920, 24: 1906, 25: 1757, 26: 1793, 27: 1653, 28: 1634, 29: 1522, 30: 1472, 31: 1383, 32: 1385, 33: 1268, 34: 1228, 35: 1232, 36: 1134, 37: 1043, 38: 1059, 39: 1009, 40: 998, 41: 1012, 42: 968, 43: 868, 44: 804, 45: 843, 46: 777, 47: 767, 48: 709, 49: 691, 50: 679, 51: 613, 52: 572, 53: 566, 54: 574, 55: 575, 56: 482, 57: 536, 58: 506, 59: 457, 60: 477, 61: 436, 62: 438, 63: 422, 64: 395, 65: 404, 66: 392, 67: 358, 68: 362, 69: 317, 70: 295, 71: 304, 72: 310, 73: 303, 74: 278, 75: 238, 76: 251, 77: 237, 78: 232, 79: 225, 80: 193, 81: 206, 82: 210, 83: 197, 84: 210, 85: 204, 86: 212, 87: 173, 88: 159, 89: 179, 90: 132, 91: 144, 92: 165, 93: 139, 94: 136

In [157]:
# Preview the loaded frame, including the added cleaned_text / cleaned_summary columns.
text_class.data.head()

Unnamed: 0,Id,Summary,Text,cleaned_text,cleaned_summary
0,1,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,good quality dog food
1,2,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...,advertised
2,3,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,delight says
3,4,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,cough medicine
4,5,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,great taffy


In [158]:
# text_word_count = []
# summary_word_count = []
# max_text_len = 0
# max_summary_len = 0

# # populate the lists with sentence lengths
# for i in text_class.data['cleaned_text']:
#       text_word_count.append(len(i.split()))
        

# for i in text_class.data['cleaned_summary']:
#       summary_word_count.append(len(i.split()))

# length_df = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})
# # max_text_len = max(text_word_count)
# # max_summary_len = max(summary_word_count)

In [159]:
# max_summary_len = text_class.majority_length_data(text_class.summary_word_count,text_class.cleaned_summary)
# max_text_len = text_class.majority_length_data(text_class.text_word_count,text_class.cleaned_text)

In [160]:
# print(max_summary_len)
# print(max_text_len)

In [161]:
# Count the cleaned summaries that fit within the chosen summary length cap.
# The cap is read from the instance: no live cell defines a module-level
# max_summary_len, so the bare name fails on a fresh kernel.
summary_cap = text_class.max_summary_len
cleaned_summaries = text_class.data['cleaned_summary']
cnt = sum(1 for summary in cleaned_summaries if len(summary.split()) <= summary_cap)
print(cnt)
print(cnt / len(cleaned_summaries))

86001
0.9725869380831212


In [131]:
# cleaned_text =np.array(cleaned_text)
# cleaned_summary=np.array(cleaned_summary)

# short_text=[]
# short_summary=[]

# for i in range(len(cleaned_text)):
#     if(len(cleaned_summary[i].split())<=max_summary_len and len(cleaned_text[i].split())<=max_text_len):
#         short_text.append(cleaned_text[i])
#         short_summary.append(cleaned_summary[i])

# df=pd.DataFrame({'text':short_text,'summary':short_summary})
# df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
# x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0,shuffle=True)

In [132]:
# NOTE(review): no live cell assigns a module-level `df` (the cell that built
# it is commented out), so this only works on leftover kernel state - confirm
# and re-point it before a Restart & Run All.
print(df)

                                                    text  \
0      bought several vitality canned dog food produc...   
1      product arrived labeled jumbo salted peanuts p...   
2      confection around centuries light pillowy citr...   
3      looking secret ingredient robitussin believe f...   
4      great taffy great price wide assortment yummy ...   
...                                                  ...   
82068               love noodle little spicy wife perfct   
82069                 love buy another box done last one   
82070  late father law used rating system meals parti...   
82071  favorite brand korean ramen spicy used eating ...   
82072  like noodles although say spicy somewhat under...   

                     summary  
0      good quality dog food  
1                 advertised  
2               delight says  
3             cough medicine  
4                great taffy  
...                      ...  
82068             good stuff  
82069                  yummy  


In [133]:
# df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
# # from sklearn.model_selection import train_test_split
# # x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']),np.array(df['summary']),test_size=0.1,random_state=0,shuffle=True)

In [134]:
# from keras.preprocessing.text import Tokenizer

# #prepare a tokenizer for reviews on training data
# x_tokenizer = Tokenizer()
# x_tokenizer.fit_on_texts(list(x_tr))

In [170]:
# thresh=6

# cnt=0
# tot_cnt=0
# freq=0
# tot_freq=0

# for key,value in x_tokenizer.word_counts.items():
#     tot_cnt=tot_cnt+1
#     tot_freq=tot_freq+value
#     if(value<thresh):
#         cnt=cnt+1
#         freq=freq+value

# print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
# print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 71.36603637348256
Total Coverage of rare words: 2.2052559981298776


In [136]:
# #prepare a tokenizer for reviews on training data
# x_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
# x_tokenizer.fit_on_texts(list(x_tr))

# #convert text sequences into integer sequences
# x_tr_seq    =   x_tokenizer.texts_to_sequences(x_tr)
# x_val_seq   =   x_tokenizer.texts_to_sequences(x_val)


# #padding zero upto maximum length
# x_tr    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
# x_val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# #size of vocabulary ( +1 for padding token)
# x_voc   =  x_tokenizer.num_words + 1

In [138]:
# #prepare a tokenizer for reviews on training data
# y_tokenizer = Tokenizer()
# y_tokenizer.fit_on_texts(list(y_tr))


# thresh=6

# cnt=0
# tot_cnt=0
# freq=0
# tot_freq=0

# for key,value in y_tokenizer.word_counts.items():
#     tot_cnt=tot_cnt+1
#     tot_freq=tot_freq+value
#     if(value<thresh):
#         cnt=cnt+1
#         freq=freq+value

# print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
# print("Total Coverage of rare words:",(freq/tot_freq)*100)

# #prepare a tokenizer for reviews on training data
# y_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
# y_tokenizer.fit_on_texts(list(y_tr))

# #convert text sequences into integer sequences
# y_tr_seq    =   y_tokenizer.texts_to_sequences(y_tr)
# y_val_seq   =   y_tokenizer.texts_to_sequences(y_val)

# #padding zero upto maximum length
# y_tr    =   pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
# y_val   =   pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# #size of vocabulary
# y_voc  =   y_tokenizer.num_words +1

% of rare words in vocabulary: 76.49653434152489
Total Coverage of rare words: 4.721348808175155


In [139]:
# Compares how often the 'sostok' marker was counted with the number of training summaries.
# NOTE(review): `y_tokenizer` and `y_tr` are not assigned at module level by
# any live cell (their cell is commented out) - this relies on leftover kernel state.
y_tokenizer.word_counts['sostok'],len(y_tr)

(73865, 73865)