In [25]:
### Import packages
import numpy as np
import pandas as pd 
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer 
from keras.utils import pad_sequences
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import contractions
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

### need a local attention_local.py file for this. 
from attention_local import AttentionLayer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gitan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
stop_words = set(stopwords.words('english')) 

def text_cleaner(text, num):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contractions.fix(t) for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    newString = re.sub('[m]{2,}', 'mm', newString)
    if(num==0):
        tokens = [w for w in newString.split() if not w in stop_words]
    else:
        tokens=newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:               #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()


def data_import_preprocess(num_rows):
    #Loading the data from the Amazon Review csv
    data = pd.read_csv(r".\amazon_food_reviews\Reviews.csv", nrows = num_rows)
    
    
    data.drop(columns = ['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'], inplace=True) #drop useless columns
    data.drop_duplicates(subset = ['Text'], inplace=True) #dropping duplicates
    data.dropna(axis=0, inplace = True) #dropping na
    
    cleaned_text = []
    for t in data['Text']:
        cleaned_text.append(text_cleaner(t,0)) 
    cleaned_summary = []
    for t in data['Summary']:
        cleaned_summary.append(text_cleaner(t,1))
    
    data['cleaned_text'], data['cleaned_summary'] = cleaned_text, cleaned_summary
    
    return data



In [32]:
data = data_import_preprocess(10000)
data.head()

  newString = BeautifulSoup(newString, "lxml").text


Unnamed: 0,Id,Summary,Text,cleaned_text,cleaned_summary
0,1,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,good quality dog food
1,2,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...,not as advertised
2,3,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,delight says it all
3,4,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,cough medicine
4,5,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,great taffy


In [48]:
class Text_Summarization():
    def __init__(self, num_rows):
        self.num_rows = num_rows
        self.stopwords = set(stopwords.words('english'))
        self.run_process()
        
        
    def run_process(self):
        self.data = self.data_import_preprocess()
        
    def text_cleaner(self, text):
        newString = text.lower()
        newString = BeautifulSoup(newString, "lxml").text
        newString = re.sub(r'\([^)]*\)', '', newString)
        newString = re.sub('"','', newString)
        newString = ' '.join([contractions.fix(t) for t in newString.split(" ")])    
        newString = re.sub(r"'s\b","",newString)
        newString = re.sub("[^a-zA-Z]", " ", newString) 
        newString = re.sub('[m]{2,}', 'mm', newString)

        tokens = [w for w in newString.split() if not w in self.stopwords]

        long_words=[]
        for i in tokens:
            if len(i)>1:               #removing short word
                long_words.append(i)   
        return (" ".join(long_words)).strip()
    
    
    
    def data_import_preprocess(self):
    #Loading the data from the Amazon Review csv
        data = pd.read_csv(r".\amazon_food_reviews\Reviews.csv", nrows = self.num_rows)


        data.drop(columns = ['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'], inplace=True) #drop useless columns
        data.drop_duplicates(subset = ['Text'], inplace=True) #dropping duplicates
        data.dropna(axis=0, inplace = True) #dropping na

        cleaned_text = []
        for t in data['Text']:
            cleaned_text.append(self.text_cleaner(t)) 
        cleaned_summary = []
        for t in data['Summary']:
            cleaned_summary.append(self.text_cleaner(t))

        data['cleaned_text'], data['cleaned_summary'] = cleaned_text, cleaned_summary

        return data

In [49]:
text_class = Text_Summarization(10000)

  newString = BeautifulSoup(newString, "lxml").text


In [50]:
text_class.data.head()

Unnamed: 0,Id,Summary,Text,cleaned_text,cleaned_summary
0,1,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,good quality dog food
1,2,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...,advertised
2,3,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,delight says
3,4,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,cough medicine
4,5,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,great taffy
