# Amazon Fine Food Reviews

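This notebook preprocesses the review text of the Amazon Fine Food Reviews dataset: it merges each review's summary and body into one text field, cleans and lemmatizes it, and saves the result to `CleanedText.csv`.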

## Importing necessary libraries

In [1]:
import pandas as pd
# NLP libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from string import punctuation

from bs4 import BeautifulSoup

In [2]:
# Read the reviews file into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="ReviewsNew.csv")
df.head(n=5)

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,ReviewSummary,ReviewText,Helpfulness,Review_type,%Helpful,word_count
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1.0,1.0,5.0,1303862000.0,Good Quality Dog Food,I have bought several of the Vitality canned ...,1.0,1,more than 75%,50
1,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0.0,0.0,5.0,1351210000.0,Healthy Dog Food,This is a very healthy dog food. Good for the...,0.0,1,not voted,26
2,B0019CW0HE,A2P6ACFZ8FTNVV,Melissa Benjamin,0.0,1.0,1.0,1331165000.0,Bad,I fed this to my Golden Retriever and he hate...,0.0,0,not voted,39
3,B006F2NYI2,A132DJVI37RB4X,Scottdrum,2.0,5.0,2.0,1332374000.0,"Not hot, not habanero","I have to admit, I was a sucker for the large...",0.4,0,between 25% to 75%,72
4,B000P41A28,A82WIMR4RSVLI,Emrose mom,0.0,1.0,4.0,1337472000.0,The best weve tried so far,We have a 7 week old... He had gas and consti...,0.0,1,not voted,199


In [3]:
# Working frame with only the columns needed for text preprocessing.
# .copy() makes dfnew independent of df, so the columns added and dropped on
# dfnew later are not treated as writes to a slice of df.
dfnew = df[["ReviewSummary","ReviewText","Review_type"]].copy()
dfnew.head()

Unnamed: 0,ReviewSummary,ReviewText,Review_type
0,Good Quality Dog Food,I have bought several of the Vitality canned ...,1
1,Healthy Dog Food,This is a very healthy dog food. Good for the...,1
2,Bad,I fed this to my Golden Retriever and he hate...,0
3,"Not hot, not habanero","I have to admit, I was a sucker for the large...",0
4,The best weve tried so far,We have a 7 week old... He had gas and consti...,1


In [4]:
# Turn off pandas' chained-assignment (SettingWithCopy) check for this session
pd.set_option("mode.chained_assignment", None)

In [5]:
# Merge summary and body into a single 'text' column.
# Both parts are read from dfnew (not df) so the cell depends on one frame only.
# na_rep="" keeps a row usable when just one of the two parts is missing;
# without it str.cat returns NaN for the whole row.
dfnew = dfnew.assign(
    text=dfnew['ReviewSummary'].str.cat(dfnew['ReviewText'], sep=" ", na_rep="")
)
dfnew.head()

Unnamed: 0,ReviewSummary,ReviewText,Review_type,text
0,Good Quality Dog Food,I have bought several of the Vitality canned ...,1,Good Quality Dog Food I have bought several ...
1,Healthy Dog Food,This is a very healthy dog food. Good for the...,1,Healthy Dog Food This is a very healthy dog ...
2,Bad,I fed this to my Golden Retriever and he hate...,0,Bad I fed this to my Golden Retriever and he...
3,"Not hot, not habanero","I have to admit, I was a sucker for the large...",0,"Not hot, not habanero I have to admit, I was..."
4,The best weve tried so far,We have a 7 week old... He had gas and consti...,1,The best weve tried so far We have a 7 week ...


In [6]:
# The two source columns are now redundant with 'text'.
# Rebinding instead of inplace=True, plus errors="ignore", lets this cell be
# re-run without a KeyError once the columns are already gone.
dfnew = dfnew.drop(columns=['ReviewSummary','ReviewText'], errors="ignore")
dfnew.head()

Unnamed: 0,Review_type,text
0,1,Good Quality Dog Food I have bought several ...
1,1,Healthy Dog Food This is a very healthy dog ...
2,0,Bad I fed this to my Golden Retriever and he...
3,0,"Not hot, not habanero I have to admit, I was..."
4,1,The best weve tried so far We have a 7 week ...


## Text Preprocessing

In [7]:
# Fetch the NLTK resources used below: the stop-word list, the tokenizer models
# for word_tokenize, and the WordNet data for the lemmatizer.
# 'punkt_tab' is the tokenizer resource newer NLTK releases look up; 'punkt' is
# kept for older releases. Downloading both is harmless.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arkur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arkur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arkur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Start the stop-word set from NLTK's English list
stop_words = set()
stop_words.update(stopwords.words("english"))

In [9]:
# Top 20 tokens by frequency across all reviews (whitespace-split, case-sensitive)
common20 = (
    pd.Series(' '.join(dfnew["text"]).split())
    .value_counts()
    .iloc[:20]
)

In [10]:
# Display the 20 most frequent tokens themselves (the index of the value_counts Series)
common20.index

Index(['the', 'I', 'and', 'a', 'to', 'of', 'is', 'it', 'for', 'in', 'this',
       'that', 'my', 'with', 'but', 'have', 'was', 'are', 'not', 'you'],
      dtype='object')

In [11]:
# Extra stop words: the 20 most frequent tokens, minus 'not', which is left out
# on purpose so that negation stays in the text.
add_words = ['the', 'I', 'and', 'a', 'to', 'of', 'is', 'it', 'for', 'in', 'this',
             'that', 'my', 'with', 'but', 'have', 'was', 'are', 'you']

# NLTK's English stop-word list already contains 'not', so omitting it from
# add_words is not enough to keep it: it has to be removed from the set explicitly.
stop_words = stop_words.union(add_words) - {'not'}

In [12]:
# Single WordNetLemmatizer instance, created once and reused by preprocess_text for every token
lemmatizer = WordNetLemmatizer()

In [13]:
# creating function for preprocessing text

# Token lists of every processed text, in call order (filled by preprocess_text)
corpus =[]
def preprocess_text(text):
    """
    Clean one raw review string and return it as a space-joined string of tokens.

    Steps, in order: strip HTML markup, drop URLs, drop every word containing a
    digit, replace runs of non-letters with a space, squeeze runs of a repeated
    character down to two, lower-case and tokenize, drop stop words and
    punctuation tokens, lemmatize each remaining token as a verb.

    Side effect: the token list for this text is appended to the module-level
    ``corpus`` list (one entry per call).

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as an empty string.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; "" when nothing survives.
    """
    # A missing review arrives as NaN (a float); re.sub would raise on it
    if not isinstance(text, str):
        text = ""

    # Strip HTML before URLs: run the other way round, the URL pattern eats
    # through '">' and closing tags and takes the link text with it.
    # separator=" " keeps words on either side of a tag (e.g. <br />) apart.
    text = BeautifulSoup(text, 'lxml').get_text(separator=" ")

    # removing links
    text = re.sub(r"http\S+", "", text)

    # removing words containing numeric digits
    text = re.sub(r"\S*\d\S*", "", text)

    # removing non-alphabetic characters
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    # squeezing any run of the same character down to two ("sooooo" -> "soo")
    text = re.sub(r"(.)\1+", r"\1\1", text)

    # converting to lower case and creating list of tokenized words
    tokens = word_tokenize(text.lower())

    # removing stop words and punctuation tokens
    tokens = [
        word for word in tokens
        if word not in stop_words and word not in punctuation
    ]

    # lemmatization (obtaining verb form of word)
    tokens = [lemmatizer.lemmatize(word, 'v') for word in tokens]

    # recording the token list of this text in the shared corpus
    corpus.append(tokens)

    # Joining tokens with single spaces leaves no outer whitespace to strip
    return " ".join(tokens)

In [14]:
# preprocess_text appends to the global corpus on every call; empty it first so
# re-running this cell does not leave a second copy of every token list in it.
corpus.clear()
dfnew['cleanedtext'] = dfnew['text'].apply(preprocess_text)

In [15]:
# Write the frame, including the cleaned text, to disk without the index column
dfnew.to_csv(path_or_buf="CleanedText.csv", index=False)