In [1]:
import pandas as pd
import numpy as np
import re, string, unicodedata
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [2]:
data = pd.read_csv('dual_wall_tumbler_reviews.csv')

In [3]:
data.head()

Unnamed: 0,ReviewProduct,Material,CaptureDate,ReviewId,ReviewHelpfulYes,ReviewHelpfulNo,ReviewRating,ReviewBy,ReviewDate,ReviewTitle,ReviewText
0,B0084IHVOS,Non-Eastman,24-04-2016 00:00,R11FSR4IZ5LZ5Z,0,0,5,zakia N chatman,05-05-2015 00:00,Five Stars,Love it
1,B0084IHVOS,Non-Eastman,24-04-2016 00:00,R11VQ0MB6BTIKU,0,0,5,EarlzMom,01-03-2015 00:00,love it !,"love this cup, so large. I carry it everywhere..."
2,B0084IHVOS,Non-Eastman,24-04-2016 00:00,R11WZ5U04NOLF3,0,0,5,Jassim,13-02-2015 00:00,Five Stars,Its really big lol
3,B0084IHVOS,Non-Eastman,24-04-2016 00:00,R12FI3W0ZGSIT3,0,0,2,vsnaryster,12-03-2015 00:00,Okay cup. Broken straw,The straw was broken and unusable.
4,B0084IHVOS,Non-Eastman,24-04-2016 00:00,R12MEF3RQEDZFD,0,0,5,Amazon Customer,24-12-2015 00:00,Thank You,Just what we ordered. Perfect.


In [4]:
data.describe()

Unnamed: 0,ReviewHelpfulYes,ReviewHelpfulNo,ReviewRating
count,1970.0,1970.0,1970.0
mean,0.456345,0.0,4.052284
std,3.071416,0.0,1.377025
min,0.0,0.0,1.0
25%,0.0,0.0,3.0
50%,0.0,0.0,5.0
75%,0.0,0.0,5.0
max,93.0,0.0,5.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1970 entries, 0 to 1969
Data columns (total 11 columns):
ReviewProduct       1970 non-null object
Material            1970 non-null object
CaptureDate         1970 non-null object
ReviewId            1970 non-null object
ReviewHelpfulYes    1970 non-null int64
ReviewHelpfulNo     1970 non-null int64
ReviewRating        1970 non-null int64
ReviewBy            1970 non-null object
ReviewDate          1970 non-null object
ReviewTitle         1969 non-null object
ReviewText          1970 non-null object
dtypes: int64(3), object(8)
memory usage: 169.4+ KB


In [6]:
useful_data = data.loc[:,['ReviewProduct','Material','ReviewRating','ReviewText']]

In [7]:
useful_data['ReviewText']

0                                                 Love it
1       love this cup, so large. I carry it everywhere...
2                                      Its really big lol
3                      The straw was broken and unusable.
4                         Just what we ordered.  Perfect.
5       I love this cup and use it every day , no swea...
6       I was super bummed because I was expected the ...
7       This is a great re-usable cup that can hold a ...
8       Keeps the drink colder for longer. However, it...
9                                                    good
10      These are NOT the double insulated cups that s...
11      Transparent and extremely large size of the cu...
12      purchased the cup and 4 straws as a bundle<br ...
13      This is my favorite cup, keeps my water cold m...
14                                                Love it
15          wonderful.<br />Holds an entire canned drink.
16      The cup does sweat a little bit if the drink h...
17            

In [8]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def furtherCleaning(text):
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    return ' '.join(words)

def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = furtherCleaning(text)
    return text

In [9]:
useful_data['ReviewText'] = useful_data['ReviewText'].apply(denoise_text)

  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
useful_data['ReviewText']

0                                                    love
1       love cup large carry everywhere keeps water co...
2                                          really big lol
3                                   straw broken unusable
4                                         ordered perfect
5       love cup use every day sweating handy sturdy w...
6       super bummed expected double insulated cup all...
7       great reusable cup hold lot drink put buy chea...
8       keeps drink colder longer however coffee cup f...
9                                                    good
10      double insulated cups separate nice enough exa...
11      transparent extremely large size cup likedi ch...
12      purchased cup straws bundleitems ordered price...
13         favorite cup keeps water cold day use everyday
14                                                   love
15                     wonderfulholds entire canned drink
16      cup sweat little bit drink lot ice deal breake...
17            

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [12]:
sid = SentimentIntensityAnalyzer()

In [13]:
sentences = useful_data['ReviewText']

In [15]:
sentiment = []
for sentence in sentences:
    ss = sid.polarity_scores(sentence)
    max_value = 0
    max_key = ''
    ss.pop('compound',None)
    for k, v in ss.items():
        if v > max_value:
            max_value = v
            max_key = k
    if max_key == 'neg':
        max_value = "-" + str(max_value)
        max_value = float(max_value)
    if max_key == 'neu':
        max_value = 0
    sentiment.append(max_value)

In [16]:
useful_data['Sentiments'] = pd.Series(sentiment)

In [20]:
useful_data.groupby(['ReviewProduct','Material'])['Sentiments'].mean()

ReviewProduct  Material   
B0084IHVOS     Non-Eastman    0.235902
B008BXWGR8     Eastman        0.211907
B009ZHEG68     Non-Eastman    0.221646
B00FALRU6Q     Non-Eastman    0.302139
B00IR77KVK     Eastman        0.260567
B00J5ASJS6     Non-Eastman    0.192552
B00YOK11Z0     Eastman        0.120151
Name: Sentiments, dtype: float64

In [21]:
useful_data.groupby(['Material'])['Sentiments'].mean()

Material
Eastman        0.198266
Non-Eastman    0.225556
Name: Sentiments, dtype: float64