# Predicting sentiment from product reviews


In [133]:
import sklearn as s
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read some product review data

Loading reviews for a set of baby products. 

In [134]:
products = pd.read_csv('data/amazon_baby.csv')

# Let's explore this data 

Data includes the product name, the review text and the rating of the review. 

In [135]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [136]:
products.isnull().any() #This find if any null value is in dataset and we can see name & review have null values.

name       True
review     True
rating    False
dtype: bool

In [5]:
len(products)

183531

###### Since name and review contains null value, drop those rows.

In [6]:
products=products[pd.notnull(products['name'])]

In [7]:
products=products[pd.notnull(products['review'])]

In [8]:
len(products)

182384

In [9]:
183531-182384 #No. of rows contains null value or doesn't have any value.

1147

In [10]:
# examine the class distribution
products['name'].describe()


count                               182384
unique                               32315
top       Vulli Sophie the Giraffe Teether
freq                                   779
Name: name, dtype: object

In [11]:
products['rating'].describe()

count    182384.000000
mean          4.119188
std           1.285358
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

# Examining the reviews for most-sold product:  'Vulli Sophie the Giraffe Teether'

In [12]:
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [13]:
len(giraffe_reviews)

779

In [14]:
giraffe_reviews['rating'].describe()

count    779.000000
mean       4.291399
std        1.230949
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

# Build a sentiment classifier

In [15]:
products['rating'].head()

0    3
1    5
2    5
3    5
4    5
Name: rating, dtype: int64

## Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [16]:
# ignore all 3* reviews
products = products[products['rating'] != 3]

In [25]:
# positive sentiment = 4* or 5* reviews
products['sentiment']= (products['rating']>=4).astype(int)
        

In [26]:
products.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1


## Train-Test split

In [29]:
X=products.review
y=products.sentiment

In [30]:
print(X.shape," ",y.shape)

(165679,)   (165679,)


In [31]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(124259,)
(41420,)
(124259,)
(41420,)


## Vectorizing our dataset

In [32]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [42]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [43]:
# examine the document-term matrix
X_train_dtm

<124259x55555 sparse matrix of type '<class 'numpy.int64'>'
	with 6608004 stored elements in Compressed Sparse Row format>

In [44]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<41420x55555 sparse matrix of type '<class 'numpy.int64'>'
	with 2201954 stored elements in Compressed Sparse Row format>

## Let's train the sentiment classifier

In [45]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [46]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 195 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Evaluate the sentiment model

In [47]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [48]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.90243843553838721

In [49]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[ 4528,  2024],
       [ 2017, 32851]])

In [51]:
# print review for the false positives (predicted positive but its negative)
X_test[(y_pred_class>y_test)].head()

2751      I do not like these bags.  The twist tie closu...
160997    I got this for the 3 year old that I babysit. ...
21764     We purchased a Graco Travel Lite for a few upc...
101819    The price was great but the white part had bla...
114681    I bought this high chair based on the great re...
Name: review, dtype: object

In [54]:
X_test[21764] #A false positive review

"We purchased a Graco Travel Lite for a few upcoming overnight trips where space would be tight. At the same time, I bought two of these crib sheets.  While I was very happy with the pack-n-play itself, the sheets were not satisfactory.  They were thin and stiff, and have very little elastic.  They do fit the mattress well, but they are not serving the purpose of providing a cool and comfortable covering to make my baby\\'s sleep better.  And since I bought two, for the same price, I could have purchased a set of two 600-thread count queen-sized pillowcases, that would have done a better job.  I recommend doing that instead of buying these."

In [53]:
# print review for the false negative (predicted negative but its positive)
X_test[y_pred_class<y_test].head()

62065     Would you like to smell poop from your fresh b...
80770     Great product! Had no problems with set up & d...
135056    This is our first video baby monitor. We had a...
138702    I use it as a burglar alarm for my condo a hun...
21200     i dont understand why all these ppl are writin...
Name: review, dtype: object

In [56]:
X_test[21200] #A false negative review

'i dont understand why all these ppl are writing neg reviews. we live in a house with a finished basement and havent had any issues with this monitor. my son is going on 3 yrs old. im buying another set for my twins bc i loved how sensitive this monitor was, i could hear him breathing! also i have walked across my street to the neighbors house while my son was napping and had NO problems even in her house! i have the old white monitor and love the upgrades to the grey one. it CLEARLY says in the instructions to not leave it plugged into the AC adapter for longer then 10 hrs bc it will burn out the rechargable battery. this monitor is meant to be portable, u charge it, then use. so if your monitor has gone "BAD" chances are its by your own doing and not a faulty monitor.'

## Lets use another classifier( Logistic regression) and see how it work

In [57]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [58]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

Wall time: 2min 33s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [59]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [60]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.93312409464027035

#### Performed better as compare to prevoius model where accuracy was 0.9024

In [65]:
metrics.confusion_matrix(y_test,y_pred_class)

array([[ 4845,  1707],
       [ 1063, 33805]])

# Applying the learned model to understand sentiment for Giraffe

In [84]:
products['name'].describe()

count                               165679
unique                               30629
top       Vulli Sophie the Giraffe Teether
freq                                   717
Name: name, dtype: object

In [99]:
giraffe_reviews=products[products['name']=='Vulli Sophie the Giraffe Teether']

In [100]:
giraffe_reviews.head()

Unnamed: 0,name,review,rating,sentiment
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5,1
34314,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5,1
34315,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1,0
34316,Vulli Sophie the Giraffe Teether,All the moms in my moms\' group got Sophie for...,5,1
34317,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5,1


In [108]:
giraffe_reviews=giraffe_reviews.sort_values(['sentiment'], ascending=False)

In [109]:
giraffe_reviews.head()

Unnamed: 0,name,review,rating,sentiment
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5,1
34809,Vulli Sophie the Giraffe Teether,This was recommended by a friend and I have gi...,5,1
34811,Vulli Sophie the Giraffe Teether,My baby loves this teether. She likes to wave ...,5,1
34812,Vulli Sophie the Giraffe Teether,We bought this when my son was 3 months old an...,5,1
34814,Vulli Sophie the Giraffe Teether,We love Sophie at our house... she is a great ...,5,1


## Most positive reviews for the giraffe

In [127]:
giraffe_reviews['review'][34313]

'He likes chewing on all the parts especially the head and the ears! It has helped when he has been cranky because of teething. Great purchase!'

In [128]:
giraffe_reviews['review'][34809]



## Show most negative reviews for giraffe

In [130]:
giraffe_reviews['review'].tail()

34471    My baby loved Sophie but after reading on Vull...
34917    Although my son love sucking face with Sophie,...
34411    I was so looking forward to getting this for m...
34994    When I received this the paint was peeling off...
34765    my grand daughter had this teether, she loved ...
Name: review, dtype: object

In [131]:
giraffe_reviews['review'][34765]

'my grand daughter had this teether, she loved it until I noticed the paint coming off. I took it away immediately. So it sits on the shelf. What a waste of money. I never saw a teething toy that had the paint come off in my life. I hope it was not toxic. Maybe it would make a good dog toy.'