# Analyze Product Sentiment

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import warnings
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
warnings.filterwarnings('ignore')

# Read product review data

In [2]:
# Load the review data; the relative path means the CSV is looked up in the
# notebook's working directory.
products = pd.read_csv('amazon_baby.csv')

# Explore data

In [3]:
# Render the loaded frame (the output elides the middle rows).
products

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5


# Examine the reviews for the most-reviewed product

In [4]:
# Keep only the rows whose product name is the Sophie giraffe teether.
is_giraffe = products['name'] == 'Vulli Sophie the Giraffe Teether'
giraffe_reviews = products[is_giraffe]

In [5]:
# Render the giraffe-only subset.
giraffe_reviews

Unnamed: 0,name,review,rating
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5
34314,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5
34315,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1
34316,Vulli Sophie the Giraffe Teether,All the moms in my moms\' group got Sophie for...,5
34317,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5
...,...,...,...
159649,Vulli Sophie the Giraffe Teether,My baby loves her Sophie Chew Toy. She can che...,5
159650,Vulli Sophie the Giraffe Teether,Sophie the Giraffe was a big hit at the baby s...,5
159651,Vulli Sophie the Giraffe Teether,quick shipping and perfect product. I would pu...,5
159652,Vulli Sophie the Giraffe Teether,My baby who is currently teething love his Sop...,5


In [6]:
# Number of reviews for this product.
len(giraffe_reviews)

785

In [7]:
# Summary statistics of the star ratings for this product.
giraffe_reviews['rating'].describe()

count    785.000000
mean       4.294268
std        1.227358
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

# Building a sentiment classifier

## Build word count vectors

In [8]:
# Count whitespace-separated tokens per review (case- and punctuation-sensitive).
# Missing reviews are replaced by an empty string first: str(NaN) is 'nan',
# which would otherwise be counted as a real word in those rows.
products['word_count'] = (
    products['review']
    .fillna('')
    .apply(lambda text: Counter(str(text).split()))
)

In [9]:
# Render the frame again to inspect the new word_count column.
products

Unnamed: 0,name,review,rating,word_count
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,"{'These': 1, 'flannel': 1, 'wipes': 2, 'are': ..."
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 2, 'came': 1, 'early': 1, 'and': 3, 'wa..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'Very': 1, 'soft': 1, 'and': 2, 'comfortable'..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'This': 1, 'is': 4, 'a': 2, 'product': 2, 'we..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'All': 1, 'of': 1, 'my': 1, 'kids': 2, 'have'..."
...,...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,"{'Such': 1, 'a': 1, 'great': 2, 'idea!': 1, 'v..."
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,"{'This': 1, 'product': 2, 'rocks!': 1, 'It': 1..."
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,"{'This': 1, 'item': 1, 'looks': 1, 'great': 2,..."
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,"{'I': 9, 'am': 2, 'extremely': 1, 'happy': 1, ..."


# Define what is positive and negative sentiment

In [10]:
# Summary statistics of the star ratings across all products.
products['rating'].describe()

count    183531.000000
mean          4.120448
std           1.285017
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

In [11]:
#ignore all 3*  reviews
# .copy() makes the filtered frame independent of the original, so the columns
# assigned to `products` in later cells are written to a real frame rather than
# to a slice (the resulting SettingWithCopyWarning is silenced by the global
# warnings filter, which would hide the problem).
products = products[products['rating']!= 3].copy()

In [12]:
# Label each review: +1 when the rating is at least 4 stars, -1 otherwise.
is_positive = products['rating'] >= 4
products['sentiment'] = np.where(is_positive, +1, -1)

In [13]:
# Render the frame again to inspect the new sentiment column.
products

Unnamed: 0,name,review,rating,word_count,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 2, 'came': 1, 'early': 1, 'and': 3, 'wa...",1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'Very': 1, 'soft': 1, 'and': 2, 'comfortable'...",1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'This': 1, 'is': 4, 'a': 2, 'product': 2, 'we...",1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'All': 1, 'of': 1, 'my': 1, 'kids': 2, 'have'...",1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"{'When': 1, 'the': 5, 'Binky': 3, 'Fairy': 3, ...",1
...,...,...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,"{'Such': 1, 'a': 1, 'great': 2, 'idea!': 1, 'v...",1
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,"{'This': 1, 'product': 2, 'rocks!': 1, 'It': 1...",1
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,"{'This': 1, 'item': 1, 'looks': 1, 'great': 2,...",1
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,"{'I': 9, 'am': 2, 'extremely': 1, 'happy': 1, ...",1


In [14]:
# Summary statistics of the +1/-1 labels; the mean shows the class balance.
products['sentiment'].describe()

count    166752.000000
mean          0.682247
std           0.731124
min          -1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: sentiment, dtype: float64

# Train our sentiment classifier

    training_set = products.sample(frac=0.8, random_state=0)
    test_set = products.drop(training_set.index)

In [15]:
# Turn the per-review word-count dicts into a feature matrix with one column
# per distinct token.
# NOTE(review): the vectorizer is fitted on every row of `products`, including
# rows that a later cell samples into `test_set`.
vectorizer = DictVectorizer()
products_matrix = vectorizer.fit_transform(products['word_count'])

In [None]:
# Fit a logistic-regression classifier on the word-count features of all rows.
# NOTE(review): no rows are held out here, so any score computed on
# products_matrix is an in-sample score. Solver convergence is not checked and
# warnings are globally suppressed - TODO confirm the fit converges.
sentiment_model = LogisticRegression().fit(products_matrix, products['sentiment'])

# Apply the sentiment classifier to better understand the Giraffe reviews

In [None]:
# predict_proba returns one column per class, ordered like model.classes_.
sentiment_model_preds_probs = sentiment_model.predict_proba(products_matrix)

# "Predicted sentiment" is the probability of the positive class (+1).
# Storing max(p[0], p[1]) instead would rank confident negative reviews next to
# confident positive ones, and skipping rows where both probabilities are equal
# would make the list shorter than the frame.
positive_col = list(sentiment_model.classes_).index(1)
predicted_sentiment = sentiment_model_preds_probs[:, positive_col]

products['predicted_sentiment'] = predicted_sentiment

In [None]:
# Re-select the giraffe rows so the subset includes the columns added to
# `products` since the first selection.
giraffe_reviews = products[products['name']== 'Vulli Sophie the Giraffe Teether']

In [None]:
# Render the refreshed giraffe subset.
giraffe_reviews

# Sort the Giraffe reviews according to predicted sentiment

In [None]:
# Order the giraffe reviews from highest to lowest predicted_sentiment.
giraffe_reviews = giraffe_reviews.sort_values('predicted_sentiment', ascending=False)

In [None]:
# Render the sorted giraffe reviews.
giraffe_reviews

In [None]:
# Last rows of the sorted frame, i.e. the lowest predicted_sentiment values.
giraffe_reviews.tail()

## Show the Most Positive Reviews

In [None]:
# Full text of the first row of the sorted frame.
giraffe_reviews['review'].iloc[0]

In [None]:
# Full text of the second row of the sorted frame.
giraffe_reviews['review'].iloc[1]

## Show the Most Negative Reviews

In [None]:
# Full text of the last row of the sorted frame.
giraffe_reviews['review'].iloc[-1]

In [None]:
# Full text of the second-to-last row of the sorted frame.
giraffe_reviews['review'].iloc[-2]

# Build New Features with Counts for Each of the Selected Words

In [None]:
# Hand-picked sentiment words; later cells use this list as column names and
# as the feature set of the reduced model.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [None]:
def select_words(words_count):
    """Return the counts of the tracked sentiment words found in one review.

    Parameters
    ----------
    words_count : mapping of str -> int
        Word counts for a single review.

    Returns
    -------
    dict
        One entry per tracked word, in the fixed order below; words absent
        from ``words_count`` map to 0.
    """
    tracked = ('awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible',
               'bad', 'terrible', 'awful', 'wow', 'hate')
    return {
        term: (words_count[term] if term in words_count else 0)
        for term in tracked
    }

In [None]:
# Reduce every review's word counts to the tracked sentiment words only.
all_counts = products['word_count']
products['selected_word_count'] = all_counts.map(select_words)

In [None]:
# Render the reduced per-review counts.
products['selected_word_count']

In [None]:
# Add one integer column per selected word holding that word's count in each
# review (0 when the word does not occur).
for word in selected_words:
    products[word] = products['word_count'].apply(lambda counts, term=word: counts.get(term, 0))

# The code below is to answer the following quiz question
# Column-wise totals, in selected_words order; idxmax/idxmin return the first
# word reaching the extreme, matching a first-wins scan over the list.
word_totals = products[selected_words].sum()
max_word, min_word = word_totals.idxmax(), word_totals.idxmin()
max_count, min_count = word_totals[max_word], word_totals[min_word]
print('most used  : ', max_word, '\t\tcount :', max_count)
print('least used : ', min_word,   '\t\tcount :', min_count)

<font color = 'steelblue'><b> Quiz : Using the .sum() method on each of the new columns you created, answer the following questions: Out of the selected_words, which one is most used in the dataset and which one is least used? </b></font>

<font color = 'mediumvioletred'><b> Answer : *{{max_word}}* is the most used word and *{{min_word}}* is the least used word </b></font>

# Create a new sentiment analysis model using only the selected_words

Above, we used word counts for all words as features for our sentiment classifier. Now, you are just going to use the selected_words.

In [None]:
# Reproducible 80/20 split: sample the training rows with a fixed seed, then
# keep every row whose index label was not sampled as the test set.
training_set = products.sample(frac=0.8, random_state=0)
in_training = products.index.isin(training_set.index)
test_set = products.loc[~in_training]

In [None]:
# Train on the per-word count columns only. The frame must be indexed with the
# list of column names (`selected_words`); `select_words` is the helper
# function, and passing a callable to DataFrame[...] does not select columns.
selected_words_model = LogisticRegression().fit(training_set[selected_words], training_set['sentiment'])

In [None]:
print('Selected Words Model Coefficients : ', *selected_words_model.coef_,'\n')
print('Selected Words Model Intercept : ', *selected_words_model.intercept_)

# Pair each selected word with its learned weight. coef_ is 2-D for a binary
# model, hence the flatten.
coef_data = {'Coefficient' : selected_words, 'Value' : np.array(selected_words_model.coef_).flatten()}
coef_df = pd.DataFrame(coef_data)

# The code below is to answer the following quiz question.
# Read the extremes directly from the weights. A running scan seeded with -1/1
# and chained with `elif` skips the minimum check for any value that sets a new
# maximum, and finds nothing when no weight crosses the seed values.
weights = coef_df.set_index('Coefficient')['Value']
most_positive_word = weights.idxmax()
most_negative_word = weights.idxmin()
most_positive = weights[most_positive_word]
most_negative = weights[most_negative_word]

print('most positive word : ', most_positive_word, '    \t\tvalue :', most_positive)
print('most negative word : ', most_negative_word, '\t\tvalue :', most_negative)

# Last expression of the cell, so the notebook renders the table.
coef_df

<font color = 'steelblue'><b> Quiz : Out of the 11 words in selected_words, which one got the most positive weight? Which one got the most negative weight? Do these values make sense for you? </b></font>

<font color = 'mediumvioletred'><b> Answer : *{{most_positive_word}}* is the most positive word and *{{most_negative_word}}* is the most negative word </b></font>

# Comparing the Accuracy of Different Sentiment Analysis Models

In [None]:
# NOTE(review): sentiment_model is scored on products_matrix, the same rows it
# was fitted on, while selected_words_model is scored on the held-out test_set.
# The two numbers are therefore not measured on the same footing.
sentiment_model_preds = sentiment_model.predict(products_matrix)
sentiment_model_acc = accuracy_score(products['sentiment'], sentiment_model_preds)

# Select the feature columns with the `selected_words` list (not the
# `select_words` helper function).
selected_words_model_preds = selected_words_model.predict(test_set[selected_words])
selected_words_model_acc = accuracy_score(test_set['sentiment'], selected_words_model_preds)

print('Sentiment Model Accuracy :', sentiment_model_acc)
print('Selected Words Model Accuracy :', selected_words_model_acc)

# Interpreting the Difference in Performance Between the Models

To understand why the model with all word counts performs better than the one with only the selected_words, we will now examine the reviews for a particular product.

* We will investigate a product named ‘Baby Trend Diaper Champ’. (This is a trash can for soiled baby diapers, which keeps the smell contained.)
* Just like we did for the reviews for the giraffe toy in the Jupyter Notebook in the lecture video, before we start our analysis you should select all reviews where the product name is ‘Baby Trend Diaper Champ’. Let’s call this table diaper_champ_reviews.
* Again, just as in the video, use the sentiment_model to predict the sentiment of each review in diaper_champ_reviews and sort the results according to their ‘predicted_sentiment’.
* What is the ‘predicted_sentiment’ for the most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model from the Jupyter Notebook from lecture? Save this result to answer the quiz at the end.
* Now use the selected_words_model you learned using just the selected_words to predict the sentiment of the most positive review you found above. Hint: if you sorted the diaper_champ_reviews in descending order (from most positive to most negative), the most positive review is the first row (`diaper_champ_reviews.iloc[0:1]`).

In [None]:
# Keep only the rows whose product name is the Diaper Champ.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[is_diaper_champ]

In [None]:
# Render the Diaper Champ subset.
diaper_champ_reviews

In [None]:
# Order the reviews from highest to lowest star rating.
# NOTE(review): the following cell re-sorts the same frame by another column,
# so this ordering is not relied on afterwards.
diaper_champ_reviews = diaper_champ_reviews.sort_values('rating', ascending = False)

In [None]:
# Rank by the model's predicted sentiment, most positive first, as the exercise
# text asks. Sorting on the +1/-1 label only separates the two classes and
# leaves the order inside each class arbitrary, so .iloc[0] would not be the
# most positive review.
diaper_champ_reviews = diaper_champ_reviews.sort_values('predicted_sentiment', ascending = False)

In [None]:
# Render the sorted Diaper Champ reviews.
diaper_champ_reviews

In [None]:
# Score the first ten rows with the selected-words model. The feature columns
# are chosen with the `selected_words` list (not the `select_words` function).
selected_words_model_diaper_preds_probs = selected_words_model.predict_proba(diaper_champ_reviews[selected_words].iloc[0:10])

# Print the probability of the positive class (+1) for each row. Printing the
# larger of the two class probabilities would hide which class was predicted,
# and rows with equal probabilities would print nothing.
positive_col = list(selected_words_model.classes_).index(1)
for p in selected_words_model_diaper_preds_probs :
    print(p[positive_col])

In [None]:
# Encode the diaper reviews with the vocabulary the vectorizer already learned
# from the full review set. transform() keeps that vocabulary; fit_transform()
# would overwrite it, leaving `vectorizer` inconsistent with products_matrix
# and sentiment_model.
diapers_matrix = vectorizer.transform(diaper_champ_reviews['word_count'])

# The exercise asks for the all-words sentiment_model to score these reviews.
# Fitting a fresh classifier on this one product and scoring the same rows
# would not measure that model. The name is kept for the cells below.
sentiment_model_diaper = sentiment_model

In [None]:
# Class probabilities for every diaper review; columns follow model.classes_.
sentiment_model_diapers_preds_probs = sentiment_model_diaper.predict_proba(diapers_matrix)

# Print the probability of the positive class (+1) for the first ten rows,
# rather than the larger of the two class probabilities.
positive_col = list(sentiment_model_diaper.classes_).index(1)
for p in sentiment_model_diapers_preds_probs[0:10] :
    print(p[positive_col])

In [None]:
# Accuracy of both models on the Diaper Champ reviews.
sentiment_model_diapers_preds = sentiment_model_diaper.predict(diapers_matrix)
sentiment_model_diapers_acc = accuracy_score(diaper_champ_reviews['sentiment'], sentiment_model_diapers_preds)

# Score the selected-words model on the same diaper rows, selecting the feature
# columns with the `selected_words` list, so the two figures are comparable.
selected_words_model_diapers_preds = selected_words_model.predict(diaper_champ_reviews[selected_words])
selected_words_model_diapers_acc = accuracy_score(diaper_champ_reviews['sentiment'], selected_words_model_diapers_preds)

print('Sentiment Model Diapers Accuracy :', sentiment_model_diapers_acc)
# Report the diaper-specific accuracy computed just above.
print('Selected Words Model Diapers Accuracy :', selected_words_model_diapers_acc)

In [None]:
# Predicted sentiment is the probability of the positive class (+1), one value
# per row. Taking max(p[0], p[1]) would store model confidence instead, and
# skipping rows with equal probabilities would make the result shorter than
# the frame.
positive_col = list(sentiment_model_diaper.classes_).index(1)
diaper_predicted_sentiment = sentiment_model_diapers_preds_probs[:, positive_col]

diaper_champ_reviews['predicted_sentiment'] = diaper_predicted_sentiment

In [None]:
# Full text of the first row of the sorted Diaper Champ frame.
diaper_champ_reviews['review'].iloc[0]

In [None]:
# Word counts of that same first row.
diaper_champ_reviews['word_count'].iloc[0]

In [None]:
# Review text next to its label and predicted sentiment.
diaper_champ_reviews[['review','sentiment','predicted_sentiment']]