# Predicting sentiment from product reviews


The goal of this first project is to explore logistic regression and feature engineering with existing Turi Create functions.

In this project we will use product review data from Amazon.com to predict whether the sentiments about a product (from its reviews) are positive or negative.

* Use SFrames to do some feature engineering.
* Train a logistic regression model to predict the sentiment of product reviews.
* Inspect the weights (coefficients) of a trained logistic regression model.
* Make a prediction (both class and probability) of sentiment for a new product review.
* Given the logistic regression weights, predictors and ground truth labels, write a function to compute the **accuracy** of the model.
* Inspect the coefficients of the logistic regression model and interpret their meanings.
* Compare multiple logistic regression models.
    
## Fire up Turi Create

In [1]:
from __future__ import division
import turicreate as tc
import string
import math

# Data

In [2]:
# The review data is read from an SFrame located relative to the working directory.
reviews_path = "amazon_baby.sframe"
products = tc.SFrame(reviews_path)

# Prep Data

In [3]:
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    All other characters, including whitespace, are kept unchanged.
    """
    # The three-argument form of maketrans maps each character of its third
    # argument to None, which makes translate() drop it.
    return text.translate(str.maketrans('', '', string.punctuation))

# Word counts are computed on the punctuation-free review text and stored
# as a new 'word_count' column.
products['word_count'] = tc.text_analytics.count_words(
    products['review'].apply(remove_punctuation)
)

In [4]:
def _rating_to_sentiment(rating):
    """Map a rating above 3 to +1 and any other rating to -1."""
    if rating > 3:
        return +1
    return -1

# Reviews rated exactly 3 are dropped before labelling the remaining ones.
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(_rating_to_sentiment)

In [5]:
# A fixed seed keeps the train/test partition the same on every run.
train_fraction = 0.8
split_seed = 1
train_data, test_data = products.random_split(train_fraction, seed=split_seed)

# Train and test the sentiment model (all words)

In [6]:
# Logistic classifier on the full 'word_count' feature; no validation set is
# passed and progress output is turned off.
sentiment_model = tc.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=None,
    verbose=False,
)

In [7]:
# NOTE: the comparison is ">= 0", so coefficients that are exactly zero are
# included in this count as well.
weights = sentiment_model.coefficients
non_negative_mask = weights['value'] >= 0
num_positive_weights = len(weights[non_negative_mask])
print(num_positive_weights)

91073


In [8]:
# Reproduce the class predictions by hand from the raw margins for three test
# rows: a margin above 0 gives class +1, anything else (including 0) gives -1.
sample_test_data = test_data[10:13]
scores = sentiment_model.predict(sample_test_data, output_type='margin')
manual_predictions = [1 if score > 0 else -1 for score in scores]
# Shown next to the model's own class predictions for comparison.
print(manual_predictions, sentiment_model.predict(sample_test_data))

[1, -1, -1] [1, -1, -1]


In [9]:
# Apply the logistic (sigmoid) function 1 / (1 + e^(-margin)) to every margin.
prob = [1 / (1 + math.exp(-margin)) for margin in scores]
prob

[0.9917471313286872, 0.047390547487155835, 0.00027775277121725486]

In [10]:
# Attach the predicted probability to every test row, then print the rows
# selected by topk on that column.
top_k = 20
test_prob = sentiment_model.predict(test_data, output_type='probability')
test_data['probability'] = test_prob
test_data_sorted = test_data.topk('probability', top_k)
test_data_sorted.print_rows(top_k)

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
| Fisher-Price Cradle 'N Swi... | My husband and I cannot st... |  5.0   |
| The Original CJ's BuTTer (... | I'm going to try to review... |  4.0   |
| Baby Jogger City Mini GT D... | We are well pleased with t... |  4.0   |
| Diono RadianRXT Convertibl... | Like so many others before... |  5.0   |
| Diono RadianRXT Convertibl... | I bought this seat for my ... |  5.0   |
| Graco Pack 'n Play Element... | My husband and I assembled... |  4.0   |
| Maxi-Cosi Pria 70 with Tin... | We love this car seat!! It... |  5.0   |
| Britax 2012 B-Agile Stroll... | [I got this stroller for m... |  4.0   |
| Quinny 2012 Buzz Stroller,... | Choice - Quinny Buzz 2011 ... |  4.0   |
| Roan Rocco Classic Pram St... | Great Pram Rocco!!!!!!I bo... |  5.0   |
| Britax Decathlon Conver

In [11]:
def get_classification_accuracy(model, data, true_labels):
    """Return the fraction of rows in ``data`` that ``model`` classifies correctly.

    Parameters
    ----------
    model : object with a ``predict(data)`` method
        Its predictions are compared element by element with ``true_labels``.
    data : any
        Passed to ``model.predict`` unchanged.
    true_labels : sized iterable
        Ground-truth labels, in the same order as the rows of ``data``.

    Returns
    -------
    float
        Number of predictions equal to their label, divided by the number of
        predictions.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of predictions.
    ZeroDivisionError
        If the model returns no predictions.
    """
    predictions = model.predict(data)
    num_examples = len(predictions)

    # A length mismatch means the labels do not belong to this data; fail
    # loudly instead of scoring a misaligned or truncated comparison.
    if len(true_labels) != num_examples:
        raise ValueError(
            "true_labels has %d entries but the model produced %d predictions"
            % (len(true_labels), num_examples)
        )

    # Walk both sequences in lockstep rather than indexing them position by
    # position inside a Python loop.
    num_correct = sum(
        1 for predicted, actual in zip(predictions, true_labels) if predicted == actual
    )

    return num_correct / num_examples

In [12]:
# Accuracy of the all-words model on the test split.
acc1 = get_classification_accuracy(
    model=sentiment_model,
    data=test_data,
    true_labels=test_data['sentiment'],
)
acc1

0.9221862251019919

# Train logistic model with significant words

In [13]:
# Hand-picked vocabulary used by the reduced model.
significant_words = [
    'love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
    'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

# Restrict each row's word-count dictionary to the significant words
# (exclude=False keeps the listed keys instead of removing them).
for dataset in (train_data, test_data):
    dataset['word_count_subset'] = dataset['word_count'].dict_trim_by_keys(
        significant_words, exclude=False
    )

simple_model = tc.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count_subset'],
    validation_set=None,
    verbose=False,
)

acc2 = get_classification_accuracy(simple_model, test_data, test_data['sentiment'])

# Print up to one coefficient per significant word plus the intercept,
# sorted from largest to smallest value.
num_coefficients = len(significant_words) + 1
simple_model.coefficients.sort('value', ascending=False).print_rows(num_rows=num_coefficients)

+-------------------+--------------+-------+----------------------+
|        name       |    index     | class |        value         |
+-------------------+--------------+-------+----------------------+
| word_count_subset |    loves     |   1   |  1.6772714555592905  |
| word_count_subset |   perfect    |   1   |  1.5144862670271344  |
| word_count_subset |     love     |   1   |  1.3654354936790376  |
|    (intercept)    |     None     |   1   |  1.2995449552027034  |
| word_count_subset |     easy     |   1   |  1.1936618983284653  |
| word_count_subset |    great     |   1   |  0.9446912694798453  |
| word_count_subset |    little    |   1   |  0.5206286360250189  |
| word_count_subset |     well     |   1   |  0.5042567463979287  |
| word_count_subset |     able     |   1   | 0.19143830229475103  |
| word_count_subset |     old      |   1   | 0.08539618866781597  |
| word_count_subset |     car      |   1   | 0.05883499006802043  |
| word_count_subset |     less     |   1   | -0.

# Compare simple model and sentiment model

In [14]:
# Training-set accuracy of both models; the cell's value is whether the
# significant-words model scores higher than the all-words model.
acc1train = get_classification_accuracy(sentiment_model, train_data, train_data['sentiment'])
acc2train = get_classification_accuracy(simple_model, train_data, train_data['sentiment'])
acc2train > acc1train

False

In [15]:
# Training accuracy: significant-words model first, all-words model second.
print(acc2train, acc1train)

0.8668150746537147 0.976494573364514


In [16]:
# Count how many training rows carry each sentiment label.
train_sentiment = train_data['sentiment']
num_positive = (train_sentiment == +1).sum()
num_negative = (train_sentiment == -1).sum()
for label_count in (num_positive, num_negative):
    print(label_count)

112164
21252


In [17]:
# Test accuracy: significant-words model first, all-words model second.
print(acc2, acc1)

0.8693004559635229 0.9221862251019919
