In [2]:
import turicreate as tc
import turicreate.aggregate as agg

In [3]:
# Load the Amazon baby-product reviews.
products = tc.SFrame('../input/basicml-lecture1/amazon_baby.sframe')

# Attach a per-review dictionary of word -> occurrence count built from the
# raw review text.
review_text = products['review']
products['word_count'] = tc.text_analytics.count_words(review_text)

In [4]:
# Hand-picked words whose per-review counts are used as the features of the
# reduced "selected words" classifier further down.
selected_words = [
    'awesome',
    'great',
    'fantastic',
    'amazing',
    'love',
    'horrible',
    'bad',
    'terrible',
    'awful',
    'wow',
    'hate',
]

In [5]:
# Add one column per selected word holding how many times that word occurs in
# each review (0 when the word is absent from the review's word_count dict).
for word in selected_words:
    # `word=word` binds the current word to this lambda, so the result does
    # not depend on the value of the loop variable when the lambda is run.
    # int(...) plus dtype=int gives every column the same integer type; the
    # raw counts in word_count are floats while the fallback is the int 0,
    # which previously left the columns with a mix of int and float types.
    products[word] = products.apply(
        lambda row, word=word: int(row['word_count'].get(word, 0)),
        dtype=int,
    )

In [7]:
# Total number of occurrences of each selected word across all reviews.
word_totals = {word: products[word].sum() for word in selected_words}

# max()/min() return the first word in list order on ties, matching a strict
# ">" / "<" scan, and need no sentinel start values (a sentinel such as
# 999999 would leave the name undefined if every total exceeded it).
max_name = max(word_totals, key=word_totals.get)
min_name = min(word_totals, key=word_totals.get)
max_num = word_totals[max_name]
min_num = word_totals[min_name]

print("The most used word is {}, It appears {} times".format(max_name, max_num))
print("The least used word is {}, It appears {} times".format(min_name, min_num))

The most used word is great, It appears 59536.0 times
The least used word is wow, It appears 461 times


### Question 1: [5]great
### Question 2: [1]wow

In [7]:
# Drop the 3-star reviews; they are treated as neither positive nor negative.
is_not_three_star = products['rating'] != 3
products = products[is_not_three_star]

# Label the remaining reviews: 4 or 5 stars counts as positive sentiment.
products['sentiment'] = products['rating'] >= 4

# 80/20 split with a fixed seed so the split is repeatable.
train_data, test_data = products.random_split(0.8, seed=0)

In [8]:
# Logistic classifier whose only feature is the full word_count dictionary.
# test_data is passed as the validation set.
sentiment_model = tc.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)

In [9]:
# Logistic classifier restricted to the per-word count columns named in
# selected_words. test_data is passed as the validation set.
selected_words_model = tc.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=selected_words,
    validation_set=test_data,
)

In [10]:
# Rank the learned coefficients from most negative to most positive.
# Only rows whose name is one of the selected words are kept, so a non-word
# row of the coefficient table (e.g. the intercept) can never be reported as
# "the word" with the largest or smallest weight.
weight = selected_words_model.coefficients.filter_by(selected_words, 'name').sort('value')
most_negative, most_positive = weight[0], weight[-1]
print("The word with the most positive weight is {}: {}".format(most_positive['name'], most_positive['value']))
print("The word with the most negative weight is {}: {}".format(most_negative['name'], most_negative['value']))

The word with the most positive weight is love: 1.359268866922504
The word with the most negative weight is horrible: -2.251335236759102


### Question 3: The most positive weight: [3]love
### Question 4: The most negative weight: [1]horrible

In [13]:
# Accuracy of the selected-words classifier on the test split.
selected_words_metrics = selected_words_model.evaluate(test_data)
selected_words_metrics['accuracy']

0.8463848186404036

In [14]:
# Accuracy of the all-words classifier on the test split.
sentiment_metrics = sentiment_model.evaluate(test_data)
sentiment_metrics['accuracy']

0.9176975738650012

### Question 5: selected_words_model: [2]0.841 to 0.871
### Question 6: sentiment_model: [4]0.901 to 0.931

In [15]:
# Number of test reviews per sentiment label. The output column is named
# 'sum', but it holds row counts (agg.COUNT).
test_data.groupby(
    'sentiment',
    operations={'sum': agg.COUNT()},
)

sentiment,sum
0,5328
1,27976


In [16]:
# Majority-class baseline: always predict whichever sentiment label is more
# common in the test split. The counts are computed from test_data rather
# than copied by hand from an earlier output, so they stay correct if the
# data or the split changes.
num_test = len(test_data)
num_positive = test_data['sentiment'].sum()  # sentiment is 0/1, so the sum counts positives
majority_count = max(num_positive, num_test - num_positive)
print("Accuracy of majority class classifier: {:.2f}".format(majority_count / num_test))

Accuracy of majority class classifier: 0.84


### Question 7: majority class classifier: [1]0.811 to 0.843
### Question 8: [3]The model learned using all words performed much better than the other two. The other two approaches performed about the same.

In [17]:
# Select the reviews of a single product.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[is_diaper_champ]

# Score each review with the all-words model, requesting probabilities
# rather than class labels.
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
    diaper_champ_reviews,
    output_type='probability',
)

# Highest predicted probability first.
sorted_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
top_review = sorted_reviews[0]
print('The most positive: {} '.format(top_review['predicted_sentiment']))

The most positive: 0.9999999999895941 


### Question 9: [4]0.9 to 1

In [18]:
# Keep the top-ranked review as a one-row SFrame and overwrite its
# predicted_sentiment with the selected-words model's probability.
selected_word_result = sorted_reviews[:1]
selected_word_result['predicted_sentiment'] = selected_words_model.predict(
    selected_word_result,
    output_type='probability',
)
top_selected_words_row = selected_word_result[0]
print('The result in predicted_words_model : {} '.format(top_selected_words_row['predicted_sentiment']))

The result in predicted_words_model : 0.7919288370624482 


### Question 10: [2]0.7 to 0.8

In [20]:
# Display the one-row SFrame for the top-ranked review, including its
# per-word count columns and the selected-words model's predicted_sentiment.
selected_word_result

name,review,rating,word_count,awesome,great,fantastic
Baby Trend Diaper Champ,I read a review below that can explain exactly ...,4.0,"{'key': 1.0, 'have': 1.0, 'pieces': 1.0, 'betwe ...",0,0.0,0.0

amazing,love,horrible,bad,terrible,awful,wow,hate,sentiment,predicted_sentiment
0,0.0,0,0,0,0,0,0,1,0.7919288370624482


### Question 11: [4]None of the selected words appeared in the text of this review.
As you can see, the columns of the selected words are all 0.