In [None]:
## Importing necessary Libraries 
import ctypes, inspect, os, graphlab
from graphlab import SFrame
import pandas as pd

In [None]:
# Load the Office Products review dataset (53,257 reviews) into a GraphLab SFrame.
# Source: http://jmcauley.ucsd.edu/data/amazon/
# Citation: "Image-based recommendations on styles and substitutes",
# J. McAuley, C. Targett, J. Shi, A. van den Hengel, SIGIR, 2015
Office_Products = SFrame.read_json('Office_Products_5.json', orient='lines')

In [None]:
## Exploring the dataset: preview the first rows of the SFrame
Office_Products.head()

In [None]:
## How many reviews were loaded?
Office_Products.num_rows()

In [None]:
## Send GraphLab Canvas output to the notebook ('ipynb') target
graphlab.canvas.set_target('ipynb')

In [None]:
## Rating distribution of the 'overall' column, shown as a categorical view.
## Recorded result: most reviews carry a 5 rating - 30,327 of them (56.944%).
overall_ratings = Office_Products['overall']
overall_ratings.show(view='Categorical')

In [None]:
# Build the word-count column for every review from its text
review_texts = Office_Products['reviewText']
Office_Products['word_count'] = graphlab.text_analytics.count_words(review_texts)

In [None]:
text_tools = graphlab.text_analytics

# Step 1: drop stop words from each review's word counts
stop_words = text_tools.stopwords()
Office_Products['No_Stop_words'] = Office_Products['word_count'].dict_trim_by_keys(stop_words, exclude=True)

# Step 2: drop the listed punctuation tokens as well
punctuation_tokens = ['!','.',':','*','~','(2),']
Office_Products['No_punct'] = Office_Products['No_Stop_words'].dict_trim_by_keys(punctuation_tokens, exclude=True)

# Step 3: TF-IDF weights computed from the cleaned counts
Office_Products['tfidf'] = text_tools.tf_idf(Office_Products['No_punct'])

In [None]:
# Labelling each review as helpful or not. The 'helpful' column contains values like [x, y]:
# x is the number of helpful votes, y is the overall number of votes.
# A review counts as helpful when at least 60% of its voters found it helpful;
# 'helpful_review' is the resulting binary (1 or 0) column used as the model target.
import numpy as np

HELPFUL_THRESHOLD = .6  # minimum share of helpful votes for a review to count as helpful

Office_Products['helpful_count'] = Office_Products['helpful'].apply(lambda x: x[0])
Office_Products['overall_count'] = Office_Products['helpful'].apply(lambda x: x[1])
# Remove reviews with no votes BEFORE computing the ratio, so the division below
# never sees a zero denominator.
Office_Products = Office_Products[Office_Products['overall_count'] != 0]
Office_Products['helpful_percent'] = Office_Products['helpful_count'] / Office_Products['overall_count']
Office_Products['helpful_review'] = Office_Products['helpful_percent'] >= HELPFUL_THRESHOLD

In [None]:
# Preview the first rows of Office_Products
Office_Products.head()

In [None]:
# Random 80/20 split into training and testing data for the classifiers
# (fixed seed so the split is repeatable).
train_data, test_data = Office_Products.random_split(fraction=.8, seed=0)

In [None]:
# Logistic classifier predicting 'helpful_review' from the TF-IDF features.
# NOTE(review): test_data is also passed as the validation set - confirm this is intended.
sentiment_model_Logist = graphlab.logistic_classifier.create(
    train_data,
    target='helpful_review',
    features=['tfidf'],
    validation_set=test_data
)

In [None]:
## Evaluating the logistic model on the test split, requesting the 'roc_curve' metric
sentiment_model_Logist.evaluate(test_data, metric = 'roc_curve')

In [None]:
# Evaluation figures recorded by the author:
#   True Positive  = 1189    False Negative = 1718
#   False Positive = 1625    True Negative  = 6077
#   Accuracy = 0.685    Precision = 0.423    Recall = 0.409
sentiment_model_Logist.show(view = 'Evaluation')

In [None]:
## Finding the ID of the most frequently reviewed item.
# Recorded result (Value / Count / Percent): B000WU4H5C, 156 reviews (0.742%)
Office_Products['asin'].show()

In [None]:
# Keep only the reviews whose product ID ('asin') is B000WU4H5C
is_selected_product = Office_Products['asin'] == 'B000WU4H5C'
B000WU4H5C_Reviews = Office_Products[is_selected_product]

In [None]:
# Number of reviews for the selected product
B000WU4H5C_Reviews.num_rows()

In [None]:
# Score every B000WU4H5C review with the logistic regression model (probability output)
lr_probabilities = sentiment_model_Logist.predict(B000WU4H5C_Reviews, output_type='probability')
B000WU4H5C_Reviews['Predicted_Helpfulness_LR'] = lr_probabilities

In [None]:
# Preview the first rows of the product's reviews
B000WU4H5C_Reviews.head()

In [None]:
## Sorting reviews by the 'Predicted_Helpfulness_LR' column, highest value first
B000WU4H5C_Reviews = B000WU4H5C_Reviews.sort('Predicted_Helpfulness_LR', ascending =False)

In [None]:
# Preview the first rows of the product's reviews in their current order
B000WU4H5C_Reviews.head()

In [None]:
# Review text of the first row in the current ordering
B000WU4H5C_Reviews[0]['reviewText']

In [None]:
# Review text of the last row in the current ordering
B000WU4H5C_Reviews[-1]['reviewText']

In [None]:
# Summary field of the first row in the current ordering
B000WU4H5C_Reviews[0]['summary']

In [None]:
# Summary field of the last row in the current ordering
B000WU4H5C_Reviews[-1]['summary']

In [None]:
## Trying an SVM classifier with the same target, features and validation set
## as the logistic model.
sentiment_model_SVM = graphlab.svm_classifier.create(
    train_data,
    target='helpful_review',
    features=['tfidf'],
    validation_set=test_data
)

In [None]:
## Reading the 'coefficients' field from the trained SVM model
coefficients = sentiment_model_SVM['coefficients']

In [None]:
# Make predictions for every review in both supported forms.
# The class labels get their own name so they are no longer overwritten
# (and silently discarded) by the margin predictions on the next line.
predicted_classes = sentiment_model_SVM.predict(Office_Products)    # Predicts 0/1
# 'predictions' still ends up holding the margins, as before.
predictions = sentiment_model_SVM.predict(Office_Products, output_type='margin')

In [None]:
# Evaluate the SVM on the test split only. Scoring on the full Office_Products
# frame would include the rows the model was trained on and inflate the metrics;
# using test_data also matches how the logistic model is evaluated.
results = sentiment_model_SVM.evaluate(test_data)

In [None]:
# Display the SVM evaluation results
results

In [None]:
# Applying the SVM model to B000WU4H5C_Reviews. Note: SVM does not currently support
# predictions as probability estimates, so the margin is stored as the score.
# A 0/1 class label cannot rank reviews within a class when this column is
# sorted on; the margin can.
B000WU4H5C_Reviews['Predicted_Helpfulness_SVM'] = sentiment_model_SVM.predict(B000WU4H5C_Reviews, output_type = 'margin')

In [None]:
# Preview the first rows of the product's reviews, including the SVM prediction column
B000WU4H5C_Reviews.head()

In [None]:
## Sorting reviews by the 'Predicted_Helpfulness_SVM' column, highest value first
B000WU4H5C_Reviews = B000WU4H5C_Reviews.sort('Predicted_Helpfulness_SVM', ascending =False)

In [None]:
# Review text of the first row in the current ordering
B000WU4H5C_Reviews[0]['reviewText']

In [None]:
# Review text of the last row in the current ordering
B000WU4H5C_Reviews[-1]['reviewText']

In [None]:
# Preview the first rows of the product's reviews in their current order
B000WU4H5C_Reviews.head()

In [None]:
# Review text of the second row (index 1) in the current ordering
B000WU4H5C_Reviews[1]['reviewText']

In [None]:
# Review text of the third row (index 2) in the current ordering
B000WU4H5C_Reviews[2]['reviewText']

In [None]:
# Review text of the fourth row (index 3) in the current ordering
B000WU4H5C_Reviews[3]['reviewText']

In [None]:
# Recompute TF-IDF from the punctuation-stripped counts ('No_punct'), the same
# input used when the 'tfidf' feature column was first built. Using
# 'No_Stop_words' here would put the removed punctuation tokens back into the
# column the classifiers use as their feature.
Office_Products['tfidf'] = graphlab.text_analytics.tf_idf(Office_Products['No_punct'])


In [None]:
# Preview the first rows of Office_Products
Office_Products.head()

In [None]:
# Unpack the 'tfidf' dictionaries into 'word' / 'tfidf' columns,
# then list them from the highest TF-IDF weight down.
word_weights = Office_Products[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
word_weights.sort('tfidf', ascending=False)