In [1]:
import os
print os.getcwd()

C:\Users\Chauncey


In [54]:
# Importing the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import sklearn
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import svm

In [3]:
# This code reads the data into a pandas data frame. The dataset used is an Office Products dataset containing 53,257 reviews   
# retrieved from: http://jmcauley.ucsd.edu/data/amazon/ citation: "Image-based recommendations on styles and 
# substitutes" J. McAuley, C. Targett, J. Shi, A. van den Hengel, SIGIR, 2015

def parse(path):
#  opens a binary file in read or write mode.    
  g = open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df_review_data = getDF('Office_Products_5.json')


In [32]:
# Exploring the dataset
print(df_review_data.head(5))
len(df_review_data)

       reviewerID        asin         reviewerName helpful  unixReviewTime  \
0  A32T2H8150OJLU  B00000JBLH                  ARH  [3, 4]      1094169600   
1  A3MAFS04ZABRGO  B00000JBLH     Let it Be "Alan"  [7, 9]      1197676800   
2  A1F1A0QQP2XVH5  B00000JBLH               Mark B  [3, 3]      1293840000   
3   A49R5DBXXQDE5  B00000JBLH         R. D Johnson  [7, 8]      1145404800   
4  A2XRMQA6PJ5ZJ8  B00000JBLH  Roger J. Buffington  [0, 0]      1375574400   

                                          reviewText  overall   reviewTime  \
0  I bought my first HP12C in about 1984 or so, a...        5   09 3, 2004   
1  WHY THIS BELATED REVIEW? I feel very obliged t...        5  12 15, 2007   
2  I have an HP 48GX that has been kicking for mo...        2   01 1, 2011   
3  I've started doing more finance stuff recently...        5  04 19, 2006   
4  For simple calculations and discounted cash fl...        5   08 4, 2013   

                                             summary  helpful_

53258

In [5]:
print(df_review_data.tail(5))

           reviewerID        asin  \
53253  A1ODOGXEYECQQ8  B00KYA0RC2   
53254  A2XX2A4OJCDNLZ  B00KYA0RC2   
53255  A3LGT6UZL99IW1  B00KYA0RC2   
53256  A1XJOSJN6FHFO0  B00KYA0RC2   
53257   AAEVGE52KL0DJ  B00KYA0RC2   

                                           reviewerName helpful  \
53253                                            Nuknuk  [0, 0]   
53254                               RatherLiveInKeyWest  [2, 2]   
53255  Richard C. Drew "Anaal Nathra/Uthe vas Bethod...  [1, 1]   
53256  Shirley Priscilla  Johnson "Author/Reviewer -...  [0, 0]   
53257                                               Tim  [3, 4]   

       unixReviewTime                                         reviewText  \
53253      1405555200  What I like about this scale is you can power ...   
53254      1405296000  This Accuteck ShipPro digital scale works very...   
53255      1405468800  I ship a lot of stuff.  I sell small parts, ma...   
53256      1405814400  This is a great little scale to have. It can w.

In [50]:
# Limiting the review data to only helpful reviews. The ‘helpful’ column contains values like ‘[x, y]’.
# The first value represents the number of helpful votes, the second represents overall votes. 
# If at least 50% consider the review helpful, we want those reviews.

df_review_data['helpful_count'] = df_review_data.helpful.apply(lambda x: x[0])
df_review_data['overall_count'] = df_review_data.helpful.apply(lambda x: x[1])
df_review_data['helpful_percent'] = df_review_data['helpful_count'] / df_review_data['overall_count']
df_review_data['helpful_review'] = np.where((df_review_data.helpful_percent > .5) & (df_review_data.helpful_count > 1), "Yes", "No")
helpful_reviews = df_review_data[df_review_data['helpful_review']=='Yes']

In [51]:
#Tokenization 
helpful_reviews['tokenized_review_data'] = helpful_reviews['reviewText'].apply(nltk.word_tokenize)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [52]:
# Building the sentiment Classifier 
# Define positive and negative sentiment 4 and 5 star reviews = positive sentiment, ignore all reviews with a rating of 3
# and using reviews with 1 and 2 stars to represent negative sentiment 
helpful_reviews = helpful_reviews[helpful_reviews['overall']!= 3.0]
# Binary colmmn true or false. Is it greater than or equal to 4
helpful_reviews['sentiment'] = helpful_reviews['overall'] >=4
print(helpful_reviews.head(5))
len(helpful_reviews)

       reviewerID        asin       reviewerName   helpful  unixReviewTime  \
0  A32T2H8150OJLU  B00000JBLH                ARH    [3, 4]      1094169600   
1  A3MAFS04ZABRGO  B00000JBLH   Let it Be "Alan"    [7, 9]      1197676800   
2  A1F1A0QQP2XVH5  B00000JBLH             Mark B    [3, 3]      1293840000   
3   A49R5DBXXQDE5  B00000JBLH       R. D Johnson    [7, 8]      1145404800   
5  A2JFOHC9W629IE  B00000JBLH  scott_from_dallas  [10, 12]      1011744000   

                                          reviewText  overall   reviewTime  \
0  I bought my first HP12C in about 1984 or so, a...        5   09 3, 2004   
1  WHY THIS BELATED REVIEW? I feel very obliged t...        5  12 15, 2007   
2  I have an HP 48GX that has been kicking for mo...        2   01 1, 2011   
3  I've started doing more finance stuff recently...        5  04 19, 2006   
5  While I don't have an MBA, it's hard to believ...        5  01 23, 2002   

                                             summary  helpful_

7542

In [60]:
# Training the sentiment classifier 
helpful_reviews['sentiment'] = pd.DataFrame(np.random.randn(7542, 1508))
msk = np.random.rand(len(helpful_reviews['sentiment'])) < 0.8
train_data = helpful_reviews['sentiment'][msk]
test_data = helpful_reviews['sentiment'][~msk]

In [61]:
len(test_data)

1529

In [62]:
len(train_data)

6013

In [65]:
clf = svm.SVC(gamma=0.001, C=100)
x, y = helpful_reviews['tokenized_review_data'][:-1], helpful_reviews['sentiment'][:-1]

clf.fit(x, y)

print('Prediction:',clf.predict(helpful_reviews['tokenized_review_data'][:-1]))

ValueError: setting an array element with a sequence.