In [1]:
import numpy
import urllib.request
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model

#nltk.download("stopwords")
from nltk.corpus import stopwords

def parseData(fname):
    for l in urllib.request.urlopen(fname):
        yield eval(l)

stopWords = set(stopwords.words("english"))

### Just the first 5000 reviews

print("Reading data...")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print("done")

### Ignore capitalization and remove punctuation, and split into bigrams

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    size = len(r.split())
    for i in range(size-1):
        w1 = r.split()[i]
        w2 = r.split()[i+1]
        w1 = stemmer.stem(w1) # with stemming
        w2 = stemmer.stem(w2) # with stemming
#        if not (w1 in stopWords and w2 in stopWords):
        w = w1 + ' ' + w2
        wordCount[w] += 1

Reading data...
done


In [2]:
### Just take the most popular words...

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

num = len(counts)
times = [x[0] for x in counts[:5]]
words = [x[1] for x in counts[:5]]

print("There are " + str(num) + " unique bigrams amongst all 5000 reviews")
for n in range(5):
    i = times[n]
    j = words[n]
    print("No." + str(n+1) + ": Word ["+ j + "] occurs " + str(i) + " times in the corpus")

There are 162557 unique bigrams amongst all 5000 reviews
No.1: Word [with a] occurs 4587 times in the corpus
No.2: Word [in the] occurs 2595 times in the corpus
No.3: Word [of the] occurs 2245 times in the corpus
No.4: Word [is a] occurs 2056 times in the corpus
No.5: Word [on the] occurs 2033 times in the corpus


In [3]:
### Just take the most popular words...

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

times = [x[0] for x in counts[:1000]]
words = [x[1] for x in counts[:1000]]

### Sentiment analysis

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    size = len(r.split())
    for i in range(size-1):
        w1 = r.split()[i]
        w2 = r.split()[i+1]
        w1 = stemmer.stem(w1) # with stemming
        w2 = stemmer.stem(w2) # with stemming
        w = w1 + ' ' + w2
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
num = len(predictions)
MSE = 0
for i in range(num):
    MSE += (y[i]-predictions[i])**2
mse = MSE/num
print("MSE of the prediction base on the 1000 most common bigrams is",mse)

MSE of the prediction base on the 1000 most common bigrams is 0.340302362439


In [25]:
### Ignore capitalization and remove punctuation, and split into bigrams

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    size = len(r.split())
    for w in r.split():
        w = stemmer.stem(w) # with stemming
        wordCount[w] += 1
    for i in range(size-1):
        w1 = r.split()[i]
        w2 = r.split()[i+1]
        w1 = stemmer.stem(w1) # with stemming
        w2 = stemmer.stem(w2) # with stemming
#        if not (w1 in stopWords and w2 in stopWords):
        w = w1 + ' ' + w2
        wordCount[w] += 1
        
### Just take the most popular words...

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

times = [x[0] for x in counts[:1000]]
words = [x[1] for x in counts[:1000]]

### Sentiment analysis

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    size = len(r.split())
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    for i in range(size-1):
        w1 = r.split()[i]
        w2 = r.split()[i+1]
        w1 = stemmer.stem(w1) # with stemming
        w2 = stemmer.stem(w2) # with stemming
        w = w1 + ' ' + w2
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
num = len(predictions)
MSE = 0
for i in range(num):
    MSE += (y[i]-predictions[i])**2
mse = MSE/num
print("MSE of the prediction base on the 1000 most common bigrams is",mse)

MSE of the prediction base on the 1000 most common bigrams is 0.312846481728


In [27]:
Impact = [(theta[wordId[w]], w) for w in words]
Impact.sort()
Impact.reverse()
print("The 5 unigrams/bigrams with the most positive associated weights and their weights are listed as:")
for i in range(5):
    print(Impact[i][1] + "------weight: " + str(Impact[i][0]))
print("The 5 unigrams/bigrams with the most negative associated weights and their weights are listed as:")
for i in range(5):
    print(Impact[len(Impact)-i-1][1] + "------weight: " + str(Impact[len(Impact)-i-1][0]))

The 5 unigrams/bigrams with the most positive associated weights and their weights are listed as:
wa------weight: 0.385810095911
impress------weight: 0.312095684774
the best------weight: 0.25788118686
quit------weight: 0.243636388717
not too------weight: 0.238452957975
The 5 unigrams/bigrams with the most negative associated weights and their weights are listed as:
coffe------weight: -0.323729455127
corn------weight: -0.281909925982
water------weight: -0.277372578656
carbon------weight: -0.250541574282
straw------weight: -0.237242201396
