In [None]:
# Mohit Bhasin
# HW4 Q4
# Beer Reviews
# These data contain 2,924,163 reviews by 40,213 unique users on 110,419 unique types of beer.

# J. McAuley and J. Leskovec. Hidden factors and hidden topics: understanding rating dimensions with review text. RecSys, 2013.

In [1]:
from pyspark.mllib.feature import HashingTF, IDF, Normalizer
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
import re
import json

In [2]:
# Read the ratings file from S3 (one JSON document per line) and parse each line into a dict.
all_reviews = sc.textFile("s3n://stat-37601/ratings.json", minPartitions=1000).map(json.loads)
# randomSplit normalizes its weights, so [.01, .4] puts roughly 2.4% of the
# reviews in `reviews` and the remaining ~97.6% in `reviews_test` (not 50%).
# A fixed seed makes the sample identical across re-runs.
reviews, reviews_test = all_reviews.randomSplit([.01, .4], seed=42)
# cache() has to be called; the bare attribute `reviews.cache` does nothing.
reviews.cache()
reviews.count()

71182

In [4]:
# Inspect a single parsed review to see which fields are available.
# take(1) returns a one-element list, so [0] unwraps the record itself.
reviews.take(1)[0]

{u'beer_ABV': u'5.4',
 u'beer_beerId': u'63836',
 u'beer_brewerId': u'8481',
 u'beer_name': u'John Harvards Simcoe IPA',
 u'beer_style': u'India Pale Ale &#40;IPA&#41;',
 u'review_appearance': u'4/5',
 u'review_aroma': u'6/10',
 u'review_overall': u'13/20',
 u'review_palate': u'3/5',
 u'review_profileName': u'hopdog',
 u'review_taste': u'6/10',
 u'review_text': u'On tap at the Springfield, PA location. Poured a deep and cloudy orange (almost a copper) color with a small sized off white head. Aromas or oranges and all around citric. Tastes of oranges, light caramel and a very light grapefruit finish. I too would not believe the 80+ IBUs - I found this one to have a very light bitterness with a medium sweetness to it. Light lacing left on the glass.',
 u'review_time': u'1157587200'}

## Building a text parser
Make the text lowercase, replace punctuation with spaces, and split on whitespace.
Then count the occurrences of each word.
Finally, see which words show up often so that we can filter down to the words we want.

In [3]:
def removePunctuation(text):
    """
    Replace every run of characters that are not a lowercase ASCII letter,
    a space, or an apostrophe with a single space.

    Uppercase letters are not in the allowed set, so they are replaced too;
    lowercase the text first if they should be kept.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Existing spaces are left in place, so a removed
        run that sits next to a space yields consecutive spaces.
    """
    # Raw string: in a normal literal the original '\ ' is an invalid escape
    # sequence (it only worked because Python leaves unknown escapes as-is).
    # Inside the character class a plain space means the same thing.
    stripped = re.sub(r"[^a-z ']+", " ", text)
    return stripped

# Count how often each token occurs across the sampled reviews:
# lowercase -> strip punctuation -> split on whitespace -> (word, 1) -> sum per word.
word_counts = (
    reviews
    .map(lambda review: removePunctuation(review["review_text"].lower()))
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
# Plain print(): the original trailing comma is a Python 2 print-statement
# idiom that turns this line into a `(None,)` tuple expression on Python 3.
print(word_counts.take(10))

[(u'oraneg', 2), (u'promoted', 3), (u'bifrost', 1), (u'hadent', 1), (u'dba', 2), (u'matbe', 1), (u'marti', 1), (u'nun', 8), (u'buccal', 1), (u'appelwine', 2)]


In [4]:
# Swap (word, count) into (count, word) so the RDD can be sorted by count,
# highest first. The pair is indexed instead of unpacked in the lambda
# signature because tuple parameters are a syntax error on Python 3.
count_words = word_counts.map(lambda kv: (kv[1], kv[0])).sortByKey(False)
# One record per distinct word, so this is the vocabulary size before filtering.
wordcount = count_words.count()
wordcount

51611

In [5]:
# Scale each word's count by the number of distinct words and round to two decimals.
# NOTE: despite the name this is not a percentage of all tokens; it is
# occurrences per distinct word, so values can exceed 1.
# The pair is indexed rather than unpacked in the lambda signature so the
# code also parses on Python 3.
percent_words = count_words.map(lambda cw: (round(float(cw[0]) / wordcount, 2), cw[1]))
# Plain print(): the trailing comma was a Python 2 print-statement idiom.
print(percent_words.take(10))

[(3.22, u'a'), (2.65, u'and'), (2.32, u'the'), (1.95, u'with'), (1.72, u'of'), (1.42, u'is'), (0.97, u'head'), (0.83, u'aroma'), (0.79, u'to'), (0.7, u'in')]


We can see that the most frequent words are "stop words". We filter out the most frequent words, along with words that occur very rarely. The remaining word list is the vocabulary whose terms we compute TF-IDF on.

In [6]:
# Keep words whose scaled frequency lies strictly between the two cut-offs:
# the upper bound drops the most frequent words, the lower bound drops rare ones.
# The pair is indexed rather than unpacked in the lambda signature so the
# code also parses on Python 3.
percent_words_small = percent_words.filter(lambda kv: .001 < kv[0] < .85)
# Plain print(): the trailing comma was a Python 2 print-statement idiom.
print(percent_words_small.take(5))

[(0.83, u'aroma'), (0.79, u'to'), (0.7, u'in'), (0.69, u'this'), (0.65, u'but')]


In [7]:
# Bring the surviving words back to the driver as a plain Python list.
# kv is a (scaled frequency, word) pair; it is indexed rather than unpacked in
# the lambda signature so the code also parses on Python 3.
wordlist = percent_words_small.map(lambda kv: kv[1]).collect()
n_features = len(wordlist)
print(n_features)

1162


In [8]:
# Set view of the vocabulary: `wordlist` is a Python list, so testing
# `word in wordlist` is a linear scan repeated for every token of every
# review, while a frozenset gives constant-time membership tests.
wordset = frozenset(wordlist)


def reviewParser(review_text):
    """
    Tokenize a review and keep only the tokens that are in the vocabulary.

    The text is lowercased, every run of characters other than lowercase
    letters, spaces and apostrophes is replaced by a space, and the result is
    split on whitespace.

    Args:
        review_text: the raw review text.

    Returns:
        A list of the vocabulary words found in the review, in their original
        order and with repeats preserved.
    """
    # Raw string avoids the invalid '\ ' escape of the original literal; the
    # pattern matches exactly the same characters.
    tokens = re.sub(r"[^a-z ']+", " ", review_text.lower()).split()
    return [token for token in tokens if token in wordset]

# Turn every review into its list of vocabulary tokens.
filtered_reviews = (
    reviews
    .map(lambda review: review['review_text'])
    .map(reviewParser)
)

## TF-IDF
TF-IDF is better than using simple word frequency counts as our "X" predictors.
The term frequency function below creates a sparse vector of frequency counts for each word in a given review. The hash is a Spark function that does this more efficiently; for tractability we hash down to a smaller feature vector size (which causes a large number of collisions). Notice that the length of the sparse vector is less than the length of our wordlist; this is a result of the hashing.

In [9]:
# Hash each review's token list into a term-frequency vector of length 20.
hashingTF = HashingTF(numFeatures=20)
tf = hashingTF.transform(filtered_reviews)
# Show the vector of the first review.
tf.take(1)

[SparseVector(20, {0: 1.0, 1: 2.0, 2: 2.0, 3: 3.0, 4: 3.0, 5: 5.0, 6: 2.0, 7: 1.0, 9: 1.0, 10: 3.0, 11: 2.0, 12: 1.0, 14: 3.0, 15: 4.0, 16: 4.0, 17: 3.0, 19: 4.0})]

In [10]:
# `tf` is read twice below (once to fit the IDF model, once to transform),
# so mark it for caching first.
tf.cache()
idf_estimator = IDF(minDocFreq=2)
idf = idf_estimator.fit(tf)
# Re-weight the term-frequency vectors with the fitted IDF model.
features = idf.transform(tf)
features.take(1)

[SparseVector(20, {0: 0.2485, 1: 1.3804, 2: 0.4383, 3: 0.4973, 4: 0.5758, 5: 1.3373, 6: 0.5718, 7: 0.3155, 9: 0.2277, 10: 1.1553, 11: 0.6072, 12: 0.2026, 14: 0.7342, 15: 1.7154, 16: 0.987, 17: 0.7708, 19: 1.7826})]

In [11]:
def getLabel(review):
    """
    Turn a review's overall score, stored as an "x/y" string, into the
    fraction x / y.

    Raises ValueError when the field does not consist of exactly two numeric
    parts separated by "/".
    """
    numerator, denominator = map(float, review["review_overall"].split("/"))
    return numerator / denominator
labels = reviews.map(getLabel)
# `features` and `labels` are both derived from `reviews` through element-wise
# transformations, so zip() pairs each feature vector with its own rating.
# pair is (feature, label); it is indexed rather than unpacked in the lambda
# signature because tuple parameters are a syntax error on Python 3.
data = features.zip(labels).map(lambda pair: LabeledPoint(pair[1], pair[0]))

Create a function to calculate the Mean squared error for our models

In [20]:
def meanSquaredError(lAndP):
    """
    Compute the mean squared error over (label, prediction) pairs.

    Args:
        lAndP: an RDD of (label, prediction) pairs. Only its map(), sum() and
            count() methods are used.

    Returns:
        The sum of squared differences divided by the number of pairs.

    Raises:
        ZeroDivisionError: if lAndP contains no pairs.
    """
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    squared_errors = lAndP.map(lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))
    mse_err = squared_errors.sum() / float(lAndP.count())
    return mse_err

## Decision Trees
Build a decision tree on the (x,y) data designed above. Use that to make predictions and take a look at those predictions. Finally compute the mean squared error based on the predictions.

```python
classmethod trainRegressor(data, categoricalFeaturesInfo, impurity='variance', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0)
```
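The MSE below is about 0.0265 in-sample. Because the ratings are expressed as fractions of the maximum score, this corresponds to a root-mean-squared error of about 0.16; note that the MSE is not a percentage error rate.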

In [13]:
# Fit a shallow regression tree on the labeled points.
model = DecisionTree.trainRegressor(
    data,
    categoricalFeaturesInfo={},
    impurity='variance',
    maxDepth=2,
    maxBins=2
)
# In-sample evaluation: score the same points the tree was trained on.
train_features = data.map(lambda point: point.features)
predictions = model.predict(train_features)
train_labels = data.map(lambda point: point.label)
labelsAndPredictions = train_labels.zip(predictions)

In [14]:
# Look at the first five (label, prediction) pairs from the decision tree.
labelsAndPredictions.take(5)

[(0.8, 0.6738495252008759),
 (0.65, 0.6161767411059191),
 (0.7, 0.711222679759265),
 (0.6, 0.6161767411059191),
 (0.65, 0.711222679759265)]

In [21]:
# In-sample mean squared error of the decision tree (a recorded run gave 0.026535660477238206).
meanSquaredError(labelsAndPredictions)

0.026535660477238206

## Random Forests
Repeat the same steps from above for a Random Forest

```python
classmethod trainRegressor(data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy='auto', impurity='variance', maxDepth=4, maxBins=32, seed=None)
```

In [18]:
# Fit a two-tree random forest with the same shallow settings; the seed is fixed.
model_rf = RandomForest.trainRegressor(
    data,
    categoricalFeaturesInfo={},
    numTrees=2,
    impurity='variance',
    maxDepth=2,
    maxBins=2,
    seed=42
)
# In-sample evaluation: score the same points the forest was trained on.
train_features = data.map(lambda point: point.features)
predictions_rf = model_rf.predict(train_features)
train_labels = data.map(lambda point: point.label)
labelsAndPredictions_rf = train_labels.zip(predictions_rf)

In [19]:
# Look at the first five (label, prediction) pairs from the random forest.
labelsAndPredictions_rf.take(5)

[(0.8, 0.7301089416883975),
 (0.7, 0.7057365798909006),
 (0.75, 0.7057365798909006),
 (0.6, 0.6339424204090021),
 (0.65, 0.6339424204090021)]

In [20]:
# In-sample mean squared error of the random forest.
meanSquaredError(labelsAndPredictions_rf)

0.026358335341867243

## Gradient Boosted Trees
Repeat the same steps from above for GBTs. We want stumps (shallow trees) so we use maxDepth=1

```python 
classmethod trainRegressor(data, categoricalFeaturesInfo, loss='leastSquaresError', numIterations=100, learningRate=0.1, maxDepth=3)
```

In [16]:
# Boost depth-1 trees (stumps) with squared-error loss; every other setting keeps its default.
model_gbt = GradientBoostedTrees.trainRegressor(
    data,
    categoricalFeaturesInfo={},
    loss='leastSquaresError',
    maxDepth=1
)

In [17]:
# In-sample evaluation: score the same points the boosted model was trained on.
train_features = data.map(lambda point: point.features)
predictions_gbt = model_gbt.predict(train_features)
train_labels = data.map(lambda point: point.label)
labelsAndPredictions_gbt = train_labels.zip(predictions_gbt)

In [18]:
# Look at the first five (label, prediction) pairs from the gradient boosted trees.
labelsAndPredictions_gbt.take(5)

[(0.8, 0.6913117627223787),
 (0.65, 0.6285516026410404),
 (0.7, 0.7150184495414104),
 (0.6, 0.6271736171888832),
 (0.65, 0.6917288086749391)]

In [19]:
# In-sample mean squared error of the gradient boosted trees.
meanSquaredError(labelsAndPredictions_gbt)

0.02553981435951626

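The models above were trained on a small subset of the data with shallow depths for illustrative purposes, and were evaluated in-sample only. In these runs Gradient Boosted Trees have the lowest mean squared error (about 0.0255, versus 0.0264 for the Random Forest and 0.0265 for the Decision Tree), although the differences are small and have not been confirmed on held-out data.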