In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import json
import numpy as np

from pyspark.sql import Row
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

## Using Spark to build the model and do predictions

In [2]:
# Reuse the running SparkContext if there is one: a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" when this cell is executed a
# second time in the same kernel.
sc = SparkContext.getOrCreate()
# SQLContext wraps the context; it is used further down to build DataFrames.
sqlContext = SQLContext(sc)

### Loading Training Data

In [4]:
# Raw training lines, each split on tabs into a list of string fields.
reviews = (
    sc.textFile('../Filter_Data/yelp.train.rating')
    .map(lambda line: line.split("\t"))
)
# First three fields become an MLlib Rating(user, product, rating) record.
training = reviews.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)
# Peek at a few records to confirm the parse.
training.take(5)

[Rating(user=0, product=0, rating=3.0),
 Rating(user=0, product=1, rating=3.0),
 Rating(user=0, product=2, rating=4.0),
 Rating(user=0, product=3, rating=4.0),
 Rating(user=0, product=4, rating=3.0)]

### Parameters for ALS
Tuned over several runs

In [5]:
# Hyperparameters for the RDD-based MLlib ALS factorization.
rank = 100     # number of latent factors
numIter = 20   # number of ALS iterations
# ALS starts from randomly initialised factors. Passing a fixed seed means
# reruns start from the same initial factors, so the errors reported below
# can be compared from one kernel restart to the next.
SEED = 42
# The regularization parameter (lambda_) is deliberately left at the library
# default here; regularization is explored later in the notebook.
model = ALS.train(training, rank, iterations=numIter, seed=SEED)

Running Predictions and calculating MSE on Training Data 

In [9]:
# Score the model on the same (user, product) pairs it was trained on.
test = training.map(lambda p: (p[0], p[1]))
preds = model.predictAll(test).map(lambda r: ((r[0], r[1]), r[2]))

# Take the actual ratings from `training` itself rather than re-parsing the
# raw lines with int(): the model was fitted on float(rating), so this keeps
# the ground truth identical to what the model saw and does not fail on a
# non-integer rating string.
ratesJoinPreds = training.map(lambda r: ((r[0], r[1]), r[2])).join(preds)
# Each joined value is (actual, predicted); average the squared differences.
MSE = ratesJoinPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error On Training Data= ", MSE)

Mean Squared Error On Training Data= 0.002728928954329906


We can see that the MSE is really low and the predictions are pretty accurate for the training data. 

### Loading Test Data
Running Predictions and calculating MSE on Test Data 

In [10]:
# Held-out ratings parsed into ((user, business), rating) pairs so they can
# later be joined with predictions on the (user, business) key.
test_reviews = (
    sc.textFile('../Filter_Data/yelp.test.rating')
    .map(lambda line: line.split("\t"))
    .map(lambda fields: ((int(fields[0]), int(fields[1])), int(fields[2])))
)
# Only the (user, business) key is needed to request predictions.
testdata = test_reviews.map(lambda keyed: (keyed[0][0], keyed[0][1]))
testdata.take(5)

[(0, 43), (1, 72), (2, 127), (3, 151), (4, 411)]

In [11]:
# Predict a rating for each held-out pair and key it by (user, product).
predictions = model.predictAll(testdata).map(
    lambda pred: ((pred[0], pred[1]), pred[2])
)
# RDD.join is an inner join, so a pair with no prediction drops out here.
ratesAndPreds = test_reviews.join(predictions)
# Joined values are (actual, predicted).
squared_errors = ratesAndPreds.map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
MSE = squared_errors.mean()
print("Mean Squared Error On Test Data = " + str(MSE))

Mean Squared Error On Test Data = 2.2475724933205883


We can see that the MSE is really bad for testing data, whereas the training data gave us really good MSE. This means that we have overfit our model!

Let's look at regularizing the ALS model by tweaking the value of the regularization parameter $\lambda$.

In [12]:
# Import the DataFrame-based ALS under an alias so it does not replace the
# MLlib `ALS` imported in the first cell; otherwise re-running the earlier
# ALS.train cell after this one would hit the wrong class.
from pyspark.ml.recommendation import ALS as DataFrameALS

# NOTE(review): on Spark >= 2.2, coldStartStrategy="drop" would keep validation
# rows with unseen ids from turning the metric into NaN. Confirm the Spark
# version in use before adding it.
als = DataFrameALS(userCol="user_id", itemCol="buss_id", ratingCol="rating")

# Rank and maxIter are single-value grids; only the regularization varies.
# NOTE(review): `rank` is rebound here from the int used earlier to a list.
rank = [100]
maxIter = [20]
# Values of lambda chosen after several runs
reg = [.15, .17]
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, rank)
    .addGrid(als.maxIter, maxIter)
    .addGrid(als.regParam, reg)
    .build()
)

The idea was to run cross-validation to choose from a range of values for each parameter, but memory limits on the laptop made that infeasible. Instead, we carried out several separate runs and picked the setting with the lowest MSE.

In [23]:
# Selection metric: RMSE between the `rating` label and the `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
# Despite the name `cv`, this is a single train/validation split rather than
# k-fold cross-validation.
cv = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

In [14]:
# The pyspark.ml estimator works on DataFrames, so turn the parsed training
# lines into Rows using the column names the ALS estimator was configured with.
# Note that this rebinds `training`, which previously held the RDD of Ratings.
training_rows = reviews.map(
    lambda fields: Row(user_id=int(fields[0]), buss_id=int(fields[1]), rating=int(fields[2]))
)
training = sqlContext.createDataFrame(training_rows)

In [15]:
# Fit the TrainValidationSplit over every combination in param_grid.
# This rebinds `model`, which until now held the MLlib model from ALS.train.
model = cv.fit(training)

In [16]:
# Keep the model that the train/validation split selected; it is used below
# to score the test DataFrame.
best_model = model.bestModel

In [18]:
# Keep only test rows whose business AND user appear in the training data.
# The ALS model has no factors for an id it never saw and predicts NaN for it,
# and a single NaN makes the evaluator's metric NaN as well. The original cell
# filtered businesses only; users are filtered too so that either kind of
# unseen id is handled.
testdata = sqlContext.createDataFrame(test_reviews.map(lambda x: Row(user_id=x[0][0], buss_id=x[0][1], rating = x[1])))
# Remember the column order: a join on a column name moves that column first.
test_columns = testdata.columns
known_businesses = training.select("buss_id").distinct()
known_users = training.select("user_id").distinct()
# Left-semi joins filter inside Spark instead of collecting every distinct id
# to the driver and building Python sets.
testdata = (
    testdata
    .join(known_businesses, on="buss_id", how="left_semi")
    .join(known_users, on="user_id", how="left_semi")
    .select(*test_columns)
)

Predictions and MSE on test data with Regularization

In [34]:
# Score the filtered test DataFrame with the selected model; transform adds a
# `prediction` column next to the actual `rating`.
predictions = best_model.transform(testdata)
# A fresh evaluator reporting MSE (model selection above used RMSE).
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="mse",
)
mse = evaluator.evaluate(predictions)
print("MSE with test data and regularization is = ", mse)

MSE with test data and regularization is =  1.4891042305989572


As we can see, regularization reduced the overfitting and gave a better MSE on the test data (about 1.49, versus about 2.25 without it).

#### Hit Ratio

In [49]:
# A prediction counts as a "hit" when the actual rating lies strictly inside
# the band (prediction - threshold, prediction + threshold).
threshold = 1.5
upper_bound = predictions.prediction + threshold
lower_bound = predictions.prediction - threshold
within_band = (predictions.rating < upper_bound) & (predictions.rating > lower_bound)
hit_count = predictions.filter(within_band).count()
total_count = predictions.count()
print("Hit Ratio is:", hit_count/total_count)

Hit Ratio is: 0.7733565788124295


With a threshold of ±1.5 we get the above hit ratio, which is decent but not perfect. To improve it further we would need to increase the rank (the number of latent factors) used in model building. We had to restrict ourselves to a maximum of 100 because of resource constraints (memory and CPU), but on a cloud-based platform a higher rank should give us a much better hit ratio and MSE. More in the report.