In [4]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pylab
import json
import numpy as np

from pyspark.sql import Row
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

## Using Spark to process the data and build the model

In [5]:
# Only one SparkContext may be active per JVM, so calling SparkContext()
# again when this cell is re-run raises an error. getOrCreate() returns the
# running context if there is one and creates it otherwise, which keeps the
# cell idempotent.
sc = SparkContext.getOrCreate()
# SQL entry point used further down to build DataFrames from RDDs.
sqlContext = SQLContext(sc)

### Loading Training Data

In [10]:
# Each input line is tab-separated; keep the raw split fields in `reviews`
# because a later cell builds a DataFrame from them.
reviews = (
    sc.textFile('../Filter_Data/yelp.train.rating')
    .map(lambda line: line.split("\t"))
)

# First three fields become an mllib Rating(user, product, rating).
training = reviews.map(
    lambda fields: Rating(
        user=int(fields[0]),
        product=int(fields[1]),
        rating=float(fields[2]),
    )
)

# Peek at a few parsed records.
training.take(5)

[Rating(user=0, product=0, rating=3.0),
 Rating(user=0, product=1, rating=3.0),
 Rating(user=0, product=2, rating=4.0),
 Rating(user=0, product=3, rating=4.0),
 Rating(user=0, product=4, rating=3.0)]

### Parameters for ALS

In [13]:
# Hyper-parameters passed to ALS.train: `rank` and the iteration count.
rank, numIterations = 150, 20

# Train the mllib ALS model on the parsed training ratings.
model = ALS.train(
    training,
    rank=rank,
    iterations=numIterations,
)

Running Predictions and calculating MSE on Training Data 

In [20]:
# Strip the ratings: predictAll is given only (user, product) pairs.
testdata = training.map(lambda rec: (rec.user, rec.product))

# Re-key every prediction as ((user, product), predicted_rating) so it can be
# joined against the observed ratings.
predictions = (
    model.predictAll(testdata)
    .map(lambda pred: ((pred.user, pred.product), pred.rating))
)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:57943)
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 852, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 990, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:57943)

In [None]:
# Key the observed ratings by (user, product) and join them with the
# predictions; each joined value is the pair (observed, predicted).
ratesAndPreds = (
    training
    .map(lambda rec: ((rec.user, rec.product), rec.rating))
    .join(predictions)
)

# Mean of the squared differences between observed and predicted ratings.
MSE = (
    ratesAndPreds
    .values()
    .map(lambda pair: (pair[0] - pair[1]) ** 2)
    .mean()
)
print("Mean Squared Error On Training Data= " + str(MSE))

### Loading Test Data
Running Predictions and calculating MSE on Test Data 

In [95]:
# Parse each tab-separated test line into ((user, product), rating).
# The rating is parsed with float(), exactly as for the training set, so that
# textual values such as "3.5" or "4.0" do not raise ValueError (int() would).
# NOTE(review): this path ('filter_dataset/...') differs from the training
# path ('../Filter_Data/...') — confirm both resolve from the same working dir.
test_reviews = (
    sc.textFile('filter_dataset/yelp.test.rating')
    .map(lambda l: l.split("\t"))
    .map(lambda l: ((int(l[0]), int(l[1])), float(l[2])))
)

# The keys of test_reviews are the (user, product) pairs to score.
testdata = test_reviews.keys()
# Re-key predictions as ((user, product), predicted_rating) for the join.
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

# Inner join on (user, product): each value is (observed, predicted).
ratesAndPreds = test_reviews.join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error On Test Data = " + str(MSE))

Mean Squared Error On Test Data = 2.4955397071962864


In [104]:
# Imported under an alias on purpose: a bare `ALS` here would shadow the
# pyspark.mllib ALS imported at the top of the notebook, and re-running the
# earlier `ALS.train(...)` cell would then fail because the DataFrame-based
# estimator has no `train` method.
from pyspark.ml.recommendation import ALS as ALSEstimator

# coldStartStrategy="drop" removes rows whose prediction is NaN (users/items
# that did not appear in the fitted split). Without it a single NaN makes the
# validation RMSE NaN, which makes the parameter search meaningless.
als = ALSEstimator(
    userCol="user_id",
    itemCol="buss_id",
    ratingCol="rating",
    coldStartStrategy="drop",
)

# Candidate values searched for each hyper-parameter.
rank = [100, 125, 150]
maxIter = [17, 18, 19, 20]
reg = [.15, .17, .18, .19]
# Full cross product of the three candidate lists.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, rank)
    .addGrid(als.maxIter, maxIter)
    .addGrid(als.regParam, reg)
    .build()
)

In [105]:
# RMSE between the "rating" column and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Train/validation split search: fits `als` for every entry of `param_grid`
# and scores each fit with `evaluator`.
cv = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

In [106]:
# Build the training DataFrame from the raw split fields in `reviews`.
# The rating is parsed with float(), matching how the same file is parsed for
# the RDD-based model, so decimal text such as "3.5" or "4.0" does not raise
# ValueError (int() would).
review_df = sqlContext.createDataFrame(
    reviews.map(
        lambda x: Row(user_id=int(x[0]), buss_id=int(x[1]), rating=float(x[2]))
    )
)

In [107]:
# Fit the train/validation split search over param_grid on review_df.
# NOTE(review): this rebinds `model`, which the earlier cells use as the
# ALS.train(...) result for model.predictAll(...); re-running those cells
# after this one will presumably fail — consider a distinct name.
model = cv.fit(review_df)

KeyboardInterrupt: 

In [None]:
# Keep the `bestModel` attribute of the fitted search result; the next cell
# uses it to score the test set.
best_model = model.bestModel

In [None]:
# Build the evaluation frame from `test_reviews`, whose records are
# ((user, product), rating). Two problems with building it from `testdata`:
#   * `testdata` only holds (user, product) pairs, so the frame would have no
#     "rating" column and the evaluator (labelCol="rating") could not run;
#   * reading and rebinding `testdata` in the same statement makes the cell
#     fail when it is run a second time.
testdata = sqlContext.createDataFrame(
    test_reviews.map(
        lambda x: Row(user_id=int(x[0][0]), buss_id=int(x[0][1]), rating=float(x[1]))
    )
)
# Rows the model cannot score come back with a NaN prediction (unless the
# estimator was configured to drop them); remove them so the RMSE is not NaN.
predictions = best_model.transform(testdata).na.drop(subset=["prediction"])
rmse = evaluator.evaluate(predictions)
print("RMSE with test data is = ", rmse)