In [1]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SparkContext
import csv
from itertools import combinations
from collections import OrderedDict,Counter
import random
from numpy.random import rand
from matplotlib import pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
def emit_idx_rating(lines,uid2idx_bc, bid2idx_bc):
    """Yield one ``Rating`` per input row, with IDs replaced by their indices.

    Args:
        lines: iterable of rows; element 0 is looked up in the user table,
            element 1 in the business table, element 2 is passed through.
        uid2idx_bc: object whose ``.value`` maps user id -> index.
        bid2idx_bc: object whose ``.value`` maps business id -> index.

    Raises:
        KeyError: if a row's user or business id is absent from its table.
    """
    user_index = uid2idx_bc.value
    business_index = bid2idx_bc.value
    for record in lines:
        yield Rating(user_index[record[0]], business_index[record[1]], record[2])

def emit_idx_rating_test(lines,uid2idx_bc, bid2idx_bc):
    """Yield one ``Rating`` per input row, using -1 for any id not in its table.

    Args:
        lines: iterable of rows; element 0 is looked up in the user table,
            element 1 in the business table, element 2 is passed through.
        uid2idx_bc: object whose ``.value`` maps user id -> index.
        bid2idx_bc: object whose ``.value`` maps business id -> index.

    Unlike ``emit_idx_rating`` this never raises for an unknown id: the
    missing side of the pair is emitted as the sentinel index -1.
    """
    UNKNOWN = -1
    for record in lines:
        user_idx = uid2idx_bc.value.get(record[0], UNKNOWN)
        business_idx = bid2idx_bc.value.get(record[1], UNKNOWN)
        yield Rating(user_idx, business_idx, record[2])

In [20]:

# Directory holding the rating CSVs; edit this one constant to relocate the data.
DATA_DIR = "/Users/liangsiqi/Documents/Dataset/yelp_rec_data"
train_file = DATA_DIR + "/yelp_train.csv"
val_file = DATA_DIR + "/yelp_val.csv"

# Number of partitions used when handing the in-memory rows to Spark.
NUM_PARTITIONS = 5


def load_ratings_csv(path):
    """Read a ratings CSV into a list of ``[user_id, business_id, stars]`` rows.

    The first row is treated as a header and skipped. Columns 0 and 1 are kept
    as strings, column 2 is parsed with ``float``. Completely empty rows are
    skipped; an empty file yields an empty list.

    Raises:
        IndexError: if a non-empty row has fewer than three columns.
        ValueError: if the third column cannot be parsed as a float.
    """
    # newline='' leaves newline handling to the csv module, as its docs recommend.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file
        return [[row[0], row[1], float(row[2])] for row in reader if row]


raw_train = load_ratings_csv(train_file)
raw_val = load_ratings_csv(val_file)

print("train data size: ", len(raw_train))
print("validation data size: ", len(raw_val))

# Each element is [user_id, business_id, stars].
sc = SparkContext.getOrCreate()
train_data = sc.parallelize(raw_train, NUM_PARTITIONS)
val_data = sc.parallelize(raw_val, NUM_PARTITIONS)

# The id lists (and therefore the index maps below) come from the training
# split only, so an id that occurs only in validation has no index.
user_ids = train_data.map(lambda x: x[0]).distinct().collect()
user_cnt = len(user_ids)
business_ids = train_data.map(lambda x: x[1]).distinct().collect()
business_cnt = len(business_ids)

# Indices are assigned by position in the collected lists, starting at 0.
uid2idx = {uid: idx for idx, uid in enumerate(user_ids)}
bid2idx = {bid: idx for idx, bid in enumerate(business_ids)}
idx2bid = {idx: bid for idx, bid in enumerate(business_ids)}

idx2bid_bc = sc.broadcast(idx2bid)
uid2idx_bc = sc.broadcast(uid2idx)
bid2idx_bc = sc.broadcast(bid2idx)

train data size:  455854
validation data size:  142044


In [21]:
# Convert raw rows to Rating objects through the broadcast id -> index maps.
# The validation converter emits -1 in place of an id it cannot find.
train_ratings = train_data.mapPartitions(lambda lines: emit_idx_rating(lines,uid2idx_bc, bid2idx_bc))
val_ratings = val_data.mapPartitions(lambda lines: emit_idx_rating_test(lines,uid2idx_bc, bid2idx_bc))

# Tally (user_missing, business_missing) flags in a single pass over
# val_ratings instead of running three separate filter+count jobs.
miss_counts = val_ratings.map(lambda r: (r[0] == -1, r[1] == -1)).countByValue()
uid_miss_num = miss_counts.get((True, False), 0)
bid_miss_num = miss_counts.get((False, True), 0)
both_id_miss_num = miss_counts.get((True, True), 0)
print("Number of missing uid, while bid exists: ", uid_miss_num)
print("Number of missing bid, while uid exists: ", bid_miss_num)
print("Number of missing both bid and uid: ", both_id_miss_num)

Number of missing uid, while bid exists:  0
Number of missing bid, while uid exists:  307
Number of missing both bid and uid:  0


In [22]:
# Build the recommendation model using Alternating Least Squares.
# Candidate hyper-parameter grids; the baseline fit below does not read them.
ranks = [2,3,4,5,6,7,8,10]
lambdas = [0.001, 0.1, 1]
numIters = [10,15,20]

# Baseline hyper-parameters, named so the training call is self-describing.
ALS_RANK = 5
ALS_ITERATIONS = 15
ALS_LAMBDA = 0.1
# Fixed seed for the fit's random initialisation, so reruns are comparable.
ALS_SEED = 42

model = ALS.train(
    train_ratings,
    rank=ALS_RANK,
    iterations=ALS_ITERATIONS,
    lambda_=ALS_LAMBDA,
    seed=ALS_SEED,
)

In [7]:
# Evaluate the model on training data
testdata = train_ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = train_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
RMSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()**0.5
print("train RMSE = " + str(RMSE))

train RMSE = 0.7240783223993239


In [8]:
# Evaluate the model on validation data
valdata = val_ratings.map(lambda p: (p[0], p[1]))
val_predictions = model.predictAll(valdata).map(lambda r: ((r[0], r[1]), r[2]))
val_ratesAndPreds = val_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(val_predictions)
RMSE = val_ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()**0.5
print("validation RMSE = " + str(RMSE))

validation RMSE = 1.1380954567893296


In [24]:
# Hyper-parameter grid axes for the search.
ranks = list(range(2, 8))
lambdas = [0.01, 0.02, 0.04, 0.08, 0.1, 0.2, 0.4, 0.7]
numIters = [10, 15, 20]
partition_num = list(range(2, 9))

# Total number of combinations = product of the axis lengths.
grid_size = 1
for axis in (ranks, lambdas, numIters, partition_num):
    grid_size *= len(axis)
print(grid_size)

1008
