In [1]:
from pyspark import SparkContext
import csv
from collections import OrderedDict
import random
from numpy.random import rand

In [2]:
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()

In [9]:
input_file = "../data/yelp_train.csv"
minPartition = 3  # optional partition hint for sc.textFile; currently not passed
raw_data = sc.textFile(input_file)  # alternative: sc.textFile(input_file, minPartition)
header = raw_data.first()
# Drop every line equal to the header and parse the rest as CSV records.
# The parsed RDD feeds several actions in this cell (take, count, distinct,
# min) and the rows/columns grouping afterwards, so cache it instead of
# re-reading and re-parsing the file for each action.
clean_data = (
    raw_data.filter(lambda x: x != header)
    .mapPartitions(lambda x: csv.reader(x))
    .cache()
)

print("Raw data example:")
print(header)
for line in clean_data.take(3):
    print(line)
print("\nTotal number in '%s': %d" % (input_file, clean_data.count()))

# distinct().collect() returns the ids in an arbitrary, partition-dependent
# order. Sorting makes the id -> index tables built from these lists (and
# everything derived from the indices) identical on every run.
user_ids = sorted(clean_data.map(lambda x: x[0]).distinct().collect())
business_ids = sorted(clean_data.map(lambda x: x[1]).distinct().collect())
rating = clean_data.map(lambda x: float(x[2]))
print("User id numbers: %d" % len(user_ids))
print("Business id numbers: %d" % len(business_ids))
print("Min value of rating: %.1f" % rating.min())

# Lookup tables between raw ids and dense 0-based indices; an id's index is
# its position in user_ids / business_ids.
uid2idx = dict(zip(user_ids, range(len(user_ids))))
bid2idx = dict(zip(business_ids, range(len(business_ids))))
# Reverse table for businesses only: index -> business id.
idx2bid = dict(enumerate(business_ids))

def id_to_index(iteration, order="u,b"):
    """Translate raw reviews into ``[key_index, [value_index]]`` pairs.

    The first field of each review is looked up in the module-level
    ``uid2idx`` table and the second field in ``bid2idx``.

    Args:
        iteration: iterable of reviews; ``review[0]`` is a user id and
            ``review[1]`` is a business id.
        order: ``"u,b"`` yields ``[uidx, [bidx]]``; ``"b,u"`` yields
            ``[bidx, [uidx]]``.

    Yields:
        A two-element list: the key index followed by a one-item list holding
        the other index.

    Raises:
        ValueError: if ``order`` is not one of the two supported values. The
            check runs when the generator is first advanced; previously an
            unknown order silently produced no output.
        KeyError: if an id is missing from its lookup table.
    """
    if order not in ("u,b", "b,u"):
        raise ValueError("order must be 'u,b' or 'b,u', got %r" % (order,))
    user_first = order == "u,b"
    # Consume the input lazily: copying the whole partition into a list first
    # only costs memory and does not change the result.
    for review in iteration:
        uidx = uid2idx[review[0]]
        bidx = bid2idx[review[1]]
        if user_first:
            yield [uidx, [bidx]]
        else:
            yield [bidx, [uidx]]

def sort_idxs(iteration):
    """Sort the index list of every ``(key, indices)`` record.

    Args:
        iteration: iterable of records where ``record[0]`` is a key and
            ``record[1]`` is an iterable of sortable indices.

    Yields:
        ``[key, sorted_indices]`` with ``sorted_indices`` a new ascending list.
    """
    # Stream record by record; there is no need to hold the whole partition
    # in a list before producing the first result.
    for line in iteration:
        yield [line[0], sorted(line[1])]
        
def _grouped_sorted_indices(order):
    """Group clean_data by the leading index of `order` and sort each group's index list."""
    return (
        clean_data
        .mapPartitions(lambda part: id_to_index(part, order))
        .reduceByKey(lambda left, right: left + right)
        .mapPartitions(sort_idxs)
    )

# One record per user: [uidx, [bidx1, bidx2, ...]], business indices ascending.
rows = _grouped_sorted_indices("u,b")
rows.persist()  # TODO: confirm that persisting is worthwhile here
# One record per business: [bidx, [uidx1, uidx2, ...]], user indices ascending.
columns = _grouped_sorted_indices("b,u")
columns.persist()  # TODO: confirm that persisting is worthwhile here

Raw data example:
user_id, business_id, stars
['vxR_YV0atFxIxfOnF9uHjQ', 'gTw6PENNGl68ZPUpYWP50A', '5.0']
['o0p-iTC5yTBV5Yab_7es4g', 'iAuOpYDfOTuzQ6OPpEiGwA', '4.0']
['-qj9ouN0bzMXz1vfEslG-A', '5j7BnXXvlS69uLVHrY9Upw', '2.0']

Total number in '../data/yelp_train.csv': 455854
User id numbers: 11270
Business id numbers: 24732
Min value of rating: 1.0


In [7]:
# Draw the (a, b) coefficient pairs consumed by cal_hash: hash_num pairs,
# every value in [1, business_cnt], with all `a` values distinct.
# NOTE(review): the coefficients are bounded by business_cnt although the keys
# hashed later are user indices - confirm this is intended.
random.seed(12345)  # fixed seed so the same coefficients are drawn each run
user_cnt = len(user_ids)
business_cnt = len(business_ids)
hash_num = 100
a_values, b_values = [], []
for i in range(hash_num):
    # Redraw until the multiplier differs from every one chosen so far.
    while True:
        a_r = random.randint(1, business_cnt)
        if a_r not in a_values:
            break
    a_values.append(a_r)
    b_values.append(random.randint(1, business_cnt))

    
    
# [uidx, [h1(uidx), h2(uidx), ...]]
def cal_hash(iters, a_list, b_list, hash_num):
    """Evaluate a family of linear hash functions on every key.

    The i-th hash of a key ``x`` is ``(x * a_list[i] + b_list[i]) % hash_num``.

    Args:
        iters: iterable of numeric keys.
        a_list: multipliers, one per hash function.
        b_list: offsets, paired with ``a_list`` position by position (entries
            beyond the shorter of the two lists are ignored).
        hash_num: modulus applied to every hash value. Despite the name it
            does not control how many hashes are produced; that is set by the
            length of the coefficient lists.

    Yields:
        ``(x, [h_1(x), h_2(x), ...])`` for each key ``x``.
    """
    # Pair the coefficients once rather than once per key.
    coefficients = list(zip(a_list, b_list))
    # Stream the keys; copying the partition into a list first is unnecessary.
    for x in iters:
        yield (x, [(x * a + b) % hash_num for a, b in coefficients])

# {uidx1: [h1(uidx1), h2(uidx1), ...],
#  uidx2: [h1(uidx2), h2(uidx2), ...],
#  ...}
# Each hash function stands in for a permutation of the user indices, so the
# values are reduced modulo the number of users (user_cnt). Reducing modulo
# hash_num (the number of hash functions) squeezed all users into that many
# buckets, so most columns shared the same small minima and the signatures
# could barely tell businesses apart.
hash_values = (
    rows.keys()
    .mapPartitions(lambda iters: cal_hash(iters, a_values, b_values, user_cnt))
    .collectAsMap()
)

In [49]:
# Wrap the uidx -> hash-list map in a Spark broadcast variable; min_hash reads it through `.value`.
hash_values_bc = sc.broadcast(hash_values)
def min_hash(iters, hash_bc, hash_num):
    """Build the min-hash signature of every business column.

    Args:
        iters: iterable of records where ``record[0]`` is a business index and
            ``record[1]`` is a re-iterable collection (e.g. a list) of user
            indices for that business.
        hash_bc: object whose ``.value`` maps a user index to its list of
            hash values (a Spark broadcast in this notebook).
        hash_num: number of signature entries to produce; every hash list must
            hold at least this many values.

    Yields:
        ``(bidx, signature)`` where ``signature[i]`` is the minimum of the
        i-th hash value over the business's users.

    Raises:
        ValueError: if a record has no user indices and ``hash_num`` > 0.
        KeyError: if a user index is missing from the hash table.
        IndexError: if a hash list is shorter than ``hash_num``.
    """
    # Stream the records instead of copying the whole partition into a list.
    for business_col in iters:
        bidx = business_col[0]
        hash_table = hash_bc.value
        # Keep references to the users' hash lists rather than copying every
        # value into hash_num temporary lists.
        hash_rows = [hash_table[uidx] for uidx in business_col[1]]
        signature_col = [
            min(row[i] for row in hash_rows) for i in range(hash_num)
        ]
        yield (bidx, signature_col)

# One (bidx, [sig_1, sig_2, ...]) record per business column.
signature_mat = columns.mapPartitions(
    lambda partition: min_hash(partition, hash_values_bc, hash_num)
)

In [44]:
# Micro-benchmark: two ways of taking the minimum of every inner list of a
# hash_num x business_cnt nested list of random floats.
mat = rand(hash_num,business_cnt).tolist()

%timeit list(map(min,mat))
%timeit [min(i) for i in mat]

24.4 ms ± 651 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
23.8 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [57]:
# r: candidate values, each dividing 120 exactly.
# b: 120 divided by the matching r.
# v: (1/b) ** (1/r) for every (b, r) pair - presumably the LSH similarity
#    threshold for b bands of r rows over a 120-entry signature; verify.
r = [1, 2, 3, 4, 5, 8, 10, 20, 30, 60, 120]
b = [120 // rows_per_band for rows_per_band in r]
v = [
    (1 / band_cnt) ** (1 / rows_per_band)
    for band_cnt, rows_per_band in zip(b, r)
]

In [58]:
# Show the computed list as the cell output.
v

[0.008333333333333333,
 0.12909944487358055,
 0.2924017738212866,
 0.42728700639623407,
 0.5296119205244061,
 0.7128343062413697,
 0.7799771419043033,
 0.914307826761828,
 0.9548416039104165,
 0.9885140203528962,
 1.0]