In [1]:
# Standard library.
import csv
import json
from time import time

# Third-party. The original line also asked for `Spark`, a name the `pyspark`
# package does not export; that raised ImportError and aborted this cell
# before `time` and `csv` were bound, although `csv` is needed further down.
from pyspark import SparkContext

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [7]:
# Obtain the SparkSession for this notebook: an existing session is reused,
# otherwise a new one is started against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
# Toy rows of the form (id, features): each feature vector is a sparse
# 6-dimensional vector holding 1.0 at the three listed indices.
data = [
    (row_id, Vectors.sparse(6, active_indices, [1.0, 1.0, 1.0]))
    for row_id, active_indices in enumerate(([0, 1, 2], [2, 3, 4], [0, 2, 4]))
]

In [8]:
# Turn the toy rows into a two-column DataFrame named "id" and "features".
df = spark.createDataFrame(data, schema=["id", "features"])

In [10]:
# Print the DataFrame as a text table; long cell values are cut off with
# "..." as seen in the rendered output.
df.show()

+---+--------------------+
| id|            features|
+---+--------------------+
|  0|(6,[0,1,2],[1.0,1...|
|  1|(6,[2,3,4],[1.0,1...|
|  2|(6,[0,2,4],[1.0,1...|
+---+--------------------+



In [2]:
import os

# Directory holding the Yelp dataset dump. Set the YELP_DATASET_DIR
# environment variable to point at your own copy; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
# Joining with "" guarantees a trailing separator, which matters because the
# loading cells build file paths by plain concatenation (path + review_file).
path = os.path.join(
    os.environ.get("YELP_DATASET_DIR",
                   "/Users/liangsiqi/Documents/Dataset/yelp_dataset/"),
    "",
)

# File names inside the dataset directory.
user_file = "user.json"
review_file = "review.json"
business_file = "business.json"

In [8]:
# Reuse the SparkContext that is already running, or start one if none is.
sc = SparkContext.getOrCreate()

# Read each dump line by line and parse every line as a JSON document.
reviewRDD = (
    sc.textFile(path + review_file)
    .map(json.loads)
)
businessRDD = (
    sc.textFile(path + business_file)
    .map(json.loads)
)

# One [business_id, state] pair per business record.
business_state_kvRDD = businessRDD.map(
    lambda business: [business['business_id'], business['state']]
)

In [11]:
# Materialise every [business_id, state] pair of the RDD as a Python list
# on the driver.
business_state = business_state_kvRDD.collect()

In [14]:
# Lookup table business_id -> state built from the collected pairs.
# When an id occurs more than once, the pair seen last wins.
business_state_dict = {pair[0]: pair[1] for pair in business_state}

In [6]:
# Project every review down to the three fields used by the later cells;
# any other field of the parsed record is dropped.
reviewRDD_clean = reviewRDD.map(
    lambda review: {field: review[field]
                    for field in ('user_id', 'business_id', 'stars')}
)


In [20]:
# Copy the three review fields and add the state of the reviewed business,
# looked up in business_state_dict. A business_id that is absent from the
# lookup raises KeyError when the RDD is evaluated.
review_with_stateRDD = reviewRDD_clean.map(
    lambda review: dict(
        [(field, review[field]) for field in ('user_id', 'business_id', 'stars')]
        + [('state', business_state_dict[review['business_id']])]
    )
)

In [21]:
# Fetch a single record to check that the 'state' field was attached.
review_with_stateRDD.take(1)

[{'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
  'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
  'stars': 1.0,
  'state': 'NV'}]

In [35]:
# Reduce each record to a (user_id, business_id, stars) tuple, with stars
# rendered as text with one decimal place, and bring all tuples to the driver.
possible_subset = (
    review_with_stateRDD
    .map(lambda rec: (rec['user_id'],
                      rec['business_id'],
                      "%.1f" % rec['stars']))
    .collect()
)

In [30]:
# Text lines for the subset: the header first, then one comma-joined line per
# collected tuple. extend() consumes the generator directly; the previous
# version ran a list comprehension only for its append() side effect, which
# also built a throwaway list of None as long as possible_subset.
to_write = ["user_id, business_id, stars"]
to_write.extend(",".join(row) for row in possible_subset)

In [37]:
# Dump the collected tuples to ./possible_subset.csv.
# - newline='' is what the csv module asks for when opening its output file;
#   without it the writer's "\r\n" row terminator can be translated again by
#   the text layer (extra blank lines on Windows).
# - The header goes through the same writer as the rows, so it uses the same
#   delimiter and line terminator. It was previously printed as
#   "user_id, business_id, stars", which left a leading space in the second
#   and third column names and ended the header with a different newline.
with open('./possible_subset.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(["user_id", "business_id", "stars"])
    writer.writerows(possible_subset)


In [40]:
# 20% of the number of collected rows.
# NOTE(review): 0.2 is presumably the intended share for a held-out split - confirm.
len(possible_subset)*0.2

1337180.0

In [43]:
def _load_csv_rows(csv_path):
    """Load a CSV file as an RDD of string lists, skipping the header.

    The first line of the file is taken as the header, and every line equal
    to it is filtered out. Each remaining line is stripped of surrounding
    whitespace and split on ','.

    Args:
        csv_path: Path of the CSV file, passed to ``sc.textFile``.

    Returns:
        An RDD with one list of field strings per data line.
    """
    lines = sc.textFile(csv_path)
    # Kept in a local so each returned RDD filters on its own file's header.
    # The earlier copy-pasted version read a shared global `header`, which
    # was rebound to the validation header before any job ran.
    header_line = lines.first()
    return (
        lines
        .filter(lambda line: line != header_line)
        .map(lambda line: line.strip().split(','))
    )


train_data = _load_csv_rows("../data/yelp_train.csv")
val_data = _load_csv_rows("../data/yelp_val.csv")