In [1]:
import findspark
findspark.init()

import pyspark  # only run after findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Build the configuration BEFORE creating the session and hand it to the
# builder. Previously `conf` was constructed after the session already
# existed and was never passed to anything, so the app name and master
# configured here were silently ignored.
# NOTE(review): "local" runs Spark with a single worker thread; switch to
# "local[*]" if all cores should be used.
appName = "ReLU"
conf = SparkConf().setAppName(appName).setMaster("local")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Reuse the context owned by the session instead of asking for a second,
# unconfigured getOrCreate().
sc = spark.sparkContext

# Smoke test: run a trivial query to confirm the session works.
df = spark.sql('''select 'spark' as hello ''')
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [2]:

from pyspark.sql.types import StructType, StructField, TimestampType, StringType, DoubleType

# Explicit schema for the review records; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("review_id", StringType()),
        ("date", TimestampType()),
        ("user_id", StringType()),
        ("business_id", StringType()),
        ("stars", DoubleType()),
    ]
])

# Read the JSON file with the schema above, parsing `date` with the given
# timestamp pattern.
data_set = (
    spark.read
    .schema(schema)
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
    .json("./mini_review.json")
)

In [3]:
import pyspark.sql

In [4]:
# Expose the raw reviews to Spark SQL under the name "reviews".
data_set.createOrReplaceTempView("reviews")

# Keep only the most recent review for every (user_id, business_id) pair.
#
# A max(date) self-join keeps *all* rows that tie on the latest timestamp,
# so one pair could produce several rows (and therefore several entries for
# the same user/business coordinate later on). Ranking the rows inside each
# pair and keeping rank 1 guarantees exactly one row per pair; review_id is
# used as a tie-break so the choice is stable.
#
# The IS NOT NULL filters reproduce what the equality join did implicitly:
# rows with a null date, user_id or business_id never matched and were
# dropped.
last_data_set = spark.sql("""
    SELECT review_id, date, user_id, business_id, stars
    FROM (
        SELECT R.review_id, R.date, R.user_id, R.business_id, R.stars,
               row_number() OVER (
                   PARTITION BY R.user_id, R.business_id
                   ORDER BY R.date DESC, R.review_id
               ) AS rn
        FROM reviews AS R
        WHERE R.date IS NOT NULL
          AND R.user_id IS NOT NULL
          AND R.business_id IS NOT NULL
    ) ranked
    WHERE rn = 1
""")

In [5]:
# Row count of the full review set (select() keeps every row, so this is
# the number of reviews read from the file, not the deduplicated set).
data_set.select(["business_id", "user_id"]).count()

99836

In [6]:
from scipy import sparse
import numpy as np

In [7]:
def _distinct_values(frame, column):
    """Return the distinct values of ``column`` in ``frame`` as a 1-D NumPy array.

    The values are collected to the driver as plain scalars and the array is
    built once. This avoids building one array per partition and
    concatenating them, where an empty partition produces an empty float64
    array that then has to be promoted against the string arrays.

    Args:
        frame: Spark DataFrame to read from.
        column: Name of the column whose distinct values are wanted.

    Returns:
        numpy.ndarray holding one element per distinct value.
    """
    values = frame.select(column).distinct().rdd.map(lambda row: row[0]).collect()
    return np.array(values)


# Distinct user ids and a lookup from user id to its position in the array.
unique_users = _distinct_values(last_data_set, "user_id")
dictOfUsers = {user_id: index for index, user_id in enumerate(unique_users)}

# Same for businesses.
unique_business = _distinct_values(last_data_set, "business_id")
dictOfBusiness = {business_id: index for index, business_id in enumerate(unique_business)}

In [8]:
from pyspark.sql.functions import udf

def translate(mapping, return_type=None):
    """Build a Spark UDF that looks each column value up in ``mapping``.

    Args:
        mapping: Object with a ``get`` method (e.g. a dict) mapping original
            values to their replacements. It is captured by the UDF closure.
        return_type: Spark SQL data type of the resulting column. Defaults to
            ``StringType()``, which is the historical behaviour: integer
            values in ``mapping`` then come back as strings and must be cast
            by the caller. Pass e.g. ``IntegerType()`` to get a numeric
            column directly.

    Returns:
        A UDF to apply to a column. Values absent from ``mapping`` yield
        ``None`` from ``mapping.get``.
    """
    if return_type is None:
        return_type = StringType()

    def translate_(col):
        return mapping.get(col)

    return udf(translate_, return_type)

# Attach the dictionary-derived ids for users and businesses as two new
# columns next to the original string ids.
last_data_set2 = (
    last_data_set
    .withColumn('int_id_user', translate(dictOfUsers)('user_id'))
    .withColumn('int_id_business', translate(dictOfBusiness)('business_id'))
)

In [9]:
# Narrow the frame to the three fields used to build the matrix entries.
temp = last_data_set2.select(["int_id_user", "int_id_business", "stars"])

In [10]:
# Mark the narrowed frame for caching with the MEMORY_ONLY storage level.
temp.persist(storageLevel=pyspark.StorageLevel.MEMORY_ONLY)

DataFrame[int_id_user: string, int_id_business: string, stars: double]

In [11]:
def _to_entry(row):
    """Turn a (user id, business id, stars) row into an (int, int, stars) tuple."""
    user_idx, business_idx, stars = row
    # The id columns are strings in `temp`, so cast them to int here.
    return (int(user_idx), int(business_idx), stars)


tuples = temp.rdd.map(_to_entry)
tuples.persist(storageLevel=pyspark.StorageLevel.MEMORY_AND_DISK)

PythonRDD[77] at RDD at PythonRDD.scala:53

In [13]:
# Peek at the first ten (user index, business index, stars) tuples.
tuples.take(num=10)

[(16081, 4680, 5.0),
 (46249, 12102, 5.0),
 (51911, 1921, 4.0),
 (67241, 11972, 3.0),
 (59105, 9979, 4.0),
 (79531, 3403, 4.0),
 (5157, 1407, 5.0),
 (44602, 712, 4.0),
 (54734, 4802, 3.0),
 (4761, 5002, 4.0)]

In [14]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
# Build the distributed coordinate-format matrix from the
# (row index, column index, value) tuples.
smat = CoordinateMatrix(entries=tuples)

In [15]:
# Dimensions reported by the coordinate matrix: rows come from the user
# indices, columns from the business indices.
m, n = smat.numRows(), smat.numCols()
print(m, n)

79906 13951
