In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as func
from pyspark.sql.types import StructType, StructField, StringType, LongType, DateType

In [2]:
# Build (or reuse) the "YelpHelp" Spark session against a local master,
# requesting 16g for both the executor and driver memory settings.
spark = (
    SparkSession.builder
    .appName("YelpHelp")
    .master("local")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

In [4]:
# Explicit schema for the tip records: the two id columns are declared
# non-nullable, the remaining columns are declared nullable.
u_schema = (
    StructType()
    .add("user_id", StringType(), False)
    .add("business_id", StringType(), False)
    .add("text", StringType(), True)
    .add("compliment_count", LongType(), True)
    .add("date", DateType(), True)
)

# Read the tip JSON file using that schema, then preview the first five rows.
userDataset = spark.read.schema(u_schema).json("../yelp_dataset/yelp_academic_dataset_tip.json")
userDataset.show(5)

+--------------------+--------------------+--------------------+----------------+----------+
|             user_id|         business_id|                text|compliment_count|      date|
+--------------------+--------------------+--------------------+----------------+----------+
|hf27xTME3EiCp6NL6...|UYX5zL_Xj9WEc_Wp-...|Here for a quick mtg|               0|2013-11-26|
|uEvusDwoSymbJJ0au...|Ch3HkwQYv1YKw_FO0...|Cucumber strawber...|               0|2014-06-15|
|AY-laIws3S7YXNl_f...|rDoT-MgxGRiYqCmi0...|Very nice good se...|               0|2016-07-18|
|Ue_7yUlkEbX4AhnYd...|OHXnDV01gLokiX1EL...|It's a small plac...|               0|2014-06-06|
|LltbT_fUMqZ-ZJP-v...|GMrwDXRlAZU2zj5nH...|8 sandwiches, $24...|               0|2011-04-08|
+--------------------+--------------------+--------------------+----------------+----------+
only showing top 5 rows



In [7]:
# Preview the five tips with the highest compliment_count (descending order).
userDataset.orderBy(func.desc("compliment_count")).show(5)

+--------------------+--------------------+--------------------+----------------+----------+
|             user_id|         business_id|                text|compliment_count|      date|
+--------------------+--------------------+--------------------+----------------+----------+
|wTfb2nfzPIyFcYQAr...|gwdQwe1JHLe-vPY9P...|Ending your pet's...|              15|2016-11-15|
|wTfb2nfzPIyFcYQAr...|Rrzm4bQDrrUnDyA5e...|1st Pet was very ...|              12|2016-09-14|
|8DEyKVyplnOcSKx39...|j7zJxmr8BfYJhC3KW...|License photograp...|              11|2015-09-17|
|Fv0e9RIV9jw5TX3ct...|QJ6GXAAMkgCZPF0bZ...|Heads up.... The ...|               9|2015-06-18|
+--------------------+--------------------+--------------------+----------------+----------+
only showing top 5 rows



In [None]:
# Derive a `year` column from `date` and repartition the rows by that column
# before writing.
dataset = userDataset.withColumn('year', func.year("date")).repartition("year")

# Write the tips as JSON, split into one sub-directory per `year` value.
# The save mode is set to "overwrite" so that re-running this cell replaces the
# previous export instead of failing because the output path already exists
# (the default save mode raises an error in that case).
dataset.write.mode("overwrite").partitionBy("year").json("../YelpDatasetYearly/Tip/yelp_academic_dataset_tip")
