In [39]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType
from time import sleep
from pyspark.sql.functions import col, size, length

# Cluster and application settings, assembled as one fluent chain.
# spark.sql.streaming.checkpointLocation supplies the default checkpoint
# directory for streaming queries started from this session.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("airbnb_reviews")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
    .set("spark.sql.streaming.checkpointLocation", '/home/jovyan/checkpoint')
)

# The SparkSession is the entry point to the Spark SQL engine; an already
# running session is reused instead of creating a second one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()


# Explicit schema for the review CSV files (streaming sources need one up
# front). Columns are listed in file order and all are declared nullable.
dataSchema = (
    StructType()
    .add("index", LongType(), True)
    .add("Renter_ID", LongType(), True)
    .add("Listing_ID", LongType(), True)
    .add("Review", StringType(), True)
    .add("Date", StringType(), True)
)


# Register the Google Cloud Storage connector classes on the session's Hadoop
# configuration so that gs:// paths can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _gcs_key, _gcs_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_gcs_key, _gcs_class)

# Streaming source: CSV files (with a header row) under the Reviews folder of
# the GCS bucket, parsed with the explicit schema. maxFilesPerTrigger=1 limits
# each micro-batch to a single new file.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .csv("gs://assignment2_airbnb/airbnb_data/Reviews")
)

# Transformation: project the four review columns (the "index" column is
# dropped), then keep only rows that have review text and whose Listing_ID is
# below 10000000000.
reviews = (
    sdf.select(['Renter_ID', 'Listing_ID', 'Review', 'Date'])
    .where("Review is not NULL")
    .where("Listing_ID <10000000000")
)


# Sink: start a streaming query that appends every micro-batch as CSV files
# (each with a header row) under Results_stream in the GCS bucket. No query
# name is set and the sink is not "memory", so no Spark SQL table is
# registered for this query.
activityQuery = (
    reviews.writeStream
    .outputMode("append")
    .format("csv")
    .option("header", "true")
    .option("path", "gs://assignment2_airbnb/Results_stream")
    .start()
)
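# Testing (only works with a "memory" sink whose queryName is "reviews"; the CSV sink above registers no SQL table)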
#for x in range(10):
#    spark.sql("SELECT * FROM reviews").show()
#    sleep(5)

In [38]:
# Explicitly stop every streaming query still running on this session before
# tearing the session down, instead of leaving running queries to fail when
# the SparkContext goes away underneath them.
for active_query in spark.streams.active:
    active_query.stop()

# Release the SparkContext and the cluster resources held by this application.
spark.stop()
