In [None]:
from pyspark.sql import SparkSession

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, IntegerType
from time import sleep
import pandas as pd

In [None]:
def load_df(bucket_name: str = "group6_chicagocrime",
            file_name: str = "sales.csv") -> pd.DataFrame:
    """Load a CSV object from a Google Cloud Storage bucket into pandas.

    Args:
        bucket_name: Name of an existing GCS bucket holding the file.
        file_name: Object name (path inside the bucket) of the CSV file.

    Returns:
        The parsed CSV content as a ``pandas.DataFrame``.

    Raises:
        Whatever ``google.cloud.storage`` raises when the client cannot be
        created or the object cannot be opened, and whatever
        ``pandas.read_csv`` raises on malformed content.
    """
    # Imported locally so the module can still be loaded where the
    # google-cloud-storage package is not installed.
    from google.cloud import storage

    storage_client = storage.Client()

    # Reference the existing bucket instead of calling create_bucket():
    # creating a bucket whose name is already taken fails, and this function
    # only needs to read from it.
    bucket = storage_client.bucket(bucket_name)

    # A blob handle needs the object name; bucket.blob() without one raises
    # TypeError.
    blob = bucket.blob(file_name)

    # Pass the open file object to pandas. read_csv() treats a plain string
    # as a path/URL, so passing f.read() would not parse the content.
    with blob.open("r") as f:
        return pd.read_csv(f)



# Spark configuration: standalone master URL, application name, and the
# driver/executor memory and core settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab8_Ex1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# returns an already-running session if there is one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Schema of the incoming CSV records; every column is nullable.
dataSchema = (
    StructType()
    .add("uname", StringType(), True)
    .add("tname", StringType(), True)
    .add("score", IntegerType(), True)
    .add("timestamp_in_ms", LongType(), True)
    .add("readable_time", StringType(), True)
)

# Streaming CSV source: files appearing in the directory are read with the
# explicit schema, at most one file per trigger.
sdf = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .csv("/home/jovyan/data/gamescore")
)

# Keep every original column and add event_time, derived by converting the
# millisecond timestamp to seconds and casting it to a timestamp.
withEventTimedf = sdf.selectExpr(
    "*", "cast(timestamp_in_ms/1000.0 as timestamp) as event_time"
)

withEventTimedf.printSchema()

# Average score per 10-second event-time window, uname and tname.
avgscoredf = withEventTimedf.groupBy(
    window(col("event_time"), "10 seconds"), "uname", "tname"
).agg(avg("score").alias("value"))

# Project to key/value form, with key = "<uname> <tname>". The window column
# is not selected, so rows from different windows can share the same key.
resultdf = avgscoredf.select(
    concat(col("uname"), lit(" "), col("tname")).alias("key"),
    col("value"),
)

# Start the streaming query. The "memory" sink keeps the result in an
# in-memory table named after the query ("avg_score_window"); "complete"
# output mode rewrites the whole aggregated result on every trigger.
query = (
    resultdf.writeStream
    .queryName("avg_score_window")
    .format("memory")
    .outputMode("complete")
    .start()
)

# Poll the in-memory table every 10 seconds, at most 100 times.
try:
    for _ in range(100):
        spark.sql("SELECT * FROM avg_score_window").show()
        sleep(10)
except KeyboardInterrupt:
    # Interrupting the cell is the expected way to end the polling early;
    # cleanup happens in the finally clause below.
    pass
finally:
    # Always release the streaming query and the Spark context, including
    # when the loop finishes normally or another exception escapes.
    # Previously this ran only on KeyboardInterrupt, leaving both running.
    query.stop()
    spark.stop()
    print("Stoped the streaming query and the spark context")