# Hackathon 03: Tumbling 8-hour Time Window example
## Goal: Which 'Category' in which 8-hour Time window are SF criminals most likely to Strike? 
## (Midnight-8:00AM, 8:00AM-4:00PM, 4:00PM-Midnight)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Column names of the raw SFPD incident CSV, in file order.
_police_columns = [
    'IncidentNum', 'Category', 'Description', 'DayOfWeek', 'Date', 'Time',
    'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Loc', 'PdId',
]

# Every column is read as a nullable string; 'Date' and 'Time' are turned
# into a real timestamp in a later cell.
policeSchema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _police_columns]
)

In [0]:
# A streaming time window needs a Timestamp column (e.g. 2015-06-21T15:00:00.000+0000).
# The raw data only has separate 'Date' and 'Time' string columns, so a
# timestamp has to be built from them (done in the next cell).
csv_reader = spark.read.schema(policeSchema)
df = csv_reader.csv("dbfs:/FileStore/tables/sfpd1/")
display(df)

In [0]:
# Here's how I manufactured the 'tstmp' column with Timestamp format
from pyspark.sql.functions import *

# Start again from the raw CSV so this cell does not depend on an earlier 'df'.
raw_incidents = spark.read.schema(policeSchema).csv("dbfs:/FileStore/tables/sfpd1/")

# 'Date' is sliced by fixed position: chars 7-10 = year, 1-2 = month, 4-5 = day.
year_txt = substring("Date", 7, 4)
month_txt = substring("Date", 1, 2)
day_txt = substring("Date", 4, 2)

# Assemble "<year>-<month>-<day>T<Time>:00" as text; the cast below turns it
# into a timestamp.
iso_text = concat(
    year_txt, lit("-"), month_txt, lit("-"), day_txt,
    lit("T"), col("Time"), lit(":00"),
)

# Keep only the columns the streaming lab needs, with 'tstmp' as the last column.
df = raw_incidents.select(
    "IncidentNum",
    "Category",
    "Description",
    "PdDistrict",
    iso_text.cast("timestamp").alias("tstmp"),
)
display(df)

# Save as Parquet for the streaming cells, replacing any earlier output.
df.write.mode("overwrite").parquet("dbfs:/FileStore/tables/sfpd2/")

In [0]:
# This is the data set used in the streaming window lab.
# The printed schema should show 'tstmp' as a timestamp column.
df = spark.read.format("parquet").load("dbfs:/FileStore/tables/sfpd2/")
df.printSchema()
display(df)

In [0]:
# Here's the 'readStream'
# The stream is given an explicit schema up front. 'DDL_Schema' was referenced
# here without being defined anywhere in the notebook (NameError), so it is
# declared as a DDL string matching the columns written to sfpd2 by the earlier
# cell: four string columns plus the 'tstmp' timestamp.
DDL_Schema = (
    "IncidentNum STRING, Category STRING, Description STRING, "
    "PdDistrict STRING, tstmp TIMESTAMP"
)

ReadStreamDF = (spark.readStream
    .schema(DDL_Schema)
    # Consider at most one new file per trigger, so results build up gradually
    .option("maxFilesPerTrigger", 1)
    .parquet("dbfs:/FileStore/tables/sfpd2/"))

In [0]:
# 8-hour tumbling time window over 'tstmp', counting rows per 'Category'
# Goal: Which 8-hour time windows are criminals most likely to strike
# (Midnight-8:00AM, 8:00AM-4:00PM, 4:00PM-Midnight)?
eight_hour_window = window("tstmp", "8 hours")
AggrCat = ReadStreamDF.groupBy("Category", eight_hour_window).count()

In [0]:
# Show the streaming aggregation as it updates.
# After viewing the streaming data, be sure to click the 'Cancel' link
# before running the next cell.
display(AggrCat)

In [0]:
# Here's the 'writeStream'. The memory sink exposes the results as an
# in-memory table named 'WinCts' that the SQL cell below can query.
# NOTE(review): 2 shuffle partitions presumably suits this small data set -- confirm.
spark.conf.set("spark.sql.shuffle.partitions", "2")

# Configure the sink first, then start the query so the stream can be viewed
# in real time.
window_counts_writer = (
    AggrCat.writeStream
    .format("memory")
    .queryName("WinCts")
    .outputMode("complete")
    .trigger(processingTime="30 seconds")
)
winQuery = window_counts_writer.start()

In [0]:
%sql
-- Re-run this query every minute to see in which time windows the most crimes occur
SELECT *
FROM WinCts
ORDER BY count DESC,
         window ASC

In [0]:
# Stop the streaming job that was started by the 'writeStream' cell
winQuery.stop()

#  Deadline for email submission is Tuesday
## To save this Notebook as file, do the following:
  
#####1) In gray vertical bar on left, click on 'Workspace' icon
#####2) Find notebook, and click on 'down' arrow on right side of it
#####3) Select 'Export' > 'IPython Notebook'
#####4) Click 'Save'
#####5) Email file to following 2 email addresses: ottmk@ucmail.uc.edu;  XXXXX@mail.uc.edu;
#####   along with names of any team members so I can assign grades
#####6) You will be graded on 2 things: whether I can run all the cells without error, and whether the answer sets in the cells are correct.

# End of Hackathon 03: Tumbling 8-hour Time Window