In [154]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import from_json, col

In [155]:
# Schema applied to the JSON payload of each Kafka record (see the from_json
# call in a later cell). Every field is declared nullable.
dataSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Case Number", StringType()),
        ("Date", DateType()),
        ("Block", StringType()),
        ("Primary Type", StringType()),
        ("Description", StringType()),
        ("District", StringType()),
        ("Community Area", StringType()),
        ("weeknumber", IntegerType()),
        ("year", IntegerType()),
    )
])

# Spark configuration: master URL, application name, and driver/executor sizing.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Lab7_1")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Bucket name registered below as the session's "temporaryGcsBucket".
# NOTE(review): a later cell reassigns `bucket` and sets this option again —
# confirm which bucket is actually intended.
bucket = "chicagocrime-bigquery-temp-storage"

# Create (or reuse) the Spark session, the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
spark.conf.set("temporaryGcsBucket", bucket)

In [156]:
# Streaming source: subscribe to the "crimes2" Kafka topic on kafka1:9093,
# starting from the latest offsets, with failOnDataLoss switched off.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka1:9093")
    .option("subscribe", "crimes2")
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .load()
)

In [157]:
# Keep the Kafka metadata columns, cast the payload to a string, then replace
# the "value" column with the struct parsed from that JSON using dataSchema.
sdf = (
    df.selectExpr(
        "key",
        "timestamp",
        "topic",
        "partition",
        "offset",
        "CAST(value AS STRING)",
    )
    .withColumn("value", from_json(col("value"), dataSchema))
)
# Disabled prototype of a windowed count per community area. It refers to
# `df2` and `window`, neither of which is defined/imported in the cells shown.
# df_withEventTime = df2.withWatermark("timestamp", "1 minutes")\
#     .groupBy(window(col('timestamp'), '10 seconds'), 'value.`Community Area`').count()

In [158]:
from pyspark.sql.functions import explode, split, concat, col, lit
# One output row per space-separated token of the crime description, carrying
# the week number, year and community area of the record it came from.
words = sdf.select(
    col("value.weeknumber"),
    col("value.year"),
    col("value.Community Area"),
    explode(split(sdf["value"]["Description"], " ")).alias("word"),
)

# Running count of each word per (year, week, community area).
word_count = (
    words
    .groupBy("year", "weeknumber", "Community Area", "word")
    .count()
)
word_count.printSchema()

root
 |-- year: integer (nullable = true)
 |-- weeknumber: integer (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- word: string (nullable = false)
 |-- count: long (nullable = false)



In [159]:
# Bucket for the assignment; registered as the session's temporary GCS bucket.
bucket = "group6_chicagocrime"
spark.conf.set("temporaryGcsBucket", bucket)

# Register the filesystem implementations for the gs:// scheme on the
# Hadoop configuration of the running Spark context.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for fs_key, fs_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(fs_key, fs_class)

def my_foreach_batch_function(df, batch_id):
    """Append one micro-batch to the BigQuery word-count table.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; not used by this function.

    The temporary GCS bucket is taken from the module-level ``bucket``
    variable at call time.
    """
    writer = df.write.format("bigquery").mode("append")
    writer = writer.option("table", "datatengineering-group6.crimedescription.wordcounts")
    writer = writer.option("temporaryGcsBucket", bucket)
    writer.save()
    
# Start the streaming query: every micro-batch of word counts is handed to
# my_foreach_batch_function, which writes it to a BigQuery table.
# NOTE(review): "update" output mode feeding an appending sink — presumably the
# table ends up with several rows per key as counts change; confirm that
# downstream consumers take the latest/maximum count.
activityQuery = (
    word_count.writeStream
    .foreachBatch(my_foreach_batch_function)
    .outputMode("update")
    .option("checkpointLocation", "/home/jovyan/checkpoint/crimes")
    .start()
)

In [None]:
# Stop any streaming queries that are still active before shutting the session
# down, so they end through an explicit stop() instead of being cut off when
# the session they run on disappears.
for active_query in spark.streams.active:
    active_query.stop()
spark.stop()