In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import expr, col, from_json
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType
from pyspark.sql.functions import *

In [2]:
# Local Spark session (master "local[3]") used by the streaming cells below.
spark = (
    SparkSession.builder
    .appName("File Streaming Demo")
    .master("local[3]")
    .config("spark.streaming.stopGracefullyOnShutdown", "true")
    .getOrCreate()
)

In [3]:
# Schema used to parse the JSON payload of each Kafka message.
# Each entry is (field name, Spark type class); every field is nullable
# (the StructField default), and the order below is the field order.
_FIELD_TYPES = [
    ("matchday", StringType),
    ("minute", LongType),
    ("teamId_1", StringType),
    ("teamId_2", StringType),
    ("goal_1", IntegerType),
    ("goal_2", IntegerType),
    ("poss_1", IntegerType),
    ("poss_2", IntegerType),
    ("passes_attp_1", IntegerType),
    ("pass_comp_1", DoubleType),
    ("shots_attp_1", IntegerType),
    ("shots_trg_1", IntegerType),
    ("passes_attp_2", IntegerType),
    ("pass_comp_2", DoubleType),
    ("shots_attp_2", IntegerType),
    ("shots_trg_2", IntegerType),
    ("date", StringType),
    ("timestamp", StringType),
]

schema = StructType(
    [StructField(field_name, field_type()) for field_name, field_type in _FIELD_TYPES]
)

In [4]:
# Streaming source: subscribe to the "matchesF" topic on the broker at
# localhost:9092, starting from the earliest available offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "matchesF")
    .option("startingOffsets", "earliest")
    .load()
)

In [5]:
# Inspect the columns exposed by the Kafka source; the message payload
# arrives in the binary `value` column, which is parsed in the next step.
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
# Decode the binary Kafka payload as a string and parse it as JSON with
# `schema`, yielding a single struct column named "value".
parsed_value = from_json(col("value").cast("string"), schema).alias("value")
value_df = kafka_df.select(parsed_value)

In [7]:
# Confirm that the payload was parsed into one struct column, "value",
# whose fields follow `schema`.
value_df.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- matchday: string (nullable = true)
 |    |-- minute: long (nullable = true)
 |    |-- teamId_1: string (nullable = true)
 |    |-- teamId_2: string (nullable = true)
 |    |-- goal_1: integer (nullable = true)
 |    |-- goal_2: integer (nullable = true)
 |    |-- poss_1: integer (nullable = true)
 |    |-- poss_2: integer (nullable = true)
 |    |-- passes_attp_1: integer (nullable = true)
 |    |-- pass_comp_1: double (nullable = true)
 |    |-- shots_attp_1: integer (nullable = true)
 |    |-- shots_trg_1: integer (nullable = true)
 |    |-- passes_attp_2: integer (nullable = true)
 |    |-- pass_comp_2: double (nullable = true)
 |    |-- shots_attp_2: integer (nullable = true)
 |    |-- shots_trg_2: integer (nullable = true)
 |    |-- date: string (nullable = true)
 |    |-- timestamp: string (nullable = true)



In [8]:
# Fields of the parsed "value" struct that are promoted to top-level
# columns, in output order.
_MATCH_FIELDS = [
    "matchday", "teamId_1", "teamId_2",
    "minute",
    "goal_1",
    "goal_2",
    "poss_1",
    "poss_2",
    "pass_comp_1",
    "pass_comp_2",
    "date",
    "timestamp",
]

explode_df = (
    value_df
    .selectExpr(*[f"value.{field}" for field in _MATCH_FIELDS])
    # The event time arrives as text; convert it to a real timestamp.
    .withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss"))
    # 90-minute watermark: let's assume there is no half-time break.
    .withWatermark("timestamp", "90 minute")
)

In [9]:
# Running statistics per match: one result row for each
# (matchday, teamId_1, teamId_2) combination.
#
# The 90-minute watermark on "timestamp" is already attached where
# explode_df is built, so it is not declared a second time here.
# Aggregates are referenced through the `F` alias so the Python builtins
# `max` and `sum` are not confused with their Spark column counterparts.
res_df = (
    explode_df
    .groupBy("matchday", "teamId_1", "teamId_2")
    .agg(
        F.max("minute").alias("minute"),                # latest minute seen
        F.sum("goal_1").alias("goal_1"),                # summed goal_1 values
        F.sum("goal_2").alias("goal_2"),                # summed goal_2 values
        F.mean("poss_1").alias("avg_poss_1"),
        F.mean("poss_2").alias("avg_poss_2"),
        F.mean("pass_comp_1").alias("avg_pass_comp_1"),
        F.mean("pass_comp_2").alias("avg_pass_comp_2"),
    )
)

In [None]:
# Print the results to the console.
# "complete" output mode is used with a 5-minute processing-time trigger;
# progress is checkpointed under ./checkpoint/tf.
window_query = (
    res_df.writeStream
    .format("console")
    .outputMode("complete")
    .option("checkpointLocation", "./checkpoint/tf")
    .trigger(processingTime="5 minute")
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    window_query.awaitTermination()
finally:
    # Always stop the query, including on interrupt or failure, so it does not
    # keep running in the background and hold the checkpoint directory when
    # the cell is re-run.
    window_query.stop()

In [None]:
# Disabled alternative sink: write the aggregated results as JSON files under
# ./output instead of printing them to the console.
# NOTE(review): Spark's file sinks are documented as supporting append output
# mode only -- confirm that outputMode("update") is accepted before re-enabling.
#proc_match_query = res_df.writeStream \
#        .format("json") \
#        .queryName("Proc Matches") \
#        .outputMode("update") \
#        .option("path", "./output") \
#        .option("checkpointLocation", "./checkpoints/checkpoint-stream-kafka") \
#        .trigger(processingTime="1 minute") \
#        .start()
#proc_match_query.awaitTermination()