In [1]:
#import pandas as pd

#in_csv = 'conflicts.csv'
#number_lines = sum(1 for row in (open(in_csv)))
#rowsize = 5000
#for i in range(1,number_lines,rowsize):
#    df = pd.read_csv(in_csv,
#          header=None,
#          nrows = rowsize,#number of rows to read at each loop
#          skiprows = i)
#    out_csv = 'conflicts' + str(i) + '.csv'
#    df.to_csv(out_csv,
#          index=False,
#          header=False,
#          mode='a',#append data to csv file
#          chunksize=rowsize)#size of data to append for each loop

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Obtain the Spark session for this notebook under the application name
# "ConflictsApp" (getOrCreate hands back an existing session if there is one).
session_builder = SparkSession.builder.appName("ConflictsApp")
spark = session_builder.getOrCreate()

In [24]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Connection settings for the Kafka source: local broker, the "conflicts"
# topic, starting from the earliest offsets.
kafka_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "subscribe": "conflicts",
    "startingOffsets": "earliest",
}

# Build the streaming reader and apply each option in turn.
kafka_reader = spark.readStream.format("kafka")
for option_name, option_value in kafka_options.items():
    kafka_reader = kafka_reader.option(option_name, option_value)

# Raw Kafka records as a streaming DataFrame.
dfraw = kafka_reader.load()
dfraw.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
# Layout of one JSON message as (field name, Spark type) pairs, in order.
# The first field has an empty name; every field is declared nullable below.
conflict_fields = [
    ("", T.StringType()),
    ("ISO", T.StringType()),
    ("EVENT_DATE", T.StringType()),
    ("EVENT_TYPE", T.StringType()),
    ("SUB_EVENT_TYPE", T.StringType()),
    ("ACTOR1", T.StringType()),
    ("ASSOC_ACTOR_1", T.StringType()),
    ("ACTOR2", T.StringType()),
    ("ASSOC_ACTOR_2", T.StringType()),
    ("INTERACTION", T.IntegerType()),
    ("REGION", T.StringType()),
    ("COUNTRY", T.StringType()),
    ("ADMIN1", T.StringType()),
    ("ADMIN2", T.StringType()),
    ("LOCATION", T.StringType()),
    ("SOURCE", T.StringType()),
    ("NOTES", T.StringType()),
    ("FATALITIES", T.StringType()),
]
schema = T.StructType(
    [
        T.StructField(field_name, field_type, True)
        for field_name, field_type in conflict_fields
    ]
)

# Step 1: cast the binary Kafka value to a string and keep the record timestamp.
decoded = dfraw.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS TIMESTAMP)")
# Step 2: parse the JSON text against `schema` into a struct column "data".
parsed = decoded.select(F.from_json("value", schema).alias("data"), "timestamp")
# Step 3: flatten the struct so each field is a top-level column.
df = parsed.select("data.*", "timestamp")
df.printSchema()

root
 |-- : string (nullable = true)
 |-- ISO: string (nullable = true)
 |-- EVENT_DATE: string (nullable = true)
 |-- EVENT_TYPE: string (nullable = true)
 |-- SUB_EVENT_TYPE: string (nullable = true)
 |-- ACTOR1: string (nullable = true)
 |-- ASSOC_ACTOR_1: string (nullable = true)
 |-- ACTOR2: string (nullable = true)
 |-- ASSOC_ACTOR_2: string (nullable = true)
 |-- INTERACTION: integer (nullable = true)
 |-- REGION: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- ADMIN1: string (nullable = true)
 |-- ADMIN2: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- NOTES: string (nullable = true)
 |-- FATALITIES: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [4]:
# Mirror both streams into in-memory tables so they can be queried with
# spark.sql(...). The StreamingQuery handles are kept in variables so each
# query can be inspected or stopped later (e.g. show_df_query.stop()) rather
# than being lost as soon as the cell finishes.
#
# NOTE(review): the query names look swapped -- the raw Kafka frame (dfraw) is
# registered as "showDf" and the parsed frame (df) as "showRaw". The names are
# left unchanged because they are referenced by name from SQL elsewhere.
show_df_query = (
    dfraw.writeStream
    .queryName("showDf")
    .format("memory")
    .outputMode("append")
    .start()
)
show_raw_query = (
    df.writeStream
    .queryName("showRaw")
    .format("memory")
    .outputMode("append")
    .start()
)
# Keep the second handle as the cell's displayed value.
show_raw_query

<pyspark.sql.streaming.StreamingQuery at 0x7fa2ac68efd0>

In [5]:
# Read whatever the "showDf" in-memory table holds right now and print it.
show_df_snapshot = spark.sql("select * from showDf")
show_df_snapshot.show()

+----+--------------------+---------+---------+------+--------------------+-------------+
| key|               value|    topic|partition|offset|           timestamp|timestampType|
+----+--------------------+---------+---------+------+--------------------+-------------+
|null|[7B 22 22 3A 20 2...|conflicts|        0|     0|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     1|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     2|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     3|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     4|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     5|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     6|2021-01-28 01:12:...|            0|
|null|[7B 22 22 3A 20 2...|conflicts|        0|     7|2021-01-28 01:12:...|            0|
|null|[7B 

In [16]:
# Rebuild the frame by passing its own column names back to toDF, so `ds`
# carries the same column names as `df`.
current_columns = list(df.columns)
ds = df.toDF(*current_columns)

In [20]:
# Row count per REGION. `query` is the aggregated streaming DataFrame itself,
# not the handle returned by start().
query = ds.groupBy("REGION").count()

# Publish the aggregate to the in-memory table "region" in complete output mode.
region_writer = (
    query.writeStream
    .queryName("region")
    .outputMode("complete")
    .format("memory")
)
region_writer.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fa2ad115ca0>

In [22]:
# `query` is the grouped DataFrame built from ds.groupBy("REGION").count(),
# so this displays that DataFrame's isStreaming flag.
query.isStreaming

True

In [27]:
# Sliding event-time window: 10 minutes long, advancing every 5 minutes.
event_window = F.window(ds["timestamp"], "10 minutes", "5 minutes")

# Count rows per (window, ISO) pair, with a 10-minute watermark declared on
# the "timestamp" column.
windowedCounts = (
    ds.withWatermark("timestamp", "10 minutes")
    .groupBy(event_window, ds["ISO"])
    .count()
)

In [31]:
# Send the windowed counts to the console sink using the "update" output mode.
console_writer = windowedCounts.writeStream.outputMode("update").format("console")
console_writer.start()

<pyspark.sql.streaming.StreamingQuery at 0x7fa2ad14cf40>