In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [1]:
from pyspark.sql.types import (StructType, StructField,
                               StringType, IntegerType)

recordSchema = StructType([StructField('date', StringType(), True),
                           StructField('delay', IntegerType(), True),
                           StructField('distance', IntegerType(), True),
                           StructField('origin', StringType(), True),
                           StructField('destination', StringType(), True)])

In [25]:
df = spark.readStream.format("csv") \
    .schema(recordSchema) \
    .load("MyInputStream/")

In [10]:
import pyspark.sql.functions as F

In [26]:
df2 = df.groupBy('destination').agg(F.avg('delay').alias('AverageDelay'))

In [27]:
df2

DataFrame[destination: string, AverageDelay: double]

In [33]:
df = spark.readStream.format("parquet") \
    .schema(recordSchema) \
    .load("MyInputStream/")

In [34]:
df2 = df.groupBy('destination').agg(F.avg('delay').alias('AverageDelay'))

In [17]:
writer = df.writeStream.outputMode("append") \
    .format("console") \
    .option("truncate", False) \
    .option("numRows", 200) 

In [36]:
writerAvg = df2.writeStream.outputMode("complete") \
    .format("console") \
    .option("truncate", False) \
    .option("numRows", 200) 

In [37]:
query = writerAvg.start()

23/04/19 14:27:13 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d5feeeba-3871-41fc-9a3b-24ec2a0a557f. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/19 14:27:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+------------------+
|destination|AverageDelay      |
+-----------+------------------+
|ATL        |13.642857142857142|
|ORD        |0.0               |
|DTW        |-2.0              |
+-----------+------------------+





-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+------------------+
|destination|AverageDelay      |
+-----------+------------------+
|ATL        |18.26086956521739 |
|ORD        |28.4              |
|DTW        |13.636363636363637|
+-----------+------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+------------------+
|destination|AverageDelay      |
+-----------+------------------+
|ATL        |13.46875          |
|ORD        |13.777777777777779|
|DTW        |6.647058823529412 |
+-----------+------------------+



In [38]:
query.stop()

In [4]:
query = writer.start()

23/04/19 14:15:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b1872b8e-8189-4e66-8e80-5b094e278f7c. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/19 14:15:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+-------+-----+--------+------+-----------+
|date   |delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|date   |null |null    |origin|destination|
|1011245|6    |602     |ABE   |ATL        |
|1020600|-8   |369     |ABE   |DTW        |
|1021245|-2   |602     |ABE   |ATL        |
|1020605|-4   |602     |ABE   |ATL        |
|1031245|-4   |602    

In [18]:
query = writer.start()

23/04/19 14:22:56 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-7ecb8bc6-8291-43dc-b265-243334058465. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/19 14:22:56 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Stage 6:>                                                          (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+-----+--------+------+-----------+
|date   |delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|6    |602     |ABE   |ATL        |
|1020600|-8   |369     |ABE   |DTW        |
|1021245|-2   |602     |ABE   |ATL        |
|1020605|-4   |602     |ABE   |ATL        |
|1031245|-4   |602     |ABE   |ATL        |
|1030605|0    |602     |ABE   |ATL        |
|1041243|10   |602     |ABE   |ATL        |
|1040605|28   |602     |ABE   |ATL        |
|1051245|88   |602     |ABE   |ATL        |
|1050605|9    |602     |ABE   |ATL        |
|1061215|-6   |602     |ABE   |ATL        |
|1061725|69   |602     |ABE   |ATL        |
|1061230|0    |369     |ABE   |DTW        |
|1060625|-3   |602     |ABE   |ATL        |
|1070600|0    |369     |ABE   |DTW        |
|1071725|0    |602     |ABE   |ATL        |
|1071230|0    |369     |ABE   |DTW        |
|1070625|0    |602     

In [30]:
query.stop()

### Note:
 - Complete output mode not supported when there are no streaming aggregations on streaming DataFrames/Datasets.