Batch mode (batch dataframes)

In [0]:
spark.sql("USE CATALOG catalog")
spark.sql("USE schema")


from pyspark.sql.functions import col, sum as _sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
   StructField("Id", IntegerType(), True),
   StructField("name", StringType(), True),
   StructField("age", IntegerType(), True),
   StructField("money", IntegerType(), True),
   StructField("sales", IntegerType(), True),
   StructField("units", IntegerType(), True),
])

# Ingest data from JSON files
sales_spark_batch_df = (spark.read
                        .format("json")
                        .option("multiLine", True)
                        .schema(schema)
                        .load("/Volumes/json/")
                        .na.drop()
                        .filter(col("money") > 0)
                        .select(
                           col("Id"),
                           col("money").cast("double")
                           )
                        )

In [0]:
sales_spark_batch_df.display()

Id,money
10,1300.0
11,1750.0
12,1450.0
7,1100.0
8,1600.0
9,2000.0
1,1200.0
2,850.0
3,1500.0
4,2100.0


Streaming mode (streaming dataframes)

In [0]:
# Ingest streaming data from JSON files
sales_spark_streaming_df = (spark.readStream
                            .format("json")
                            .option("multiLine", True)
                            .schema(schema)
                            .load("/Volumes/json/")
                            .na.drop()
                            .filter(col("money") > 0)
                            .select(
                                col("Id"),
                                col("money").cast("double")
                                )
                            )

In [0]:
display(sales_spark_streaming_df)

Id,money
7,1100.0
8,1600.0
9,2000.0
4,2100.0
5,950.0
6,1800.0
10,1300.0
11,1750.0
12,1450.0
1,1200.0


Write streaming query results

In [0]:
checkpoint_path = "/Volumes/checkpoints_spark_query"

salesspark_query = (sales_spark_streaming_df
                    .writeStream
                    .format("delta")
                    .queryName("salesspark_query ")
                    .outputMode("append")
                    .trigger(processingTime="10 seconds")
                    .option("checkpointLocation", checkpoint_path)
                    .toTable("salesspark")
                    )

Monitor streaming query

In [0]:
salesspark_query.name

'salesspark_query '

In [0]:
salesspark_query.status

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [0]:
salesspark_query.lastProgress

In [0]:
import time
time.sleep(20)

salesspark_query.stop()

In [0]:
salesspark_query.awaitTermination()
# awaitTermination blocks the current thread until the stream query is terminated. For stand-alone structured applications, this is used to prevent the main thread from terminating while the streaming query is still executing (here is useful in cas of a run all to run the notebook). This prevents subsequent command cells from executing until the streaming query has fully terminated.


To stop a query by its name

In [0]:
query_name = "salesspark_query"
for q in spark.streams.active:
    if q.name == query_name:
        q.stop()

In [None]:
for q in spark.streams.active:
    q.stop()