In [15]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

import warnings
warnings.filterwarnings('ignore')

# Q2

In [16]:
# Build (or reuse) the Spark session for this notebook: 16g of memory for both
# the executors and the driver, and 3 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("ass3_q2")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.sql.shuffle.partitions", "3")
    .getOrCreate()
)

## (1)

In [17]:
# Read a single part file as a static DataFrame so that its inferred schema
# can be captured in `dataSchema`, then print the schema and preview five rows.
static = (
    spark.read
    .format("json")
    .load("data/activity-data/part-00000-tid-730451297822678341-1dda7027-2071-4d73-a0e2-7fb6a91e1d1f-0-c000.json")
)
dataSchema = static.schema
static.printSchema()
static.show(n=5)

root
 |-- Arrival_Time: long (nullable = true)
 |-- Creation_Time: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Index: long (nullable = true)
 |-- Model: string (nullable = true)
 |-- User: string (nullable = true)
 |-- gt: string (nullable = true)
 |-- x: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)

+-------------+-------------------+--------+-----+------+----+-----+-------------+------------+-------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|            x|           y|            z|
+-------------+-------------------+--------+-----+------+----+-----+-------------+------------+-------------+
|1424686735175|1424686733176178965|nexus4_1|   35|nexus4|   g|stand| 0.0014038086|   5.0354E-4|-0.0124053955|
|1424686735378|1424686733382813486|nexus4_1|   76|nexus4|   g|stand|-0.0039367676| 0.026138306|  -0.01133728|
|1424686735577|1424686733579072031|nexus4_1|  115|nexus4|   g|stand|  0.00354003

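`Creation_Time` is an epoch value in nanoseconds stored as a long; converting it to a timestamp directly overflows, so it is first divided by 10^9 (nanoseconds -> seconds). Ref: https://stackoverflow.com/search?q=%5Bapache-spark%5Dstream+convert+timestamp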

In [18]:
# Preview the conversion on the static frame: Creation_Time is scaled from
# nanoseconds to seconds before being turned into a timestamp.
(
    static
    .withColumn("Creation_Time", to_timestamp(col("Creation_Time") / 1_000_000_000))
    .show(n=5, truncate=False)
)

+-------------+--------------------------+--------+-----+------+----+-----+-------------+------------+-------------+
|Arrival_Time |Creation_Time             |Device  |Index|Model |User|gt   |x            |y           |z            |
+-------------+--------------------------+--------+-----+------+----+-----+-------------+------------+-------------+
|1424686735175|2015-02-23 10:18:53.176179|nexus4_1|35   |nexus4|g   |stand|0.0014038086 |5.0354E-4   |-0.0124053955|
|1424686735378|2015-02-23 10:18:53.382813|nexus4_1|76   |nexus4|g   |stand|-0.0039367676|0.026138306 |-0.01133728  |
|1424686735577|2015-02-23 10:18:53.579072|nexus4_1|115  |nexus4|g   |stand|0.003540039  |-0.034744263|-0.019882202 |
|1424686735779|2015-02-23 10:49:41.834321|nexus4_2|163  |nexus4|g   |stand|0.002822876  |0.005584717 |0.017318726  |
|1424686735982|2015-02-23 10:49:42.035859|nexus4_2|203  |nexus4|g   |stand|0.0017547607 |-0.018981934|-0.022201538 |
+-------------+--------------------------+--------+-----+------+

In [19]:
import os
import shutil

# Start the update-mode query from a clean slate on every run.
# The whole checkpoint directory is removed, not only its `offsets` sub-folder:
# deleting `offsets` alone leaves the rest of the checkpoint behind, so the
# restarted query would run against a half-deleted, inconsistent checkpoint.
if os.path.isdir('checkpoint/activity-data'):
    shutil.rmtree('checkpoint/activity-data')     # force-delete the folder

In [20]:
# File-based streaming source: reuse the schema captured from the static read
# and pick up at most 10 files per trigger.
streaming = (
    spark.readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 10)
    .json("data/activity-data/*.json")
)
checkpointDir = "checkpoint/activity-data"

# Per-user row counts over 6-minute windows sliding every 3 minutes, keyed on
# the timestamp derived from Creation_Time (nanoseconds -> seconds), with a
# 1-minute watermark.
activityCounts = (
    streaming
    .withColumn("Creation_Time", to_timestamp(col("Creation_Time") / 1_000_000_000))
    .withWatermark("Creation_Time", "1 minute")
    .groupBy("user", window("Creation_Time", "6 minutes", "3 minutes"))
    .count()
)

# Write to the in-memory table "activity_query" in update mode, triggering
# every 2 seconds, with a checkpoint location.
activityQuery = (
    activityCounts.writeStream
    .queryName("activity_query")
    .format("memory")
    .outputMode("update")
    .trigger(processingTime="2 seconds")
    .option("checkpointLocation", checkpointDir)
    .start()
)

# To run the same query without a checkpoint, drop the
# `.option("checkpointLocation", checkpointDir)` line above.

24/05/02 09:38:25 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<!-- 如果没有checkpoint，每次启动程序时数据都会从头开始处理，重复运行下面的查询语句得到的结果会不一样；

如果有checkpoint，Spark会强制要求我删除offsets目录（这里很奇怪：offsets本来是用来存储偏移量的，但每次运行前都必须先删除它，否则报错）。删除之后checkpoint可以说是没有用了，因为每次启动程序时数据都会从头开始处理，按理说重复运行下面的查询语句得到的结果会不一样；但实际效果是，我每次查询得到的结果都是一样的，就像checkpoint起到了作用一样（数据全部处理完，查询结果不再更新）。可是理论上删除了offsets之后，是不会保留之前的处理进度的呀。 -->

In [21]:
# Peek at the in-memory result table, ordered by window (first 5 rows, untruncated).
(
    spark
    .sql("SELECT * FROM activity_query order by window")
    .show(n=5, truncate=False)
)

+----+------------------------------------------+-----+
|user|window                                    |count|
+----+------------------------------------------+-----+
|a   |{2015-02-22 00:36:00, 2015-02-22 00:42:00}|9    |
|a   |{2015-02-22 00:36:00, 2015-02-22 00:42:00}|16   |
|a   |{2015-02-22 00:39:00, 2015-02-22 00:45:00}|9    |
|a   |{2015-02-22 00:39:00, 2015-02-22 00:45:00}|16   |
|g   |{2015-02-23 10:15:00, 2015-02-23 10:21:00}|4696 |
+----+------------------------------------------+-----+
only showing top 5 rows



In [22]:
# Stop the streaming query that writes to the "activity_query" table.
activityQuery.stop()

24/05/02 09:38:32 ERROR TorrentBroadcast: Store broadcast broadcast_120 fail, remove all pieces of the broadcast


In [23]:
# Display the query's current status as the cell output.
activityQuery.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

## (2)

Each query below uses its own checkpoint directory.

In [26]:
import os
import shutil

# Start the parquet append query from a clean slate on every run.
# The whole checkpoint directory is removed, not only its `offsets` sub-folder:
# deleting `offsets` alone leaves the rest of the checkpoint behind, so the
# restarted query would run against a half-deleted, inconsistent checkpoint.
# NOTE(review): the parquet sink presumably keeps its own write log under the
# output path; confirm whether that also has to be cleared for a fresh run.
if os.path.isdir('checkpoint/activity-data-append-parquet'):
    shutil.rmtree('checkpoint/activity-data-append-parquet')     # force-delete the folder

In [27]:
# Stream the windowed counts to parquet files in append mode, using a
# checkpoint directory dedicated to this query.
parquetOutputPath = "output/activity-data"
activityQuery2 = (
    activityCounts.writeStream
    .queryName("activity_query2")
    .format("parquet")
    .outputMode("append")
    .option("checkpointLocation", "checkpoint/activity-data-append-parquet")
    .option("path", parquetOutputPath)
    .start()
)

24/05/02 09:41:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [28]:
import os
import shutil

# Start the in-memory append query from a clean slate on every run.
# The whole checkpoint directory is removed, not only its `offsets` sub-folder:
# deleting `offsets` alone leaves the rest of the checkpoint behind, so the
# restarted query would run against a half-deleted, inconsistent checkpoint.
if os.path.isdir('checkpoint/activity-data-append-memory'):
    shutil.rmtree('checkpoint/activity-data-append-memory')     # force-delete the folder

In [29]:
# Stream the same windowed counts into the in-memory table "activity_query3"
# in append mode, using a checkpoint directory dedicated to this query.
activityQuery3 = (
    activityCounts.writeStream
    .queryName("activity_query3")
    .format("memory")
    .outputMode("append")
    .option("checkpointLocation", "checkpoint/activity-data-append-memory")
    .start()
)

24/05/02 09:41:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

In [35]:
# Show the first 10 rows of the append-mode in-memory table, untruncated.
(
    spark
    .sql("select * from activity_query3")
    .show(n=10, truncate=False)
)

+----+------------------------------------------+-----+
|user|window                                    |count|
+----+------------------------------------------+-----+
|c   |{2015-02-23 13:06:00, 2015-02-23 13:12:00}|13936|
|h   |{2015-02-23 14:00:00, 2015-02-23 14:06:00}|14733|
|g   |{2015-02-23 11:03:00, 2015-02-23 11:09:00}|16878|
|b   |{2015-02-24 13:57:00, 2015-02-24 14:03:00}|18191|
|i   |{2015-02-24 12:09:00, 2015-02-24 12:15:00}|14899|
|g   |{2015-02-23 10:54:00, 2015-02-23 11:00:00}|23180|
|g   |{2015-02-23 10:24:00, 2015-02-23 10:30:00}|16071|
|g   |{2015-02-23 10:42:00, 2015-02-23 10:48:00}|13892|
|e   |{2015-02-24 14:48:00, 2015-02-24 14:54:00}|23272|
|d   |{2015-02-24 12:42:00, 2015-02-24 12:48:00}|16393|
+----+------------------------------------------+-----+
only showing top 10 rows



                                                                                

                                                                                