In [0]:
%run ./Includes/Dataset-Mounting

-sandbox
Smartphone Accelerometer Samples: 

| Field          | Description |
| ------------- | ----------- |
| Arrival_Time | time data was received |
| Creation_Time | event time |
| Device | specific device instance of a model (e.g. nexus4_1) |
| Index | unique identifier of event |
| Model | device model, e.g. Nexus4 |
| User | unique user identifier |
| geolocation | city & country |
| gt | transportation mode |
| id | unused null field |
| x | acceleration in x-dir |
| y | acceleration in y-dir |
| z | acceleration in z-dir |

Accelerometers handle axis-based motion sensing (basically, they measure acceleration).

So an accelerometer can figure out how fast our smartphone is moving and which direction it's pointing in.

Therefore, it can track our steps and know which way our handset is pointing, among other things.

In [0]:
%fs ls "/mnt/training/definitive-guide/data/activity-json/streaming"

path,name,size
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/00.json,00.json,1210796
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/01.json,01.json,1201714
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/02.json,02.json,1210039
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/03.json,03.json,1204858
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/04.json,04.json,1271459
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/05.json,05.json,1298766
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/06.json,06.json,1311932
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/07.json,07.json,1292782
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/08.json,08.json,1226703
dbfs:/mnt/training/definitive-guide/data/activity-json/streaming/09.json,09.json,1215917


In [0]:
# DDL-formatted schema for the accelerometer JSON records, one fragment per
# field. Adjacent string literals are concatenated by Python, so the resulting
# value is a single comma-separated DDL string.
schema = (
    "Arrival_Time BIGINT, "
    "Creation_Time BIGINT, "
    "Device STRING, "
    "Index BIGINT, "
    "Model STRING, "
    "User STRING, "
    "geolocation STRUCT<city: STRING, country: STRING>, "
    "gt STRING, "
    "id BIGINT, "
    "x DOUBLE, "
    "y DOUBLE, "
    "z DOUBLE"
)

mountPoint,source,encryptionType
/mnt/training,s3a://databricks-corp-training-ap-northeast-1/common,


In [0]:
# Directory holding the JSON files that are consumed as a stream.
path = "/mnt/training/definitive-guide/data/activity-json/streaming"

# Streaming reader over the JSON directory. A schema is supplied explicitly
# (streaming file sources do not infer one by default), and maxFilesPerTrigger=1
# limits every micro-batch to a single input file.
streamDF = (
    spark.readStream
    .schema(schema)
    .options(maxFilesPerTrigger=1)
    .format("json")
    .load(path)
)

In [0]:
# Render the streaming DataFrame in the notebook. The streamName given here is
# the name this query reports in spark.streams.active, and it also shows up as
# a temporary table that the %sql cell further down queries.
display(streamDF, streamName="display_stream")

Arrival_Time,Creation_Time,Device,Index,Model,User,geolocation,gt,id,x,y,z
1424686734999,1424688581053803834,nexus4_2,8,nexus4,g,"List(Nanjing, China)",stand,,0.0006866455,0.01626587,0.021591187
1424686735099,1424688581154755982,nexus4_2,28,nexus4,g,"List(Nanjing, China)",stand,,0.0006866455,-0.039276123,0.037612915
1424686735199,1424686733201478037,nexus4_1,40,nexus4,g,"List(Nanjing, China)",stand,,0.0024719238,-0.009109497,-0.005996704
1424686735297,1424686733287171396,nexus4_1,57,nexus4,g,"List(Nanjing, China)",stand,,0.0024719238,0.00050354,-0.0017242432
1424686735398,1424688581451783570,nexus4_2,87,nexus4,g,"List(Nanjing, China)",stand,,0.0017547607,0.0045166016,-0.0061798096
1424686735491,1424686733488526377,nexus4_1,97,nexus4,g,"List(Nanjing, China)",stand,,0.0056762695,-0.013381958,0.014297485
1424686735597,1424686733599152598,nexus4_1,119,nexus4,g,"List(Nanjing, China)",stand,,0.0024719238,-0.034744263,-0.023086548
1424686735698,1424688581753724488,nexus4_2,147,nexus4,g,"List(Nanjing, China)",stand,,-0.001449585,0.02053833,-0.013656616
1424686735795,1424688581849427613,nexus4_2,166,nexus4,g,"List(Nanjing, China)",stand,,0.0038909912,-0.0050964355,0.01838684
1424686735898,1424686733901307139,nexus4_1,179,nexus4,g,"List(Nanjing, China)",stand,,-0.006072998,0.028274536,0.018569946


In [0]:
# Print the id and name of every streaming query currently running in this
# Spark session.
for active_query in spark.streams.active:
  print(active_query.id, active_query.name)

1303d16f-e30c-4cbe-9bad-4b43acbaa0de display_stream


In [0]:
# List the tables/views visible to this session; the running display stream is
# expected to appear here as a temporary table named after the stream.
spark.catalog.listTables()

Out[6]: [Table(name='display_stream', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [0]:
%sql
-- Distinct activity labels (gt) seen so far in the display_stream temp table.
SELECT DISTINCT gt
FROM display_stream

gt
stand
sit
""
walk


In [0]:
from pyspark.sql.functions import *

In [0]:
# Project the stream down to an event timestamp and the activity label.
# NOTE(review): Creation_Time appears to be epoch nanoseconds (19-digit values
# in the sample rows); dividing by 1e9 yields epoch seconds, which is how a
# numeric -> timestamp cast is interpreted. Confirm against the data source.
proDF = streamDF.select(
    (col("Creation_Time") / 1e9).cast("timestamp").alias("time"),
    col("gt").alias("action"),
)

In [0]:
# Count events per activity within one-hour event-time windows.
#  - withWatermark declares how late (in event time) data may arrive.
#  - window(...) with only a window duration produces non-overlapping windows.
#  - Sorting a streaming aggregation is only supported when the query is
#    written with the "complete" output mode.
countDF = (
    proDF
    .withWatermark("time", "2 hour")
    .groupBy("action", window(col("time"), "1 hour"))
    .count()
    .select(col("window.start").alias("start"), "action", "count")
    .orderBy("start", "count")
)

In [0]:
# Inspect the current number of partitions Spark SQL uses when shuffling data
# (e.g. for the groupBy aggregation above).
spark.conf.get("spark.sql.shuffle.partitions")

Out[10]: '200'

In [0]:
# Lower the shuffle partition count before starting the aggregation stream.
# NOTE(review): presumably chosen because the input files are small and many
# shuffle partitions would only add per-batch scheduling overhead; tune to the
# cluster's core count.
spark.conf.set("spark.sql.shuffle.partitions", 2)

In [0]:
# Destination of the Delta output and of the streaming checkpoint.
# NOTE(review): the checkpoint lives inside the Delta table directory under a
# name without a leading underscore; confirm table maintenance (e.g. VACUUM)
# cannot touch it.
outputPath = "/activity/smartphone"
checkpointPath = f"{outputPath}/checkpoint"

# Recursively delete any previous output. The checkpoint sits under outputPath,
# so it is removed too and the query starts from scratch.
dbutils.fs.rm(outputPath, True)

# Write the windowed counts to Delta. "complete" mode rewrites the whole result
# table on every trigger.
streamQuery = (
    countDF.writeStream
    .queryName("activity q")
    .outputMode("complete")
    .format("delta")
    .option("checkpointLocation", checkpointPath)
    .start(outputPath)
)

In [0]:
# Show the most recent progress reports of the Delta-writing query (a list of
# dicts with batch id, row counts, durations, event-time stats and state
# operator metrics).
streamQuery.recentProgress

Out[15]: [{'id': '4fc5b0e4-4a4b-41e8-993d-6e8c58eef782',
  'runId': '27145599-5a07-4372-add7-5bb44fe5b175',
  'name': 'activity q',
  'timestamp': '2021-11-26T08:31:54.520Z',
  'batchId': 0,
  'numInputRows': 4999,
  'inputRowsPerSecond': 0.0,
  'processedRowsPerSecond': 120.73421084410096,
  'durationMs': {'addBatch': 36170,
   'getBatch': 792,
   'latestOffset': 2853,
   'queryPlanning': 593,
   'triggerExecution': 41404,
   'walCommit': 525},
  'eventTime': {'avg': '2015-02-23T10:39:18.226Z',
   'max': '2015-02-23T10:58:56.152Z',
   'min': '2015-02-23T10:18:53.201Z',
   'watermark': '1970-01-01T00:00:00.000Z'},
  'stateOperators': [{'operatorName': 'stateStoreSave',
    'numRowsTotal': 6,
    'numRowsUpdated': 6,
    'allUpdatesTimeMs': 379,
    'numRowsRemoved': 0,
    'allRemovalsTimeMs': 0,
    'commitTimeMs': 1887,
    'memoryUsedBytes': 2448,
    'numRowsDroppedByWatermark': 0,
    'numShufflePartitions': 4,
    'numStateStoreInstances': 4,
    'customMetrics': {'loadedMapCache

In [0]:
# Render the windowed counts in the notebook.
# NOTE(review): countDF is a streaming DataFrame, so this presumably launches
# an additional streaming query next to streamQuery — confirm that is intended.
display(countDF)

start,action,count
2015-02-23T10:00:00.000+0000,stairsup,1131
2015-02-23T10:00:00.000+0000,stairsdown,1262
2015-02-23T10:00:00.000+0000,bike,1325
2015-02-23T10:00:00.000+0000,walk,1591
2015-02-23T10:00:00.000+0000,,2262
2015-02-23T10:00:00.000+0000,sit,2747
2015-02-23T10:00:00.000+0000,stand,3010
2015-02-23T11:00:00.000+0000,sit,330
2015-02-23T11:00:00.000+0000,stairsup,1124
2015-02-23T11:00:00.000+0000,stairsdown,1283


In [0]:
# Shut down every streaming query that is still running in this session so no
# stream keeps consuming cluster resources after the notebook finishes.
for active_query in spark.streams.active:
  active_query.stop()