In [0]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import from_unixtime

# Load the sample JSON events and derive a calendar-day column to partition on.
#
# `time` is passed to from_unixtime, which interprets it as epoch seconds and
# renders it with the 'yyyy-MM-dd' pattern, so `date` is a *string* column.
# NOTE(review): from_unixtime formats in the Spark session time zone, so day
# boundaries follow that setting — confirm this is the intended behavior.
# NOTE(review): "inferSchema" is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless — confirm before removing it.
events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    # Format directly from `time` rather than copying it into `date`, dropping
    # `time`, and overwriting `date` in a third step; resulting columns are the
    # same: [action, date].
    .withColumn("date", from_unixtime("time", "yyyy-MM-dd"))
    .drop("time")
)

display(events)

action,date
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28
Close,2016-07-28
Open,2016-07-28
Open,2016-07-28
Close,2016-07-28
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28


In [0]:
# Persist `events` in Delta format under /delta/events/, one directory per
# `date` value; "overwrite" replaces whatever data the path held before.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("date")
    .save("/delta/events/")
)

In [0]:
# Read the data back through the Delta reader to confirm the write round-trips.
events_delta = (
    spark.read
    .format("delta")
    .load("/delta/events/")
)

display(events_delta)

action,date
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28
Close,2016-07-28
Open,2016-07-28
Open,2016-07-28
Close,2016-07-28
Close,2016-07-28
Close,2016-07-28
Open,2016-07-28


In [0]:
# Register the Delta directory as the table `events`. Any existing table of that
# name is dropped first so this cell can be re-run. Each statement's result is
# displayed, in order.
for ddl_statement in (
    "DROP TABLE IF EXISTS events",
    "CREATE TABLE events USING DELTA LOCATION '/delta/events/'",
):
    display(spark.sql(ddl_statement))

In [0]:
# Total number of rows currently visible through the Delta DataFrame.
initial_row_count = events_delta.count()
initial_row_count

In [0]:
from pyspark.sql.functions import count

# Tally events per (action, date) pair, listed by date and then by action.
action_counts = (
    events_delta
    .groupBy("action", "date")
    .agg(count("action").alias("action_count"))
    .orderBy("date", "action")
)
display(action_counts)

action,date,action_count
Close,2016-07-26,20165
Open,2016-07-26,21176
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [0]:
# Build a synthetic "historical" copy of the sample events by moving every
# timestamp two days into the past before deriving the `date` column.
HISTORICAL_SHIFT_SECONDS = 2 * 24 * 60 * 60  # two days = 172800 s

# NOTE(review): "inferSchema" is documented as a CSV reader option; the JSON
# reader appears to infer the schema regardless — confirm before removing it.
historical_events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    # Shift, then format, in a single step; from_unixtime interprets its input
    # as epoch seconds and yields a 'yyyy-MM-dd' string. Resulting columns are
    # [action, date].
    .withColumn(
        "date",
        from_unixtime(expr("time") - HISTORICAL_SHIFT_SECONDS, "yyyy-MM-dd"),
    )
    .drop("time")
)

In [0]:
# Add the shifted events to the same Delta location; "append" keeps the rows
# that are already there and the data stays partitioned by `date`.
(
    historical_events.write
    .format("delta")
    .mode("append")
    .partitionBy("date")
    .save("/delta/events/")
)

In [0]:
# Same per-(action, date) tally as before, evaluated again on `events_delta`.
# Relies on `count` having been imported by an earlier cell.
action_counts_after_append = (
    events_delta
    .groupBy("action", "date")
    .agg(count("action").alias("action_count"))
    .orderBy("date", "action")
)
display(action_counts_after_append)

action,date,action_count
Close,2016-07-24,20165
Open,2016-07-24,21176
Close,2016-07-25,24015
Open,2016-07-25,24002
Close,2016-07-26,25985
Open,2016-07-26,25998
Close,2016-07-27,24015
Open,2016-07-27,24002
Close,2016-07-28,5820
Open,2016-07-28,4822


In [0]:
# Total number of rows visible through the Delta DataFrame at this point.
row_count_after_append = events_delta.count()
row_count_after_append

In [0]:
# List the files stored under the 2016-07-25 partition directory.
partition_listing = dbutils.fs.ls("dbfs:/delta/events/date=2016-07-25/")
partition_listing

In [0]:
# Run OPTIMIZE on the `events` table and show whatever the command returns.
optimize_result = spark.sql("OPTIMIZE events")
display(optimize_result)

path
""


In [0]:
# Show the table's commit history (one row per version/operation).
table_history = spark.sql("DESCRIBE HISTORY events")
display(table_history)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel
2,2019-01-29 00:38:19,360903564160648,lee.brown@example.com,OPTIMIZE,"Map(predicate -> [], zOrderBy -> [], batchId -> 0)",,List(2433269420249641),0125-221106-miaow213,1.0,SnapshotIsolation
1,2019-01-29 00:38:10,360903564160648,lee.brown@example.com,WRITE,"Map(mode -> Append, partitionBy -> [""date""])",,List(2433269420249641),0125-221106-miaow213,0.0,WriteSerializable
0,2019-01-29 00:37:58,360903564160648,lee.brown@example.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [""date""])",,List(2433269420249641),0125-221106-miaow213,,WriteSerializable


In [0]:
# Show table-level metadata such as location, partition columns and file count.
table_detail = spark.sql("DESCRIBE DETAIL events")
display(table_detail)

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
delta,f7b17b8a-67db-4822-a4dc-c2a19c577357,,,dbfs:/delta/events,2019-01-29 00:37:55.795,2019-01-29 00:38:19,List(date),5,27683,Map(),1,2


In [0]:
# Show the column schema, partition information and catalog details.
table_schema_info = spark.sql("DESCRIBE FORMATTED events")
display(table_schema_info)

col_name,data_type,comment
action,string,
date,string,
# Partition Information,,
# col_name,data_type,comment
date,string,
,,
# Detailed Table Information,,
Database,default,
Table,events,
Owner,root,
