In [0]:
from pyspark.sql.functions import expr
from pyspark.sql.functions import from_unixtime

# Load the sample event JSON files and swap the raw `time` column for a
# `date` string column formatted as 'yyyy-MM-dd'.
events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    # from_unixtime interprets its input as seconds since the Unix epoch and
    # renders it as a string in the requested pattern.
    .withColumn("date", from_unixtime(expr("time"), "yyyy-MM-dd"))
    # The raw value is no longer needed once `date` has been derived.
    .drop("time")
)

display(events)

In [0]:
# Persist the events as a Delta table at /delta/events/, one directory per
# `date` value. "overwrite" replaces whatever is already stored there, so
# this cell can be re-run safely.
(
    events.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("date")
    .save("/delta/events/")
)

In [0]:
# Read the Delta table back from its storage path and preview the rows.
events_delta = (
    spark.read
    .format("delta")
    .load("/delta/events/")
)

display(events_delta)

In [0]:
# (Re)register a table named `events` that points at the Delta files written
# to /delta/events/. The DROP runs first so the CREATE does not fail when the
# table already exists from an earlier run.
for ddl_statement in (
    "DROP TABLE IF EXISTS events",
    "CREATE TABLE events USING DELTA LOCATION '/delta/events/'",
):
    display(spark.sql(ddl_statement))

In [0]:
# Total number of rows currently readable through the Delta DataFrame.
events_delta.count()

In [0]:
from pyspark.sql.functions import count
# Count events per (action, date) pair and show them ordered by date, then
# action. count("action") counts only rows whose `action` is non-null.
display(
    events_delta
    .groupBy("action", "date")
    .agg(count("action").alias("action_count"))
    .orderBy("date", "action")
)

In [0]:
# Build a "historical" copy of the same source events with every timestamp
# moved back by 172800 seconds (2 * 24 * 60 * 60, i.e. two days), then replace
# the raw `time` column with a 'yyyy-MM-dd' `date` string column.
historical_events = (
    spark.read
    .option("inferSchema", "true")
    .json("/databricks-datasets/structured-streaming/events/")
    # Shift first, then format: from_unixtime interprets the shifted value as
    # seconds since the Unix epoch.
    .withColumn("date", from_unixtime(expr("time-172800"), "yyyy-MM-dd"))
    .drop("time")
)

In [0]:
# Add the shifted events to the existing Delta table at /delta/events/.
# "append" keeps the rows already stored, so re-running this cell on its own
# writes the same rows again.
(
    historical_events.write
    .format("delta")
    .mode("append")
    .partitionBy("date")
    .save("/delta/events/")
)

In [0]:
# Same per-(action, date) count as before, evaluated again now that the
# historical rows have been appended to the table.
display(
    events_delta
    .groupBy("action", "date")
    .agg(count("action").alias("action_count"))
    .orderBy("date", "action")
)

In [0]:
# Row count again, after the append above.
# NOTE(review): this reuses the `events_delta` DataFrame created before the
# append and presumably relies on Delta resolving the latest table snapshot
# at query time — confirm the count reflects the appended rows.
events_delta.count()

In [0]:
# List the files stored under a single date partition of the Delta table.
# NOTE(review): the partition value 2016-07-25 is hard-coded; this assumes the
# dataset produces that date (the date string depends on how from_unixtime
# renders the timestamps) — verify, otherwise this call fails.
dbutils.fs.ls("dbfs:/delta/events/date=2016-07-25/")

In [0]:
# Run Delta's OPTIMIZE command on the `events` table and show the result it
# returns.
display(spark.sql("OPTIMIZE events"))

In [0]:
# Show the history of operations recorded for the `events` table.
display(spark.sql("DESCRIBE HISTORY events"))

In [0]:
# Show table-level detail metadata for the `events` table.
display(spark.sql("DESCRIBE DETAIL events"))

In [0]:
# Show the formatted description (columns plus detailed table information) of
# the `events` table.
display(spark.sql("DESCRIBE FORMATTED events"))