
## Goals:

1. Write data to delta lake (managed table)
1. Write data to delta lake (external table)
1. Read data from delta lake (table)
1. Read data from delta lake (file)

# 1. Write data to delta lake (managed table)


In [0]:
# Smoke-test access from Databricks to each storage container used by this notebook.
# dbutils.fs.ls is called once per container root; an empty container lists as [].
# Each listing is printed explicitly: a notebook cell only renders the value of its
# last bare expression, so without the print the first three results were never shown.
for container in ("demo", "processed", "raw", "presentation"):
    container_url = f"abfss://{container}@alejandroauedevdl.dfs.core.windows.net/"
    print(container_url, dbutils.fs.ls(container_url))




In [0]:
%sql
-- Create the f1_demo schema if it does not exist yet.
-- Author's note: this only creates metadata in Unity Catalog (stored in the metastore).
-- MANAGED LOCATION is set to the root of the `demo` container because we want every
-- managed table created in this schema to have its data stored in that location.
CREATE SCHEMA IF NOT EXISTS f1_demo
MANAGED LOCATION 'abfss://demo@alejandroauedevdl.dfs.core.windows.net/';

In [0]:
%python


# Naming convention used below:
#   p_ prefix -> parameter
#   v_ prefix -> variable

# Register the "p_file_date" text widget ("2021-03-28" is its default value),
# then read the widget's current value into v_file_date.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

# Three files exist, one per date: 2021-03-28 (cutover file), 2021-03-21, 2021-04-18.

# Root of the `raw` container; note that the value ends with a trailing "/".
p_raw_folder_path = 'abfss://raw@alejandroauedevdl.dfs.core.windows.net/'


In [0]:
%python
# Load the 2021-03-28 results.json from the raw container using a hard-coded path.
results_df = (
    spark.read
    .option("inferSchema", True)
    .json("abfss://raw@alejandroauedevdl.dfs.core.windows.net/2021-03-28/results.json")
)

In [0]:
%python
# Same read as the hard-coded version, but the path is assembled from the raw folder
# root (p_raw_folder_path) and the date taken from the p_file_date widget (v_file_date).
# Any trailing "/" on the folder root is stripped before joining so the resulting path
# does not contain a double slash ("...net//<date>/results.json").
# NOTE(review): "inferSchema" is documented as a CSV reader option; the JSON reader
# appears to infer the schema regardless -- confirm and drop the option if redundant.
results_df = (
    spark.read
    .option("inferSchema", True)
    .json(f"{p_raw_folder_path.rstrip('/')}/{v_file_date}/results.json")
)

In [0]:
# Show results_df in the notebook output to inspect what was read from the JSON file.
display(results_df)

In [0]:
# Save results_df in Delta format as the table f1_demo.results_managed.
# "overwrite" mode replaces whatever the table held before.
(
    results_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_demo.results_managed")
)

In [0]:
%sql

-- Preview the contents of the managed table.
SELECT *
FROM f1_demo.results_managed;

# 2. Write data to delta lake (external table)

In [0]:
%python

# Re-read the 2021-03-28 results.json so results_df is available for the external-table write.
results_df = (
    spark.read
    .option("inferSchema", True)
    .json("abfss://raw@alejandroauedevdl.dfs.core.windows.net/2021-03-28/results.json")
)

In [0]:
%python

# Write results_df in Delta format to an explicit path in the demo container
# (save() to a path rather than saveAsTable()); "overwrite" replaces existing data there.
(
    results_df.write
    .format("delta")
    .mode("overwrite")
    .save("abfss://demo@alejandroauedevdl.dfs.core.windows.net/results_external")
)

In [0]:
%python

# Root of the `demo` container; the value ends with a trailing "/".
p_demo_folder_path = "abfss://demo@alejandroauedevdl.dfs.core.windows.net/"

# Write results_df in Delta format under <demo root>/results_external.
# The trailing "/" on the root is stripped before joining so the target path has a
# single separator and is identical to the hard-coded
# "abfss://demo@alejandroauedevdl.dfs.core.windows.net/results_external" location.
(
    results_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"{p_demo_folder_path.rstrip('/')}/results_external")
)

In [0]:
%sql
-- Remove the f1_demo.results_external table definition if one already exists.
DROP TABLE IF EXISTS f1_demo.results_external;

In [0]:
%sql
    
-- Define f1_demo.results_external as a Delta table whose data lives at the
-- results_external path in the demo container.
CREATE TABLE IF NOT EXISTS f1_demo.results_external
    USING DELTA
    LOCATION 'abfss://demo@alejandroauedevdl.dfs.core.windows.net/results_external';

In [0]:
%sql
-- Preview the contents of the external table.
SELECT *
FROM f1_demo.results_external;

# 3. Read data from delta lake (table)

In [0]:
       
# Load the Delta data directly from its storage path (by location, not by table name).
results_external_df = (
    spark.read
    .format("delta")
    .load('abfss://demo@alejandroauedevdl.dfs.core.windows.net/results_external')
)

In [0]:
# Show the DataFrame that was loaded from the results_external path.
display(results_external_df)

# Include a partition column when writing a delta managed table

In [0]:
%python

# Re-read the 2021-03-28 results.json; results_df feeds the partitioned write.
results_df = (
    spark.read
    .option("inferSchema", True)
    .json("abfss://raw@alejandroauedevdl.dfs.core.windows.net/2021-03-28/results.json")
)

In [0]:
%python

# Save results_df in Delta format as f1_demo.results_partitioned, partitioned by the
# constructorId column; "overwrite" mode replaces any previous contents.
(
    results_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("constructorId")
    .saveAsTable("f1_demo.results_partitioned")
)



In [0]:
%sql
-- List the partitions of f1_demo.results_partitioned (written with partitionBy("constructorId")).
SHOW PARTITIONS f1_demo.results_partitioned