#### Setting up the Gizmobox environment: external location, catalog, schemas and volume

#### 1. Create External Location

#### For each container in ADLS Gen2 that we want to access from Databricks, we need to create a corresponding external location. Therefore, we are creating an external location for the gizmobox container.

In [0]:
%sql
-- Register the root of the gizmobox container as an external location,
-- accessed through the ckdatabrickscredential storage credential.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL LOCATION IF NOT EXISTS dbcertificationsa_gizmobox
URL 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/'
WITH (STORAGE CREDENTIAL ckdatabrickscredential)
COMMENT 'External location for gizmobox'

#### 2. Testing to see if we can access the ADLS files

In [0]:
%fs ls 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/landing/operational_data/'

#### 3. Create the catalog - gizmobox

In [0]:
%sql
-- List the existing catalogs before creating the gizmobox catalog.
SHOW CATALOGS;

catalog
certificationdatabricks
gizmobox
hive_metastore
samples
system


In [0]:
-- The catalog-level managed location is only a fallback: no data is meant to be
-- stored directly under it, because each schema will be created with its own
-- dedicated managed location.
CREATE CATALOG IF NOT EXISTS gizmobox
MANAGED LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/'
COMMENT 'This is the catalog for the Gizmobox Data Lakehouse';

#### 4. Create Schemas
1. Landing
2. Bronze
3. Silver
4. Gold

In [0]:
-- The schemas have to be created under the gizmobox catalog,
-- so first check which catalog the session is currently using.
SELECT current_catalog();

current_catalog()
certificationdatabricks


In [0]:
-- Make gizmobox the active catalog so the unqualified schema names below
-- are created inside it.
USE CATALOG gizmobox;

-- Create the four schemas (databases), each with its own managed location
-- inside the gizmobox container.

-- landing
CREATE SCHEMA IF NOT EXISTS landing
    MANAGED LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/landing'
    COMMENT 'This is the landing schema for the Gizmobox Data Lakehouse';

-- bronze
CREATE SCHEMA IF NOT EXISTS bronze
    MANAGED LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/bronze'
    COMMENT 'This is the bronze schema for the Gizmobox Data Lakehouse';

-- silver
CREATE SCHEMA IF NOT EXISTS silver
    MANAGED LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/silver'
    COMMENT 'This is the silver schema for the Gizmobox Data Lakehouse';

-- gold
CREATE SCHEMA IF NOT EXISTS gold
    MANAGED LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/gold'
    COMMENT 'This is the gold schema for the Gizmobox Data Lakehouse';

In [0]:
-- List the schemas in the current catalog (gizmobox) to confirm
-- that landing, bronze, silver and gold were created.
SHOW SCHEMAS;

databaseName
bronze
default
gold
information_schema
landing
silver


#### 5. Create Volume: operational_data

#### Volumes give us a simpler path to access the storage account files
- After creating the volume, we can refer to this location: "abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/landing/operational_data"
- with just "/Volumes/gizmobox/landing/operational_data"

In [0]:
-- Work inside gizmobox.landing so the volume is created as
-- gizmobox.landing.operational_data.
USE CATALOG gizmobox;
USE SCHEMA landing;

-- External volume over the operational_data folder of the landing area.
CREATE EXTERNAL VOLUME IF NOT EXISTS operational_data
LOCATION 'abfss://gizmobox@dbcertificationsa.dfs.core.windows.net/landing/operational_data'
COMMENT 'This is the operational data volume under landing schema for the Gizmobox Data Lakehouse';

#### We can read JSON/Parquet/CSV and many other semi-structured files in Spark SQL as tables

In [0]:
-- Files can be queried directly with the syntax: file_format.`path`
-- Note: the path is wrapped in backticks, not single quotes, and it must be
-- the absolute volume path (starting with /Volumes/).
-- Each statement ends with a semicolon so that they can share one cell.

-- Read a single file
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json`;

-- Read all files for the year 2024 (wildcard in the file name)
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers/customers_2024*.json`;

-- Read the entire folder
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;

-- When reading an entire folder, the source file path can be added as a column
-- with input_file_name().
-- NOTE: input_file_name() is DEPRECATED from Databricks Runtime 13.3 LTS onwards.
SELECT
    *,
    input_file_name() AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;

-- Instead of input_file_name(), use the _metadata column.
-- _metadata is a column (not a function) that Databricks makes available when
-- reading from a file or a directory. It holds metadata about the source file,
-- such as the file path, file name, file size and file modification time.
-- Individual fields are accessed with dot notation, e.g. _metadata.file_path.
SELECT
    *,
    _metadata.file_path AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;


In [0]:
-- Read every JSON file in the customers folder of the operational_data volume.
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone
2024-10-17 16:12:27,9179.0,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335
2024-10-01 00:50:29,4858.0,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195
2024-10-23 22:03:08,7207.0,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564
2024-10-12 06:02:27,8539.0,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620
2024-10-24 13:03:13,9706.0,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000
2024-10-08 22:49:25,9263.0,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756
2024-10-06 19:55:52,5028.0,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935
2024-10-18 23:24:52,9018.0,William Carter,2003-09-05,james70@gmail.com,2024-10-08,+1 1448753611
2024-10-21 13:20:26,8580.0,Shannon Austin,2002-03-22,john30@gmail.com,2024-10-07,+1 4594705629
2024-10-02 14:53:40,3409.0,Andrew Phillips,2003-04-17,peter73@yahoo.com,2024-09-30,+1 4079273853


In [0]:
 -- Return the whole _metadata column next to the data, aliased as file_path.
 SELECT
     *,
     _metadata AS file_path
 FROM json.`/Volumes/gizmobox/landing/operational_data/customers`

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone,file_path
2024-10-17 16:12:27,9179.0,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-01 00:50:29,4858.0,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-23 22:03:08,7207.0,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-12 06:02:27,8539.0,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-24 13:03:13,9706.0,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-08 22:49:25,9263.0,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-06 19:55:52,5028.0,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-18 23:24:52,9018.0,William Carter,2003-09-05,james70@gmail.com,2024-10-08,+1 1448753611,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-21 13:20:26,8580.0,Shannon Austin,2002-03-22,john30@gmail.com,2024-10-07,+1 4594705629,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"
2024-10-02 14:53:40,3409.0,Andrew Phillips,2003-04-17,peter73@yahoo.com,2024-09-30,+1 4079273853,"List(dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json, customers_2024_10.json, 4389, 0, 4389, 2025-05-19T17:42:46Z)"


In [0]:
-- The full _metadata column bundles the file path with several other metadata
-- values. To get only the path of the source file, select its file_path field.
SELECT
    *,
    _metadata.file_path AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`

created_timestamp,customer_id,customer_name,date_of_birth,email,member_since,telephone,file_path
2024-10-17 16:12:27,9179.0,Richard Cox,1996-10-25,devon84@mail.com,2024-09-26,+1 6680703335,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-01 00:50:29,4858.0,Carla Morton,2004-06-21,joseph88@mail.com,2024-09-15,+1 8616454195,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-23 22:03:08,7207.0,Billy Scott,1997-03-17,christopher30@mail.com,2024-09-23,+1 5544387564,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-12 06:02:27,8539.0,Lori Mason,2002-11-01,stephanie7@mail.com,2024-09-12,+1 0498301620,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-24 13:03:13,9706.0,Jennifer Haas,2001-04-03,benjamin55@mail.com,2024-10-05,+1 4725460000,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-08 22:49:25,9263.0,Joseph Keller,2003-02-11,,2024-10-04,+1 3817867756,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-06 19:55:52,5028.0,Jessica Harris,2004-04-19,,2024-09-10,+1 8604009935,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-18 23:24:52,9018.0,William Carter,2003-09-05,james70@gmail.com,2024-10-08,+1 1448753611,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-21 13:20:26,8580.0,Shannon Austin,2002-03-22,john30@gmail.com,2024-10-07,+1 4594705629,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
2024-10-02 14:53:40,3409.0,Andrew Phillips,2003-04-17,peter73@yahoo.com,2024-09-30,+1 4079273853,dbfs:/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json
