Basic configurations

# Time zone

In [0]:
# Pin the Spark SQL session time zone to the local zone used by this project
session_time_zone = "America/Sao_Paulo"
spark.conf.set("spark.sql.session.timeZone", session_time_zone)

# Credentials for Azure SQL database (using Azure Key Vault)

In [0]:
# Connection details for the retail Azure SQL database. The password is read
# from the "keys" secret scope; the user name and JDBC URL are fixed here.
sql_db_retail_key = dbutils.secrets.get(scope="keys", key="sqldbretailkey")
sql_db_retail_user = "andre"
sql_db_retail_url = "jdbc:sqlserver://retail-oltp-server.database.windows.net:1433;databaseName=retail"

# Publish the three values on the Spark session configuration.
# NOTE(review): this copies the secret into the session conf, where any code
# in the session can read it back with spark.conf.get — confirm it is needed.
for conf_name, conf_value in (
    ("spark.sql.retail_url", sql_db_retail_url),
    ("spark.sql.retail_user", sql_db_retail_user),
    ("spark.sql.retail_key", sql_db_retail_key),
):
    spark.conf.set(conf_name, conf_value)

# Credentials for mounting (using Azure Key Vault)

In [0]:
# Where the storage container is mounted, and the storage account access key
# (read from the "keys" secret scope) that the mount needs
mount_point = "/mnt/storageforretail/container"
storage_account_key = dbutils.secrets.get(scope="keys", key="storageaccountkey")

# Mounting containers and directories

In [0]:
mount_point = "/mnt/storageforretail/container"

# Mount the blob container only when it is not mounted yet. The list of active
# mounts is checked explicitly: listing the path and catching every exception
# would also hide unrelated failures, and would mistake a leftover plain
# directory at the same path for a working mount.
already_mounted = any(
    mount.mountPoint == mount_point for mount in dbutils.fs.mounts()
)
if not already_mounted:
    dbutils.fs.mount(
        source="wasbs://container@storageforretail.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={
            "fs.azure.account.key.storageforretail.blob.core.windows.net": storage_account_key
        },
    )
print('OK: container')

# Create one directory per layer. mkdirs only creates what is missing, so no
# existence check is needed and real errors are no longer swallowed.
directories = ["bronze", "silver", "gold"]
for directory in directories:
    dir_path = f"{mount_point}/{directory}"
    dbutils.fs.mkdirs(dir_path)
    print(f'OK: container/{directory}')


# Creating the bronze, silver and gold databases

In [0]:
# First: create databases
# Each database is stored in the matching layer directory of the mounted
# container (mount_point is set in the mounting cells above), instead of a
# hard-coded path that this notebook never mounts.
for layer in ("bronze", "silver", "gold"):
    spark.sql(
        f"CREATE DATABASE IF NOT EXISTS {layer} LOCATION '{mount_point}/{layer}'"
    )

## Define tables lazily (keys and tracking columns only) so the remaining columns can be added through schema evolution

In [0]:
# Bronze tables mapped to their primary-key columns. Every value is a list so
# that a composite key can be expressed by listing more than one column.
primary_keys = dict(
    CUSTOMERS=["CUSTOMER_ID"],
    BRANDS=["BRAND_ID"],
    CATEGORIES=["CATEGORY_ID"],
    PRODUCTS=["PRODUCT_ID"],
    STORES=["STORE_ID"],
    PROMOTIONS=["PROMOTION_ID"],
    PAYMENT_METHODS=["PAYMENT_METHOD_ID"],
    INVENTORY=["INVENTORY_ID"],
    SALES=["SALE_ID"],
    TRANSACTION_ITEM=["TRANSACTION_ID"],
)

Create each table if it does not exist yet...

In [0]:
# Bootstrap the bronze tables: each one starts with only its key column(s)
# plus the tracking columns. IF NOT EXISTS leaves an existing table untouched.
for table, keys in primary_keys.items():
    # One "<column> INTEGER" definition per key column (covers composite keys)
    primary_keys_str = ",\n  ".join(f"{key} INTEGER" for key in keys)

    # Build the DDL first, then run it
    create_statement = f"""
        CREATE TABLE IF NOT EXISTS bronze.{table} (
            {primary_keys_str},
            hash STRING,
            `load_timestamp` TIMESTAMP,
            `end_timestamp` TIMESTAMP,
            state INTEGER
        ) USING DELTA
    """
    spark.sql(create_statement)
    print(f"Table bronze.{table} has been created!")

First drop table, then create...

In [0]:
# Rebuild the bronze tables from scratch: every table is dropped and then
# created again with only its key column(s) plus the tracking columns.
for table, keys in primary_keys.items():
    # One "<column> INTEGER" definition per key column (covers composite keys)
    primary_keys_str = ",\n  ".join(f"{key} INTEGER" for key in keys)

    # Remove any existing table before creating the new one
    drop_statement = f"DROP TABLE IF EXISTS bronze.{table}"
    spark.sql(drop_statement)

    create_statement = f"""
        CREATE OR REPLACE TABLE bronze.{table} (
            {primary_keys_str},
            hash STRING,
            `load_timestamp` TIMESTAMP,
            `end_timestamp` TIMESTAMP,
            state INTEGER
        ) USING DELTA
    """
    spark.sql(create_statement)
    print(f"Table bronze.{table} created successfully!")

# Creating tables for gold

In [0]:
%sql
-- Customer table of the gold database, stored as a Delta table.
-- CREATE OR REPLACE swaps in this definition every time the cell runs, so any
-- existing gold.customers table is replaced.
CREATE OR REPLACE TABLE gold.customers (
  -- Surrogate key: the value is always generated by the table itself
  SK_CUSTOMER BIGINT GENERATED ALWAYS AS IDENTITY,
  -- Business key (same column name as the bronze CUSTOMERS key)
  CUSTOMER_ID INTEGER,
  -- NOTE(review): ACTIVE / load_timestamp / end_timestamp look like row
  -- versioning columns (validity window of each record) — confirm with the loader
  ACTIVE BOOLEAN,
  load_timestamp TIMESTAMP,
  end_timestamp TIMESTAMP,
  -- NOTE(review): presumably a hash of the attributes used for change detection — confirm
  HASH STRING,
  SOCIAL_SECURITY_NUMBER STRING,
  NAME STRING,
  EMAIL STRING,
  -- NOTE(review): numeric types cannot keep leading zeros or formatting
  -- characters; confirm PHONE and ZIP_CODE never need them
  PHONE BIGINT,
  ADDRESS STRING,
  ZIP_CODE INTEGER
)
USING DELTA

# Check the connection with SQL server

Check the IP address

In [0]:
import requests

# Ask an external echo service for the public IP this cluster connects from
# (presumably so it can be allowed in the SQL server firewall — confirm).
# The timeout stops the cell from hanging forever when the service is
# unreachable, and raise_for_status prevents an HTTP error page from being
# printed as if it were the IP address.
response = requests.get("https://ifconfig.me", timeout=10)
response.raise_for_status()
ip = response.text
print(f"O IP público do Databricks é: {ip}")

Check connection to database

In [0]:
%python
# JDBC connection settings: the SQL login defined in the credentials cell
# plus the Microsoft SQL Server driver class
properties = dict(
    user=sql_db_retail_user,
    password=sql_db_retail_key,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

# Smoke-test the connection by reading the database's table catalogue.
# A failure is reported as a message instead of stopping the notebook.
try:
    df = spark.read.jdbc(
        url=sql_db_retail_url,
        table="INFORMATION_SCHEMA.TABLES",
        properties=properties,
    )
    df.show()
except Exception as connection_error:
    print("Connection error:", connection_error)

# Releasing resources

In [0]:
%sql
-- Remove the bronze customers table; IF EXISTS makes this a no-op when the table is absent
DROP TABLE IF EXISTS bronze.customers

In [0]:
# All tables of the bronze database registered in the Hive Metastore
tables = [
    "CUSTOMERS",
    "BRANDS",
    "CATEGORIES",
    "PRODUCTS",
    "STORES",
    "PROMOTIONS",
    "PAYMENT_METHODS",
    "INVENTORY",
    "SALES",
    "TRANSACTION_ITEM",
]

# Drop every table; IF EXISTS skips the ones that are already gone
for table in tables:
    drop_statement = f"DROP TABLE IF EXISTS bronze.{table}"
    spark.sql(drop_statement)

In [0]:
# Detach the storage container and clean up the leftover DBFS directory.
# The paths match the mount created in the mounting cell; the previous paths
# pointed at a location this notebook never mounts, so the unmount raised and
# the real mount stayed attached.
# NOTE(review): this cell runs before the DROP DATABASE cell below; consider
# dropping the databases first, while their storage location is still reachable.
mount_point = "/mnt/storageforretail/container"
mount_root = "/mnt/storageforretail"

# Unmount only when the mount is present, so the cleanup can be re-run safely
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# After the unmount this only removes the local directory left under /mnt
dbutils.fs.rm(mount_root, recurse=True)

In [0]:
# Drop the databases together with all of their tables (CASCADE option).
# IF EXISTS keeps the cleanup re-runnable: without it the cell fails as soon as
# one of the databases has already been dropped or was never created.
for database in ("bronze", "silver", "gold"):
    spark.sql(f"DROP DATABASE IF EXISTS {database} CASCADE")