## About this Notebook

- This notebook performs a sample ETL operation using Microsoft Open datasets (Covid data).
- It **doesn't require** a default lakehouse attached and instead uses absolute paths to create/load managed tables.
- This notebook is part of a data pipeline.
- It can be run from any Fabric workspace as long as proper access is provided to write to the targets (workspace and lakehouse).


## Libraries

In [None]:
import json
from pyspark.sql.functions import lit, to_utc_timestamp, unix_timestamp, avg, max, min, sum, count
from delta.tables import DeltaTable
from typing import Optional
from datetime import datetime, timezone

StatementMeta(, 3e181d44-f0b6-406c-a2a1-e7eaf98267e8, 13, Finished, Available)

## External parameters

In [None]:
# Keep Only External parameters in this cell.
# NOTE(review): these look like defaults that the calling data pipeline overrides at run time — confirm.
# Deployment stage; interpolated into the workspace name and lakehouse folder name in the local-parameters cell.
stage = "dev"
# OneLake account name; becomes the host prefix of the abfss:// URI ("<onelake_name>.dfs.fabric.microsoft.com").
onelake_name = "daily-onelake"
# Target lakehouse name, without the ".lakehouse" suffix (the suffix is appended when the path is built).
lakehouse_name = "lh_main"

StatementMeta(, 3e181d44-f0b6-406c-a2a1-e7eaf98267e8, 14, Finished, Available)

In [None]:
# Echo the external parameters (one per line, as name/repr pairs) so the
# values actually used by this run are visible in the cell output.
print(
    f"{stage = }",
    f"{onelake_name =}",
    f"{lakehouse_name = }",
    sep="\n",
)

## Local parameters

In [None]:
# Complete (absolute) paths: this keeps the notebook independent of the local workspace, so it can
# target any workspace and any lakehouse as long as we have the proper access.
workspace_name = f"ws-fabric-cicd-{stage}"
# NOTE(review): lakehouse_folder_name is not referenced by the cells shown here — presumably used further down; confirm.
lakehouse_folder_name = f"{stage}_covid_data"

print(f"{workspace_name = }")


# Microsoft Open Datasets - COVID-19 data (every path below lives under curated/covid-19/).
# NOTE(review): this comment originally cited the NYC Safety dataset page
# (https://learn.microsoft.com/en-us/azure/open-datasets/dataset-new-york-city-safety?tabs=pyspark),
# which looks like a copy/paste leftover; point it at the matching COVID-19 dataset documentation.
# Azure storage access info
blob_account_name = "pandemicdatalake"
blob_container_name = "public"
# Note the trailing "/" - anything joined onto wasbs_path with another "/" will contain "//".
blob_relative_path = "curated/covid-19/"
# Empty SAS token - presumably sufficient because the container is public; confirm.
blob_sas_token = r""
# (source label, parquet path relative to blob_relative_path) for each file to ingest.
covid_data_sources = (
    ("ECDC", "ecdc_cases/latest/ecdc_cases.parquet"), 
    ("bing", "bing_covid-19_data/latest/bing_covid-19_data.parquet"),
    ("oxford", "covid_policy_tracker/latest/covid_policy_tracker.parquet")  
)

# Absolute OneLake URI of the target lakehouse, plus its Files and Tables roots.
onelake_path = f"abfss://{workspace_name}@{onelake_name}.dfs.fabric.microsoft.com/{lakehouse_name}.lakehouse"
onelake_file_path = f"{onelake_path}/Files"
onelake_table_path = f"{onelake_path}/Tables"

# Allow Spark remote read: build the wasbs:// URI of the source folder and register the SAS token
# under the Spark config key for this container/account pair.
wasbs_path = f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set( f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net", blob_sas_token)

## Check source and target locations

In [None]:
# Check that the target lakehouse Files path exists - otherwise abort notebook execution.
# The exit value names the path that was actually checked (the Files path) and the
# parameters it was derived from, so a misconfigured run is easy to diagnose.
error_message = f"Specified lakehouse file path {onelake_file_path} doesn't exist. Ensure onelake={onelake_name}, workspace={workspace_name} and lakehouse={lakehouse_name} exist."

# Keep the try body to the single call that can fail, and record *why* the target
# is unusable instead of raising an exception only to catch it again.
check_error = None
try:
    target_exists = mssparkutils.fs.exists(onelake_file_path)
except Exception as e:
    # Any failure of the existence check itself (bad workspace name, missing
    # permissions, ...) is treated the same as "target not available".
    target_exists = False
    check_error = e

if not target_exists:
    if check_error is not None:
        print(f"Error message: {check_error}")
    else:
        print("Error message: Specified lakehouse file path does not exist.")
    # no further execution but Session is still active
    mssparkutils.notebook.exit(error_message)
else:
    print(f"Target folder path: {onelake_file_path} is valid and exists.")
    print("Listing source data contents to check connectivity")
    # Deliberately unguarded: if the remote source cannot be listed, the
    # exception propagates and fails the cell.
    print(mssparkutils.fs.ls(wasbs_path))

## Copy data from remote source to OneLake

In [None]:
# Take ONE UTC timestamp and derive both components from it. Two separate
# now() calls could straddle midnight on the last day of a month and yield a
# date and a month that do not belong together.
run_timestamp = datetime.now(timezone.utc)
current_date = run_timestamp.strftime("%Y-%m-%d")
current_month = run_timestamp.strftime("%Y-%m")


# Raw files land under <Files>/raw_covid_data/<source>/<YYYY-MM>/<YYYY-MM-DD>.parquet.
# NOTE: existing contents are not deleted here; each run only writes today's file per source.
file_path_prefix = f"{onelake_file_path}/raw_covid_data"

# wasbs_path is built from a relative path that ends in "/", so strip it
# before joining to avoid a "//" in the middle of the source URI.
source_root = wasbs_path.rstrip("/")

for source, source_path in covid_data_sources:
    source_file = f"{source_root}/{source_path}"
    target_file = f"{file_path_prefix}/{source}/{current_month}/{current_date}.parquet"
    print(f"Copying {source_file = } to {target_file =} for {source = }.")
    # Intended as an overwrite - acceptable because there is only one "latest" file per source for any given day.
    # NOTE(review): confirm that mssparkutils.fs.cp overwrites an existing target on a same-day re-run.
    mssparkutils.fs.cp(source_file, target_file)

print(f"\n=====\nCovid data is copied to {file_path_prefix} for {current_date =}.\n=====")