### Parameters Setup

In [None]:
# Read the Fabric tenant name from the Spark session configuration key
# "spark.fsd.fabric.tenant". It is used further down to build the notebook URL
# registered as the process asset of the lineage.
# NOTE(review): no default is passed, so this presumably fails when the key is
# not set on the session — confirm the key is always provided.
fabric_tenant = spark.conf.get("spark.fsd.fabric.tenant")


### Load the landed Yellow Taxi Trip Records data from the landing zone

In [None]:
import pandas as pd

# Load the twelve monthly Yellow Taxi Trip Records parquet files from the
# landing zone and combine them into a single pandas DataFrame.
year = "2022"
landing_path = "01_landing"

# Read every month into its own frame and concatenate once at the end.
# Concatenating inside the loop re-copies all previously loaded rows on each
# iteration, and seeding with an empty DataFrame lets that empty frame take
# part in the dtype resolution of the result.
monthly_dfs = []
for index in range(1, 13):
    month = str(index).zfill(2)  # zero-padded month number: "01" .. "12"
    monthly_df = pd.read_parquet(
        f"/lakehouse/default/Files/{landing_path}/yellow_tripdata_{year}-{month}.parquet",
        engine="pyarrow",
    )
    monthly_dfs.append(monthly_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
pd_df = pd.concat(monthly_dfs, ignore_index=True)

# Show the number of non-null values per column as a quick sanity check
pd_df.count()


In [None]:
# Write the combined trip records as a single parquet file under the
# 02_staging folder of the lakehouse (Fabric OneLake).
staging_path = "02_staging"
staging_dir = f"Files/{staging_path}"
staging_file = f"/lakehouse/default/Files/{staging_path}/yellow_taxi_tripdata_{year}.parquet"

# Create the staging folder before writing into it
mssparkutils.fs.mkdirs(staging_dir)
pd_df.to_parquet(staging_file)


### Register the data assets and data pipeline lineage in Purview

In [None]:
%run data_catalog_and_lineage

In [None]:
%run utils

In [None]:
# Catalog client used at the end of this cell to register the lineage
purview_data_catalog = PurviewDataCatalog()

# Resolve the OneLake identifiers and derive the abfss root of the lakehouse "Files" area
fabric_onelake_tenant, fabric_workspace_id, fabric_lakehouse_id = get_onelake_info()
onelake_base_path = f"abfss://{fabric_workspace_id}@{fabric_onelake_tenant}.dfs.fabric.microsoft.com/{fabric_lakehouse_id}/Files"

# Source assets: one landed parquet file per month ("01" .. "12") of `year`
months = [str(number).zfill(2) for number in range(1, 13)]
source_data_assets = [
    DataAsset(
        source_file_name,
        "parquet",
        f"{onelake_base_path}/{landing_path}/{source_file_name}",
    )
    for source_file_name in (f"yellow_tripdata_{year}-{month}.parquet" for month in months)
]

# Sink assets: the single concatenated parquet file written to the staging folder
sink_file_name = f"yellow_taxi_tripdata_{year}.parquet"
sink_data_asset = DataAsset(
    sink_file_name,
    "parquet",
    f"{onelake_base_path}/{staging_path}/{sink_file_name}",
)
sink_data_assets = [sink_data_asset]

# Process asset: this notebook, identified by its URL in the Fabric workspace.
# The display name is a fixed literal; only the notebook id is read from the
# runtime context.
notebook_id = mssparkutils.notebook.nb.context["currentNotebookId"]
notebook_url = f"https://{fabric_tenant}.powerbi.com/groups/{fabric_workspace_id}/synapsenotebooks/{notebook_id}"
process_data_asset = DataAsset("data_ingestion (Fabric notebook)", "process", notebook_url)

# Tie sources, sinks and process together, then register the lineage in Purview
data_pipeline_lineage = DataLineage(source_data_assets, sink_data_assets, process_data_asset)
purview_data_catalog.register_lineage(data_pipeline_lineage)


## 