### Parameter Setup

In [None]:
# Read the Fabric tenant name from the Spark session configuration; it is used
# later in this notebook to build the notebook URL registered as the lineage process.
fabric_tenant_conf_key = "spark.fsd.fabric.tenant"
fabric_tenant = spark.conf.get(fabric_tenant_conf_key)


### Load cleaned data from standardization zone

In [None]:
import pandas as pd
import numpy as np

# Dataset year and standardization-zone folder name; both are reused by later
# cells when building input and output paths.
year = "2022"
standard_path = "03_standard"

# Read the cleaned Yellow Taxi trip records (parquet) into a pandas dataframe
trip_data_path = f"/lakehouse/default/Files/{standard_path}/cleaned_yellow_taxi_tripdata_{year}.parquet"
pd_df = pd.read_parquet(trip_data_path, engine="pyarrow")
pd_df.head()


In [None]:
# Read the cleansed NYC weather data (parquet) from the standardization zone
weather_data_path = f"/lakehouse/default/Files/{standard_path}/cleaned_nyc_weather_{year}.parquet"
weather_df = pd.read_parquet(weather_data_path)
weather_df.head()


In [None]:
# Read the taxi location zones lookup (parquet) from the standardization zone
zones_data_path = f"/lakehouse/default/Files/{standard_path}/nyc_zones.parquet"
zones_df = pd.read_parquet(zones_data_path)
zones_df.head()


### Data transformation

In [None]:
# Derive month, day, weekday and hour-of-day columns from the pickup and dropoff
# timestamps. Each entry is (new column, source datetime column, `.dt` attribute);
# the order below is the order in which the columns are added to the dataframe.
derived_time_columns = (
    ("month_pickup", "tpep_pickup_datetime", "month"),
    ("month_dropoff", "tpep_dropoff_datetime", "month"),
    ("day_pickup", "tpep_pickup_datetime", "day"),
    ("day_dropoff", "tpep_dropoff_datetime", "day"),
    ("weekday_pickup", "tpep_pickup_datetime", "weekday"),
    ("weekday_dropoff", "tpep_dropoff_datetime", "weekday"),
    ("hour_pickup", "tpep_pickup_datetime", "hour"),
    ("hour_dropoff", "tpep_dropoff_datetime", "hour"),
)

for target_column, source_column, date_part in derived_time_columns:
    # All derived values are small non-negative integers, so store them as uint8
    pd_df[target_column] = getattr(pd_df[source_column].dt, date_part).astype(np.uint8)

pd_df.head()


In [None]:
# Attach the zone attributes to every trip by matching the trip's pickup
# location ID (PULocationID) against the zones table's LocationID.
zones_by_location_id = zones_df.set_index("LocationID")
pd_df = pd_df.join(zones_by_location_id, on="PULocationID")
pd_df.head()


In [None]:
# Aggregate taxi demand per pickup hour, day, weekday, month and borough:
# `demand` is the number of trips in each group and `pickup_timestamp` is the
# first pickup timestamp found in that group.
demand_group_keys = ["hour_pickup", "day_pickup", "weekday_pickup", "month_pickup", "Borough"]
demand_by_group = pd_df.groupby(demand_group_keys).agg(
    demand=("hour_pickup", "size"),
    pickup_timestamp=("tpep_pickup_datetime", "first"),
)

# Turn the group keys back into regular columns
transformed_df = demand_by_group.reset_index()


In [None]:
# Encode each borough as an integer ID: its position in the array of unique
# boroughs taken from the zones data (order of first appearance).
borough_array = zones_df["Borough"].unique()

# Build the borough -> ID lookup once, instead of converting the array to a list
# and scanning it linearly for every aggregated row. A borough that is missing
# from the zones data fails loudly with a KeyError rather than being encoded silently.
borough_to_id = {borough: position for position, borough in enumerate(borough_array)}

transformed_df["borough_id"] = [borough_to_id[borough] for borough in transformed_df["Borough"]]


In [None]:
# Parse the `time` column into datetimes so the `.dt` accessor can be used below
weather_df["time"] = pd.to_datetime(weather_df["time"])
weather_time = weather_df["time"]

# Derive year column from the time column of the NYC weather data
weather_df["year"] = weather_time.dt.year

# Derive month column
weather_df["month"] = weather_time.dt.month.astype(np.uint8)

# Derive day-of-month column
weather_df["day"] = weather_time.dt.day.astype(np.uint8)

# Derive hour-of-day column
weather_df["hour"] = weather_time.dt.hour.astype(np.uint8)

# Derive an ID column formatted as year, month, day and hour (YYYYMMDDHH)
weather_df["id"] = weather_time.dt.strftime('%Y%m%d%H')

weather_df.head()


### Sink transformed data to standardization zone

In [None]:
# Make sure the 03_standard folder exists in the lakehouse before writing into it
mssparkutils.fs.mkdirs(f"Files/{standard_path}")

# Sink the transformed trip and weather records as CSV files (without the
# dataframe index) to the standardization path of Fabric OneLake
csv_outputs = (
    (transformed_df, f"/lakehouse/default/Files/{standard_path}/transformed_yellow_taxi_tripdata_{year}.csv"),
    (weather_df, f"/lakehouse/default/Files/{standard_path}/transformed_nyc_weather_{year}.csv"),
)

for output_df, output_path in csv_outputs:
    output_df.to_csv(output_path, index=False)


### Register data assets and lineage of target data pipeline to Purview

In [None]:
%run data_catalog_and_lineage

In [None]:
%run utils

In [None]:
purview_data_catalog = PurviewDataCatalog()

# Resolve the OneLake coordinates and build the base path of the lakehouse Files area
fabric_onelake_tenant, fabric_workspace_id, fabric_lakehouse_id = get_onelake_info()
onelake_base_path = f"abfss://{fabric_workspace_id}@{fabric_onelake_tenant}.dfs.fabric.microsoft.com/{fabric_lakehouse_id}/Files"


def _standard_zone_asset(file_name, file_type):
    """Build a DataAsset for a file stored in the standardization-zone folder.

    The asset is created from the file name, the given type string and the
    file's full OneLake path under `standard_path`.
    """
    return DataAsset(file_name,
                     file_type,
                     f"{onelake_base_path}/{standard_path}/{file_name}")


# Source data assets: the three parquet files this notebook reads
source_data_assets = [
    _standard_zone_asset(f"cleaned_yellow_taxi_tripdata_{year}.parquet", "parquet"),
    _standard_zone_asset("nyc_zones.parquet", "parquet"),
    _standard_zone_asset(f"cleaned_nyc_weather_{year}.parquet", "parquet"),
]

# Sink data assets: the two CSV files this notebook writes
sink_data_assets = [
    _standard_zone_asset(f"transformed_yellow_taxi_tripdata_{year}.csv", "csv"),
    _standard_zone_asset(f"transformed_nyc_weather_{year}.csv", "csv"),
]

# Process data asset: this notebook, identified by its URL in the Fabric workspace
current_notebook_context = mssparkutils.notebook.nb.context
notebook_id = current_notebook_context["currentNotebookId"]
process_data_asset = DataAsset("data_transformation (Fabric notebook)",
                               "process",
                               f"https://{fabric_tenant}.powerbi.com/groups/{fabric_workspace_id}/synapsenotebooks/{notebook_id}")

# Describe the pipeline as sources -> process -> sinks and register it in Purview
data_pipeline_lineage = DataLineage(source_data_assets, sink_data_assets, process_data_asset)
purview_data_catalog.register_lineage(data_pipeline_lineage)
