### Parameter Setup

In [None]:
# Read the Fabric tenant value from the Spark session configuration.
fabric_tenant_conf_key = "spark.fsd.fabric.tenant"
fabric_tenant = spark.conf.get(fabric_tenant_conf_key)


### Load ingested data from staging zone

In [None]:
import pandas as pd

# Read the staged Yellow Taxi trip records (parquet) into a pandas DataFrame.
year = "2022"
staging_path = "02_staging"

trip_data_path = f"/lakehouse/default/Files/{staging_path}/yellow_taxi_tripdata_{year}.parquet"
pd_df = pd.read_parquet(trip_data_path, engine="pyarrow")

# Preview the first rows as the cell output.
pd_df.head()


In [None]:
# Read the taxi zone lookup table (CSV) from the landing zone.
landing_path = "01_landing"
zones_csv_path = f"/lakehouse/default/Files/{landing_path}/taxi_zone_lookup.csv"
zones_df = pd.read_csv(zones_csv_path)

# Preview the first rows as the cell output.
zones_df.head()


In [None]:
# Read the NYC weather CSV for the selected year from the landing zone.
weather_csv_path = f"/lakehouse/default/Files/{landing_path}/nyc_weather_{year}.csv"
weather_df = pd.read_csv(weather_csv_path)

# Preview the first rows as the cell output.
weather_df.head()


### Data cleansing

In [None]:
# Discard trips that have no passenger_count value.
trip_required_columns = ["passenger_count"]
pd_df = pd_df.dropna(subset=trip_required_columns)

# Show how many nulls remain in each column.
pd_df.isna().sum()


In [None]:
# Remove trips whose pickup (PULocationID) or drop-off (DOLocationID) location ID is one of
# the "unknown" codes 264 and 265.
# The rows are selected with a single value-based boolean mask instead of dropping by index
# label: a label-based drop removes every row that shares a label with a matching row, so it
# would over-delete if the index ever held duplicates, and two successive drops copy the
# frame twice.
unknown_location_ids = [264, 265]
has_unknown_location = (
    pd_df["PULocationID"].isin(unknown_location_ids)
    | pd_df["DOLocationID"].isin(unknown_location_ids)
)
pd_df = pd_df[~has_unknown_location]


In [None]:
# Discard weather rows that have no 'temperature_2m (°C)' value.
weather_required_columns = ["temperature_2m (°C)"]
weather_df = weather_df.dropna(subset=weather_required_columns)

# Show how many nulls remain in each column.
weather_df.isna().sum()


In [None]:
# Rename the weather measurement columns to snake_case names without symbols, then keep only
# 'time' plus the renamed columns (in the order listed in the mapping).
# rename() is used without inplace=True so the cell builds a new frame instead of mutating
# the existing one; labels missing from the frame are ignored by rename's default settings,
# so re-running the cell on an already-renamed frame still works.
weather_column_names = {
    'temperature_2m (°C)': 'temperature_2m_c',
    'precipitation (mm)': 'precipitation_mm',
    'cloud_cover (%)': 'cloudcover_percentage',
    'wind_speed_10m (km/h)': 'windspeed_10m_km_per_hour',
}
weather_df = weather_df.rename(columns=weather_column_names)
weather_df = weather_df[['time', *weather_column_names.values()]]


In [None]:
# Discard zone rows that have no service_zone value.
zones_required_columns = ["service_zone"]
zones_df = zones_df.dropna(subset=zones_required_columns)

# Show how many nulls remain in each column.
zones_df.isna().sum()


### Sink cleaned data to standard zone

In [None]:
# Write the cleansed trip, weather and zone frames as parquet files into the standard zone
# folder of the lakehouse, creating that folder first.
standard_path = "03_standard"
mssparkutils.fs.mkdirs(f"Files/{standard_path}")

# NOTE(review): to_parquet is called with its default index handling. After the row filters
# earlier in this notebook the frames presumably no longer have a plain RangeIndex, which
# would add the index to the files as an extra column - confirm whether downstream readers
# expect that before switching to index=False.
sink_targets = [
    (pd_df, f"/lakehouse/default/Files/{standard_path}/cleaned_yellow_taxi_tripdata_{year}.parquet"),
    (weather_df, f"/lakehouse/default/Files/{standard_path}/cleaned_nyc_weather_{year}.parquet"),
    (zones_df, f"/lakehouse/default/Files/{standard_path}/nyc_zones.parquet"),
]
for cleaned_df, sink_file in sink_targets:
    cleaned_df.to_parquet(sink_file)


### Register data assets and lineage of data pipeline to Purview

In [None]:
%run data_catalog_and_lineage

In [None]:
%run utils

In [None]:
# Describe this notebook's inputs, outputs and the notebook itself as data assets, link them
# into one lineage record and register that record with Purview.
# PurviewDataCatalog, DataAsset, DataLineage and get_onelake_info are not defined in this
# cell; presumably they are provided by the notebooks loaded with %run earlier - confirm.
purview_data_catalog = PurviewDataCatalog()

# Base abfss:// URI of the lakehouse "Files" area; every asset location below is built on it.
fabric_onelake_tenant, fabric_workspace_id, fabric_lakehouse_id = get_onelake_info()
onelake_base_path = f"abfss://{fabric_workspace_id}@{fabric_onelake_tenant}.dfs.fabric.microsoft.com/{fabric_lakehouse_id}/Files"

# Source data assets: the staged trip parquet plus the zone and weather CSVs from landing.
# NOTE(review): DataAsset's positional arguments look like (name, type, location) - verify
# against its definition.
trip_data_source_file = f"yellow_taxi_tripdata_{year}.parquet"
source_data_asset_1 = DataAsset(trip_data_source_file,
                                "parquet",
                                f"{onelake_base_path}/{staging_path}/{trip_data_source_file}")

zones_data_source_file = "taxi_zone_lookup.csv"
source_data_asset_2 = DataAsset(zones_data_source_file,
                                "csv",
                                f"{onelake_base_path}/{landing_path}/{zones_data_source_file}")

nycweather_data_source_file = f"nyc_weather_{year}.csv"
source_data_asset_3 = DataAsset(nycweather_data_source_file,
                                "csv",
                                f"{onelake_base_path}/{landing_path}/{nycweather_data_source_file}")

source_data_assets = [source_data_asset_1, source_data_asset_2, source_data_asset_3]

# Sink data assets: the three cleansed parquet files written to the standard zone.
cleaned_trip_data_file = f"cleaned_yellow_taxi_tripdata_{year}.parquet"
sink_data_asset_1 = DataAsset(cleaned_trip_data_file,
                              "parquet",
                              f"{onelake_base_path}/{standard_path}/{cleaned_trip_data_file}")

cleaned_zones_data = "nyc_zones.parquet"
sink_data_asset_2 = DataAsset(cleaned_zones_data,
                              "parquet",
                              f"{onelake_base_path}/{standard_path}/{cleaned_zones_data}")

cleaned_nycweather_data = f"cleaned_nyc_weather_{year}.parquet"
sink_data_asset_3 = DataAsset(cleaned_nycweather_data,
                              "parquet",
                              f"{onelake_base_path}/{standard_path}/{cleaned_nycweather_data}")

sink_data_assets = [sink_data_asset_1, sink_data_asset_2, sink_data_asset_3]

# Process data asset: this notebook, addressed by its URL in the Fabric workspace.
current_notebook_context = mssparkutils.notebook.nb.context
notebook_id = current_notebook_context["currentNotebookId"]
process_data_asset = DataAsset("data_cleansing (Fabric notebook)",
                               "process",
                               f"https://{fabric_tenant}.powerbi.com/groups/{fabric_workspace_id}/synapsenotebooks/{notebook_id}")

# Create lineage for the data pipeline (sources -> notebook process -> sinks).
data_pipeline_lineage = DataLineage(source_data_assets, sink_data_assets, process_data_asset)

# Register lineage of the data pipeline to Purview.
purview_data_catalog.register_lineage(data_pipeline_lineage)
