# Parameters Setup

In [None]:
# Feature store coordinates, read from the Spark session configuration.
featurestore_subscription_id = spark.conf.get("spark.fsd.subscription_id")
featurestore_resource_group_name = spark.conf.get("spark.fsd.rg_name")
featurestore_name = spark.conf.get("spark.fsd.name")

# Data year and lakehouse sub-folder; both are interpolated into the source CSV paths below.
year = "2022"
standard_path = "03_standard"

# The service principal should be able to access the feature store.
client_id = spark.conf.get("spark.fsd.client_id")
tenant_id = spark.conf.get("spark.fsd.tenant_id")
# Placeholder: empty in source. Presumably the service principal's secret must be supplied
# here before running (TODO confirm) -- never commit a real secret; prefer a secret store.
client_secret = ""

In [None]:
%run utils

In [None]:
# get_onelake_info is not defined in the visible cells -- presumably it is provided by the
# `utils` notebook run above (TODO confirm). Its three return values are interpolated into
# the OneLake abfss:// source paths further down.
fabric_onelake_tenant, fabric_workspace_id, fabric_lakehouse_id = get_onelake_info()


# Obtain Credentials and Initialize Clients

In [None]:
from azure.identity import ClientSecretCredential

# Service principal credential built from the tenant/client IDs and secret set in the
# parameters cell; it is passed to both feature store clients below.
credential = ClientSecretCredential(
    tenant_id=tenant_id,
    client_id=client_id,
    client_secret=client_secret
)

In [None]:
# feature store client
from azureml.featurestore import FeatureStoreClient
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
    FeatureStore,
    FeatureStoreEntity,
    FeatureSet,
)

# Feature store SDK client: used later to fetch registered feature sets.
featurestore = FeatureStoreClient(
    credential=credential,
    subscription_id=featurestore_subscription_id,
    resource_group_name=featurestore_resource_group_name,
    name=featurestore_name,
)

# Azure ML client scoped to the feature store: used later to register entities and
# feature sets. The feature store name is passed in the workspace_name slot.
fs_client = MLClient(
    credential=credential,
    subscription_id=featurestore_subscription_id,
    resource_group_name=featurestore_resource_group_name,
    workspace_name=featurestore_name,
)

# Create Entities & Feature Sets

### Create temporary feature set folders

In [None]:
import time
import os

# Version string is the current Unix time in whole seconds.
nyctaxi_featureset_version = f"{int(time.time())}"

# Relative working folder for this version; the spec YAML goes in its "spec" sub-folder.
featureset_folder = "featuresets/nyctaxi_fs_1/" + nyctaxi_featureset_version
os.makedirs(featureset_folder + "/spec", exist_ok=True)

print("the version is: " + nyctaxi_featureset_version)
print("feature set folder is " + featureset_folder)

In [None]:
# NYC weather feature set: version is the current Unix time in whole seconds, and the
# relative working folder gets a "spec" sub-folder for the spec YAML.
nycweather_featureset_version = f"{int(time.time())}"
nycweather_featureset_folder = "featuresets/nycweather_fs_1/" + nycweather_featureset_version
os.makedirs(nycweather_featureset_folder + "/spec", exist_ok=True)


### Create FeatureSetSpec.yaml

In [None]:
# Both source CSVs sit in the same "Files/<standard_path>" folder of the lakehouse,
# so the shared abfss:// prefix is built once.
onelake_files_root = (
    f"abfss://{fabric_workspace_id}@{fabric_onelake_tenant}.dfs.fabric.microsoft.com"
    f"/{fabric_lakehouse_id}/Files/{standard_path}"
)
nyctaxi_table_path = f"{onelake_files_root}/transformed_yellow_taxi_tripdata_{year}.csv"
nycweather_table_path = f"{onelake_files_root}/transformed_nyc_weather_{year}.csv"


In [None]:
# Write the NYC taxi feature set spec: a CSV source at nyctaxi_table_path with
# pickup_timestamp as the timestamp column, five integer features, and borough_id as
# the index column. The text between the triple quotes is written verbatim.
with open(f"{featureset_folder}/spec/FeatureSetSpec.yaml", "w") as spec_yaml:
    spec_yaml.write(f"""
$schema: http://azureml/sdk-2-0/FeatureSetSpec.json

source:
  type: csv
  path: {nyctaxi_table_path}
  timestamp_column: 
    name: pickup_timestamp
features: 
  - name: hour_pickup
    type: integer
  - name: day_pickup
    type: integer
  - name: weekday_pickup
    type: integer
  - name: month_pickup
    type: integer
  - name: demand
    type: integer
index_columns:
  - name: borough_id
    type: integer
    """)

In [None]:
# Echo the taxi spec file back so the generated YAML can be inspected.
spec_file_path = f"{featureset_folder}/spec/FeatureSetSpec.yaml"
with open(spec_file_path) as spec_yaml:  # default mode is read-only text
    spec_text = spec_yaml.read()
    print(spec_text)

In [None]:
# Write the NYC weather feature set spec: a CSV source at nycweather_table_path with
# `time` as the timestamp column, eight features (integer and double), and `id` as the
# index column. The text between the triple quotes is written verbatim.
with open(f"{nycweather_featureset_folder}/spec/FeatureSetSpec.yaml", "w") as spec_yaml:
    spec_yaml.write(f"""
$schema: http://azureml/sdk-2-0/FeatureSetSpec.json

source:
  type: csv
  path: {nycweather_table_path}
  timestamp_column: 
    name: time
features:
  - name: hour
    type: integer
  - name: day
    type: integer
  - name: month
    type: integer
  - name: year
    type: integer
  - name: temperature_2m_c
    type: double
  - name: precipitation_mm
    type: double
  - name: cloudcover_percentage
    type: integer
  - name: windspeed_10m_km_per_hour
    type: double
index_columns:
  - name: id
    type: integer
    """)


### Register Entities

In [None]:
from azure.ai.ml.entities import DataColumn, DataColumnType

# Entity describing the "borough" index key (integer column borough_id), versioned with
# the taxi feature set version. The variable keeps its original name even though the
# entity it holds is the borough entity.
vendor_entity_config = FeatureStoreEntity(
    name="borough",
    version=nyctaxi_featureset_version,
    index_columns=[DataColumn(name="borough_id", type=DataColumnType.INTEGER)],
    stage="Development",
    description="This entity represents index key of Boroughs.",
    # Tag key spelled "data_type" (was "data_typ") to match the tag on the nyctaxi feature set.
    tags={"data_type": "nonPII"},
)

# Submit the entity to the feature store and print the poller's result.
poller = fs_client.feature_store_entities.begin_create_or_update(vendor_entity_config)
print(poller.result())

In [None]:
# Entity describing the NYC weather index key (integer column id), versioned with the
# weather feature set version.
nycweather_entity_config = FeatureStoreEntity(
    name="nycweather_id",
    version=nycweather_featureset_version,
    index_columns=[DataColumn(name="id", type=DataColumnType.INTEGER)],
    stage="Development",
    description="This entity represents index key of NYC weather data.",
    # Tag key spelled "data_type" (was "data_typ") to match the tag on the nyctaxi feature set.
    tags={"data_type": "nonPII"},
)

# Submit the entity to the feature store and print the poller's result.
poller = fs_client.feature_store_entities.begin_create_or_update(nycweather_entity_config)
print(poller.result())


### Register Feature Sets

In [None]:
from azure.ai.ml.entities import FeatureSetSpecification

# Folder that holds the taxi FeatureSetSpec.yaml, and the reference to the "borough"
# entity registered with the same version string.
nyctaxi_featureset_spec_folder = f"{featureset_folder}/spec"
nyctaxi_entity_ref = f"azureml:borough:{nyctaxi_featureset_version}"
nyctaxi_specification = FeatureSetSpecification(path=nyctaxi_featureset_spec_folder)

nyctaxi_fset_config = FeatureSet(
    name="nyctaxi",
    version=nyctaxi_featureset_version,
    description="nyc taxi data",
    entities=[nyctaxi_entity_ref],
    stage="Development",
    specification=nyctaxi_specification,
    tags={"data_type": "nonPII"},
)

# Submit the feature set to the feature store and print the poller's result.
poller = fs_client.feature_sets.begin_create_or_update(nyctaxi_fset_config)
nyctaxi_registration_result = poller.result()
print(nyctaxi_registration_result)

In [None]:
# NYC weather feature set: spec folder written earlier, linked to the "nycweather_id"
# entity registered with the same version string.
nycweather_featureset_spec_folder = f"{nycweather_featureset_folder}/spec"
nycweather_entity_ref = f"azureml:nycweather_id:{nycweather_featureset_version}"
nycweather_specification = FeatureSetSpecification(path=nycweather_featureset_spec_folder)

nycweather_fset_config = FeatureSet(
    name="nycweather",
    version=nycweather_featureset_version,
    description="nyc weather data",
    entities=[nycweather_entity_ref],
    stage="Development",
    specification=nycweather_specification,
)

# Submit the feature set to the feature store and print the poller's result.
poller = fs_client.feature_sets.begin_create_or_update(nycweather_fset_config)
nycweather_registration_result = poller.result()
print(nycweather_registration_result)


### Try to Retrieve Features

In [None]:
# Fetch the "nyctaxi" feature set at the version created in this run; the bare last
# expression shows its features as the cell output.
nyctaxi_fset = featurestore.feature_sets.get("nyctaxi", nyctaxi_featureset_version)
nyctaxi_fset.features

In [None]:
# Preview the first five rows. limit(5) keeps the result a Spark DataFrame so display()
# can render it as a table; head(5) would instead collect a plain list of Row objects.
df = nyctaxi_fset.to_spark_dataframe().limit(5)
display(df)

In [None]:
# Fetch the "nycweather" feature set at the version created in this run; the bare last
# expression shows its features as the cell output.
nycweather_fset = featurestore.feature_sets.get("nycweather", nycweather_featureset_version)
nycweather_fset.features


In [None]:
# Preview the first five rows. limit(5) keeps the result a Spark DataFrame so display()
# can render it as a table; head(5) would instead collect a plain list of Row objects.
df = nycweather_fset.to_spark_dataframe().limit(5)
display(df)


### Register Managed Feature Store (MFS) feature lineage in Purview

In [None]:
%run data_catalog_and_lineage

In [None]:
# Build one lineage record per feature set: source file -> (optional transformation) -> features.
# PurviewDataCatalog, DataAsset and DataLineage are not defined in the visible cells --
# presumably they come from the `data_catalog_and_lineage` notebook run above (TODO confirm).
purview_data_catalog = PurviewDataCatalog()

# Fetch both feature sets here so this cell does not rely on the preview cells having run.
nyctaxi_fset = featurestore.feature_sets.get("nyctaxi", nyctaxi_featureset_version)
nycweather_fset = featurestore.feature_sets.get("nycweather", nycweather_featureset_version)

features_lineage = []
for fset in [nyctaxi_fset, nycweather_fset]:
    # Source asset: the full source path is the qualified name, the file name is the
    # asset name, and the file extension (e.g. "csv") is the asset type.
    data_source_path = fset.source.path
    source_qualified_name = data_source_path
    source_name = data_source_path.split("/")[-1]
    source_type = source_name.split(".")[-1]
    data_source_assets = [DataAsset(source_name, source_type, source_qualified_name)]

    # Feature assets: one request covering every feature declared on the feature set.
    target_features = [feat.name for feat in fset.features]
    feature_assets = purview_data_catalog.prepare_feature_assets(
        featurestore_name,
        fset,
        target_features,
        tenant_id=tenant_id,
        subscription_id=featurestore_subscription_id,
        resource_group=featurestore_resource_group_name,
    )

    # Process asset: only when the feature set carries transformation code. The attribute
    # is read from `fset` (the bare name was undefined) and the result is stored in
    # `process_asset`, the variable actually passed to DataLineage below.
    process_asset = None
    transformation_code = fset.feature_transformation_code
    if transformation_code:
        transformer_class = transformation_code.transformer_class
        process_asset = DataAsset(
            transformer_class,
            "process",
            f"{transformation_code.path}/{transformer_class}",
        )

    feature_lineage = DataLineage(
        input_data_assets=data_source_assets,
        output_data_assets=feature_assets,
        process_asset=process_asset,
    )
    features_lineage.append(feature_lineage)


In [None]:
# Register each lineage record collected in features_lineage through the catalog helper.
for lineage in features_lineage:
    purview_data_catalog.register_lineage(lineage)
