In [23]:
import os
from datetime import datetime

import pandas as pd
from feast import FeatureStore
from datetime import datetime, timedelta
import numpy as np

In [24]:
# Root of the Feast feature repository (relative path, resolved against the working directory).
feature_store_path = os.path.join("feature_store", "feature_repo")
# Parquet file holding the raw driver statistics, located in the repository's "data" folder.
raw_data_path = os.path.join(feature_store_path, "data", "driver_stats.parquet")

### Check data

In [25]:
# Read the raw driver statistics parquet file into a DataFrame so it can be inspected.
df = pd.read_parquet(raw_data_path)

In [26]:
# Preview the first five rows of the raw data (rendered as the cell output).
df.head(5)

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2024-10-17 12:07:08.228578+00:00,1001,1.0,1.0,1000,2024-10-17 12:07:08.228581
1,2024-10-02 11:00:00+00:00,1005,0.429879,0.194598,582,2024-10-17 11:30:07.072000
2,2024-10-02 12:00:00+00:00,1005,0.230119,0.642878,551,2024-10-17 11:30:07.072000
3,2024-10-02 13:00:00+00:00,1005,0.1286,0.674187,38,2024-10-17 11:30:07.072000
4,2024-10-02 14:00:00+00:00,1005,0.400603,0.473636,583,2024-10-17 11:30:07.072000


### Features inference 

In [27]:
# Entity rows used for feature retrieval: each driver id is paired with one timestamp.
# "driver_id" is the entity's join key column.
entity_driver_ids = [1001, 1002, 1003]
# "event_timestamp" is the reserved column name carrying the timestamp of each row.
entity_timestamps = [
    datetime(2021, 4, 12, 10, 59, 42),
    datetime(2021, 4, 12, 8, 12, 10),
    datetime(2021, 4, 12, 16, 40, 26),
]
entity_df = pd.DataFrame(
    {
        "driver_id": entity_driver_ids,
        "event_timestamp": entity_timestamps,
    }
)

In [28]:
# Display the entity rows that the retrieval calls below take as input.
entity_df

Unnamed: 0,driver_id,event_timestamp
0,1001,2021-04-12 10:59:42
1,1002,2021-04-12 08:12:10
2,1003,2021-04-12 16:40:26


In [29]:
# Create the Feast FeatureStore handle from the repository at feature_store_path;
# every retrieval and metadata call below goes through this object.
store = FeatureStore(repo_path=feature_store_path)



### Historical data for training

In [30]:
# Feature references in "<feature_view>:<feature>" form for the base training set.
base_feature_refs = [
    "driver_quality_stats:conv_rate",
    "driver_quality_stats:acc_rate",
    "driver_activity_stats:avg_daily_trips",
]

# Request historical values of those features for the rows in entity_df,
# then materialize the retrieval result as a pandas DataFrame.
base_retrieval = store.get_historical_features(
    entity_df=entity_df,
    features=base_feature_refs,
)
training_df = base_retrieval.to_df()



In [31]:
# Preview the retrieved training frame (entity columns plus the requested features).
training_df.head()

Unnamed: 0,driver_id,event_timestamp,conv_rate,acc_rate,avg_daily_trips
0,1001,2021-04-12 10:59:42+00:00,0.709758,0.692957,402
1,1002,2021-04-12 08:12:10+00:00,0.718295,0.584081,370
2,1003,2021-04-12 16:40:26+00:00,0.697411,0.19768,25


### Historical data for training + New features based on existing data (On demand)

In [47]:
# Same base features as before, extended with the two "driver_efficiency_metrics"
# features (efficiency_index and risk_score).
# NOTE: this rebinds training_df, replacing the frame built by the earlier cell.
extended_feature_refs = [
    "driver_quality_stats:conv_rate",
    "driver_quality_stats:acc_rate",
    "driver_activity_stats:avg_daily_trips",
    "driver_efficiency_metrics:efficiency_index",
    "driver_efficiency_metrics:risk_score",
]

extended_retrieval = store.get_historical_features(
    entity_df=entity_df,
    features=extended_feature_refs,
)
training_df = extended_retrieval.to_df()



In [48]:
# Preview the training frame, which now also carries efficiency_index and risk_score.
training_df.head()

Unnamed: 0,driver_id,event_timestamp,conv_rate,acc_rate,avg_daily_trips,efficiency_index,risk_score
0,1001,2021-04-12 10:59:42+00:00,0.709758,0.692957,402,402.709758,-3.157007
1,1002,2021-04-12 08:12:10+00:00,0.718295,0.584081,370,370.718295,-2.455543
2,1003,2021-04-12 16:40:26+00:00,0.697411,0.19768,25,25.697411,0.355938


In [None]:
# Online feature retrieval
# Fetch the current online-store values of three features for a fixed set of drivers
# and print them as "<column>: <list of values>" pairs.

# Single source of truth for the drivers being queried: both the entity rows and the
# printed header are derived from this list, so the header can no longer name drivers
# that were not requested (it previously said "1001, 1002" while querying 1001 and 1005).
online_driver_ids = [1001, 1005]

online_features = store.get_online_features(
    features=[
        "driver_quality_stats:conv_rate",
        "driver_quality_stats:acc_rate",
        "driver_activity_stats:avg_daily_trips",
    ],
    entity_rows=[{"driver_id": driver_id} for driver_id in online_driver_ids],
).to_dict()

print(f"Online features for drivers {', '.join(map(str, online_driver_ids))}:")
for key, value in online_features.items():
    print(f"{key}: {value}")

In [50]:
# Using Feature Service for consistent feature sets
training_df_v1 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("driver_activity_v2")
).to_df()

print("\nFeatures from driver_activity_v2 service:")
training_df_v1.head()


Features from driver_activity_v2 service:


Unnamed: 0,driver_id,event_timestamp,conv_rate,acc_rate,avg_daily_trips,efficiency_index,risk_score
0,1001,2021-04-12 10:59:42+00:00,0.709758,0.692957,402,402.709758,-3.157007
1,1002,2021-04-12 08:12:10+00:00,0.718295,0.584081,370,370.718295,-2.455543
2,1003,2021-04-12 16:40:26+00:00,0.697411,0.19768,25,25.697411,0.355938


### Feature view 1 metadata

In [40]:
# Get feature view metadata
feature_view = store.get_feature_view("driver_quality_stats")
print("\nFeature view metadata:")
print(f"Name: {feature_view.name}")
print(f"Entities: {feature_view.entities}")
print(f"TTL: {feature_view.ttl}")
print(f"Online: {feature_view.online}")
print(f"Features: {[f.name for f in feature_view.features]}")


Feature view metadata:
Name: driver_quality_stats
Entities: ['driver']
TTL: 1 day, 0:00:00
Online: True
Features: ['conv_rate', 'acc_rate']


### Feature view 2 metadata

In [41]:
# Get feature view metadata
feature_view = store.get_feature_view("driver_activity_stats")
print("\nFeature view metadata:")
print(f"Name: {feature_view.name}")
print(f"Entities: {feature_view.entities}")
print(f"TTL: {feature_view.ttl}")
print(f"Online: {feature_view.online}")
print(f"Features: {[f.name for f in feature_view.features]}")


Feature view metadata:
Name: driver_activity_stats
Entities: ['driver']
TTL: 1 day, 0:00:00
Online: True
Features: ['avg_daily_trips']


### Historical data via the `driver_activity_v3` feature service

In [51]:
# Using Feature Service for consistent feature sets
training_df_v3 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("driver_activity_v3")
).to_df()

print("\nFeatures from driver_activity_v4 service:")
training_df_v3.head()


Features from driver_activity_v4 service:


Unnamed: 0,driver_id,event_timestamp,conv_rate,acc_rate,avg_daily_trips,efficiency_index,risk_score
0,1001,2021-04-12 10:59:42+00:00,0.709758,0.692957,402,402.709758,-3.157007
1,1002,2021-04-12 08:12:10+00:00,0.718295,0.584081,370,370.718295,-2.455543
2,1003,2021-04-12 16:40:26+00:00,0.697411,0.19768,25,25.697411,0.355938
