In [1]:
import os
from datetime import datetime

import pandas as pd
from feast import FeatureStore

In [2]:
raw_data_path = os.path.join("feature_store", "feature_repo", "data", "driver_stats.parquet")
feature_store_path = os.path.join("feature_store", "feature_repo")

### Check data

In [3]:
df = pd.read_parquet(raw_data_path)

In [4]:
df.head(5)

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2025-12-16 12:56:54.821881+00:00,1001,0.95,0.9,60,2025-12-16 12:56:54.821890
1,2024-10-17 12:07:08.228578+00:00,1001,1.0,1.0,1000,2024-10-17 12:07:08.228581
2,2024-10-02 11:00:00+00:00,1005,0.429879,0.194598,582,2024-10-17 11:30:07.072000
3,2024-10-02 12:00:00+00:00,1005,0.230119,0.642878,551,2024-10-17 11:30:07.072000
4,2024-10-02 13:00:00+00:00,1005,0.1286,0.674187,38,2024-10-17 11:30:07.072000


### Features inference

In [5]:
entity_df = pd.DataFrame.from_dict(
    {
        # entity's join key -> entity values
        "driver_id": [1001, 1002, 1003],
        # "event_timestamp" (reserved key) -> timestamps
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
            datetime(2021, 4, 12, 8, 12, 10),
            datetime(2021, 4, 12, 16, 40, 26),
        ],
        # (optional) label name -> label values. Feast does not process these
        "label_driver_reported_satisfaction": [1, 5, 3],
    }
)

In [6]:
entity_df

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction
0,1001,2021-04-12 10:59:42,1
1,1002,2021-04-12 08:12:10,5
2,1003,2021-04-12 16:40:26,3


In [7]:
from feast import FeatureStore
store = FeatureStore(repo_path=feature_store_path)


In [8]:
# Ensure feature definitions are registered in the Feast registry
from feature_store.feature_repo import (
    driver_stats_fv,
    driver_stats_fresh_fv,
    transformed_conv_rate,
    transformed_conv_rate_fresh,
    driver_performance_metrics,
    driver_activity_v1,
    driver_activity_v2,
    driver_activity_v3,
    driver_activity_v4,
)

store.apply([
    driver_stats_fv,
    driver_stats_fresh_fv,
    transformed_conv_rate,
    transformed_conv_rate_fresh,
    driver_performance_metrics,
    driver_activity_v1,
    driver_activity_v2,
    driver_activity_v3,
    driver_activity_v4,
])

print("✅ Feature definitions applied to registry (if not present)")



✅ Feature definitions applied to registry (if not present)


In [9]:
# Ensure Entity `driver` is registered and verify
from feature_store.feature_repo import driver

# Apply the entity definition to the registry
store.apply([driver])
print("✅ Entity 'driver' applied to registry (if not present)")

# Verify entity exists
try:
    ent = store.get_entity("driver")
    print(f"Entity found: {ent.name}, join_keys={ent.join_keys}")
except Exception as e:
    print("Error when retrieving entity 'driver':", e)

✅ Entity 'driver' applied to registry (if not present)
Error when retrieving entity 'driver': 'Entity' object has no attribute 'join_keys'


In [10]:


training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
print(training_df.info())



----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   driver_id                           3 non-null      int64              
 1   event_timestamp                     3 non-null      datetime64[ns, UTC]
 2   label_driver_reported_satisfaction  3 non-null      int64              
 3   conv_rate                           3 non-null      float32            
 4   acc_rate                            3 non-null      float32            
 5   avg_daily_trips                     3 non-null      int32              
dtypes: datetime64[ns, UTC](1), float32(2), int32(1), int64(2)
memory usage: 240.0 bytes
None


In [11]:
training_df.head()

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,conv_rate,acc_rate,avg_daily_trips
0,1001,2021-04-12 10:59:42+00:00,1,0.709758,0.692957,402
1,1002,2021-04-12 08:12:10+00:00,5,0.718295,0.584081,370
2,1003,2021-04-12 16:40:26+00:00,3,0.697411,0.19768,25


### Features View - on demand

In [12]:
entity_df = pd.DataFrame.from_dict(
    {
        # entity's join key -> entity values
        "driver_id": [1001, 1002, 1003],
        # "event_timestamp" (reserved key) -> timestamps
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
            datetime(2021, 4, 12, 8, 12, 10),
            datetime(2021, 4, 12, 16, 40, 26),
        ],
        # (optional) label name -> label values. Feast does not process these
        "label_driver_reported_satisfaction": [1, 5, 3],
        # values we're using for an on-demand transformation
        "val_to_add": [1, 2, 3],
        "val_to_add_2": [10, 20, 30],
    }
)

In [13]:
entity_df.head(5)

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2
0,1001,2021-04-12 10:59:42,1,1,10
1,1002,2021-04-12 08:12:10,5,2,20
2,1003,2021-04-12 16:40:26,3,3,30


In [14]:
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
        "transformed_conv_rate:conv_rate_plus_val1",
        "transformed_conv_rate:conv_rate_plus_val2",
    ],
).to_df()



In [15]:
training_df

Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,conv_rate,acc_rate,avg_daily_trips,conv_rate_plus_val1,conv_rate_plus_val2
0,1001,2021-04-12 10:59:42+00:00,1,1,10,0.709758,0.692957,402,1.709758,10.709758
1,1002,2021-04-12 08:12:10+00:00,5,2,20,0.718295,0.584081,370,2.718295,20.718295
2,1003,2021-04-12 16:40:26+00:00,3,3,30,0.697411,0.19768,25,3.697411,30.697411


In [16]:
# Online feature retrieval
online_features = store.get_online_features(
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
    entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
).to_dict()

print("Online features for drivers 1001, 1002:")
for key, value in online_features.items():
    print(f"{key}: {value}")



Online features for drivers 1001, 1002:
driver_id: [1001, 1002]
acc_rate: [None, None]
conv_rate: [None, None]
avg_daily_trips: [None, None]


In [17]:
# Using Feature Service for consistent feature sets
training_df_v1 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("driver_activity_v1")
).to_df()

print("\nFeatures from driver_activity_v1 service:")
training_df_v1.head()




Features from driver_activity_v1 service:


Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,conv_rate,conv_rate_plus_val1,conv_rate_plus_val2
0,1001,2021-04-12 10:59:42+00:00,1,1,10,0.709758,1.709758,10.709758
1,1002,2021-04-12 08:12:10+00:00,5,2,20,0.718295,2.718295,20.718295
2,1003,2021-04-12 16:40:26+00:00,3,3,30,0.697411,3.697411,30.697411


In [18]:
# Get feature view metadata
feature_view = store.get_feature_view("driver_hourly_stats")
print("\nFeature view metadata:")
print(f"Name: {feature_view.name}")
print(f"Entities: {feature_view.entities}")
print(f"TTL: {feature_view.ttl}")
print(f"Online: {feature_view.online}")
print(f"Features: {[f.name for f in feature_view.features]}")


Feature view metadata:
Name: driver_hourly_stats
Entities: ['driver']
TTL: 1 day, 0:00:00
Online: True
Features: ['conv_rate', 'acc_rate', 'avg_daily_trips']


In [19]:
# Using Feature Service for consistent feature sets
training_df_v4 = store.get_historical_features(
    entity_df=entity_df,
    features=store.get_feature_service("driver_activity_v4")
).to_df()

print("\nFeatures from driver_activity_v4 service:")
training_df_v4.head()




Features from driver_activity_v4 service:


Unnamed: 0,driver_id,event_timestamp,label_driver_reported_satisfaction,val_to_add,val_to_add_2,conv_rate,acc_rate,avg_daily_trips,combined_rating,performance_score
0,1001,2021-04-12 10:59:42+00:00,1,1,10,0.709758,0.692957,402,0.703038,4.217479
1,1002,2021-04-12 08:12:10+00:00,5,2,20,0.718295,0.584081,370,0.66461,3.931966
2,1003,2021-04-12 16:40:26+00:00,3,3,30,0.697411,0.19768,25,0.497519,1.620965


In [20]:
# ## ✨ Добавление новых Feature Views и On-Demand Feature View
# Здесь мы определим:
# - 2 новых Feature View: `driver_conversion_metrics` и `driver_safety_metrics`
# - 1 on-demand Feature View `real_time_driver_scores`, который использует значения из RequestSource
# Затем применим их к `FeatureStore` и продемонстрируем запросы для исторических и онлайн данных.

from feast import Entity, FeatureView, Field, FileSource, RequestSource, FeatureService
from feast.on_demand_feature_view import on_demand_feature_view
from datetime import timedelta
from feast.types import Float32, Int64, Float64

# Entity (повторное определение в ноутбуке)
driver_entity = Entity(name="driver", join_keys=["driver_id"])

# Используем тот же parquet-файл, который уже используется в ноутбуке
driver_source = FileSource(
    name="driver_stats_source_nb",
    path=raw_data_path,
    timestamp_field="event_timestamp",
    created_timestamp_column="created",
)

# 1) Feature View: Метрики конверсии и среднего числа поездок
driver_conversion_metrics = FeatureView(
    name="driver_conversion_metrics",
    entities=[driver_entity],
    ttl=timedelta(days=1),
    schema=[
        Field(name="conv_rate", dtype=Float32),
        Field(name="avg_daily_trips", dtype=Int64),
    ],
    online=True,
    source=driver_source,
    tags={"team": "conversion"},
)

# 2) Feature View: Метрики безопасности
driver_safety_metrics = FeatureView(
    name="driver_safety_metrics",
    entities=[driver_entity],
    ttl=timedelta(days=1),
    schema=[
        Field(name="acc_rate", dtype=Float32),
    ],
    online=True,
    source=driver_source,
    tags={"team": "safety"},
)

# Request source для on-demand фич
realtime_request = RequestSource(
    name="realtime_inputs",
    schema=[
        Field(name="val_to_add", dtype=Int64),
        Field(name="val_to_add_2", dtype=Int64),
    ],
)

# 3) On-demand Feature View: вычисляет в реальном времени показатели риска и ожидаемой конверсии
@on_demand_feature_view(
    sources=[driver_conversion_metrics, driver_safety_metrics, realtime_request],
    schema=[
        Field(name="safety_risk_score", dtype=Float64),
        Field(name="expected_conversion_pct", dtype=Float64),
    ],
)
def real_time_driver_scores(inputs: pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame()
    # простая формула: риск пропорционален acc_rate и дополнению val_to_add
    df["safety_risk_score"] = inputs["acc_rate"] * (1 + inputs["val_to_add"] / 10.0)
    # ожидаемая конверсия = conv_rate + небольшой процент от val_to_add_2
    df["expected_conversion_pct"] = inputs["conv_rate"] + inputs["val_to_add_2"] / 100.0
    return df

# Применяем определения к FeatureStore
fs = store
fs.apply([driver_entity, driver_conversion_metrics, driver_safety_metrics, realtime_request, real_time_driver_scores])

print("✅ Новые Feature Views и on-demand Feature View применены к FeatureStore")

✅ Новые Feature Views и on-demand Feature View применены к FeatureStore




In [21]:
# ### Демонстрация: Historical retrieval (для обучения)
entity_df_rt = pd.DataFrame.from_dict(
    {
        "driver_id": [1001, 1002, 1003],
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
            datetime(2021, 4, 12, 8, 12, 10),
            datetime(2021, 4, 12, 16, 40, 26),
        ],
        "val_to_add": [1, 2, 3],
        "val_to_add_2": [10, 20, 30],
    }
)

hist_df = fs.get_historical_features(
    entity_df=entity_df_rt,
    features=[
        "driver_conversion_metrics:conv_rate",
        "driver_conversion_metrics:avg_daily_trips",
        "driver_safety_metrics:acc_rate",
        "real_time_driver_scores:safety_risk_score",
        "real_time_driver_scores:expected_conversion_pct",
    ],
).to_df()

print("--- Historical features (training) ---")
hist_df.head()



--- Historical features (training) ---


Unnamed: 0,driver_id,event_timestamp,val_to_add,val_to_add_2,conv_rate,avg_daily_trips,acc_rate,safety_risk_score,expected_conversion_pct
0,1001,2021-04-12 10:59:42+00:00,1,10,0.709758,402,0.692957,0.762253,0.809758
1,1002,2021-04-12 08:12:10+00:00,2,20,0.718295,370,0.584081,0.700898,0.918295
2,1003,2021-04-12 16:40:26+00:00,3,30,0.697411,25,0.19768,0.256985,0.997411


In [22]:
# Materialize latest features into online store and (optionally) push a sample event
from datetime import datetime
import pandas as pd
from feast.data_source import PushMode

print("--- Materialize features into online store ---")
# Materialize all feature views up to now (loads latest values from offline sources into online store)
store.materialize_incremental(end_date=datetime.now())
print("✅ Materialization complete")

# Optional: simulate a streaming event and push it to the online store
print("--- Push a sample event into online store (simulated stream) ---")
event_df = pd.DataFrame.from_dict(
    {
        "driver_id": [1001],
        "event_timestamp": [datetime.now()],
        "created": [datetime.now()],
        "conv_rate": [0.95],
        "acc_rate": [0.9],
        "avg_daily_trips": [60],
    }
)
store.push("driver_stats_push_source", event_df, to=PushMode.ONLINE_AND_OFFLINE)
print("✅ Sample event pushed to online store")



--- Materialize features into online store ---
Materializing [1m[32m4[0m feature views to [1m[32m2025-12-16 12:58:04+03:00[0m into the [1m[32msqlite[0m online store.

[1m[32mdriver_conversion_metrics[0m from [1m[32m2025-12-16 15:56:54+03:00[0m to [1m[32m2025-12-16 12:58:04+03:00[0m:


100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 33.89it/s]


[1m[32mdriver_safety_metrics[0m from [1m[32m2025-12-16 15:56:54+03:00[0m to [1m[32m2025-12-16 15:58:04+03:00[0m:


100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 70.31it/s]


[1m[32mdriver_hourly_stats[0m from [1m[32m2025-12-16 15:56:54+03:00[0m to [1m[32m2025-12-16 15:58:04+03:00[0m:


100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.37it/s]

[1m[32mdriver_hourly_stats_fresh[0m from [1m[32m2025-12-16 15:56:54+03:00[0m to [1m[32m2025-12-16 15:58:04+03:00[0m:



100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 351.78it/s]


✅ Materialization complete
--- Push a sample event into online store (simulated stream) ---
✅ Sample event pushed to online store


In [23]:
# Diagnostic: check which online feature(s) return empty values
from pprint import pprint

tests = [
    ("Base FV: driver_hourly_stats", ["driver_hourly_stats:conv_rate","driver_hourly_stats:acc_rate"], [{"driver_id":1001},{"driver_id":1002}]),
    ("Conversion FV", ["driver_conversion_metrics:conv_rate"], [{"driver_id":1001},{"driver_id":1002}]),
    ("Safety FV", ["driver_safety_metrics:acc_rate"], [{"driver_id":1001},{"driver_id":1002}]),
    ("On-demand + conv", ["real_time_driver_scores:safety_risk_score","real_time_driver_scores:expected_conversion_pct","driver_conversion_metrics:conv_rate"], [{"driver_id":1001,"val_to_add":2,"val_to_add_2":15},{"driver_id":1002,"val_to_add":0,"val_to_add_2":5}]),
]

for label, feats, rows in tests:
    print("---", label, "---")
    try:
        res = fs.get_online_features(features=feats, entity_rows=rows).to_dict()
        pprint(res)
    except Exception as e:
        print("Error:", type(e).__name__, str(e))

# Also print a simple call for a single on-demand feature to get clearer error
print('\n--- Single on-demand check ---')
try:
    res = fs.get_online_features(features=["real_time_driver_scores:safety_risk_score"], entity_rows=[{"driver_id":1001,"val_to_add":2,"val_to_add_2":15}]).to_dict()
    pprint(res)
except Exception as e:
    print("Error single on-demand:", type(e).__name__, str(e))



--- Base FV: driver_hourly_stats ---
{'acc_rate': [0.8999999761581421, None],
 'conv_rate': [0.949999988079071, None],
 'driver_id': [1001, 1002]}
--- Conversion FV ---
{'conv_rate': [0.949999988079071, None], 'driver_id': [1001, 1002]}
--- Safety FV ---
{'acc_rate': [0.8999999761581421, None], 'driver_id': [1001, 1002]}
--- On-demand + conv ---
{'conv_rate': [0.949999988079071, None],
 'driver_id': [1001, 1002],
 'expected_conversion_pct': [1.099999988079071, None],
 'safety_risk_score': [1.0799999713897706, None]}

--- Single on-demand check ---
{'driver_id': [1001], 'safety_risk_score': [1.0799999713897706]}


In [24]:
# ### Демонстрация: Online retrieval (для инференса)
online_results = fs.get_online_features(
    features=[
        "real_time_driver_scores:safety_risk_score",
        "real_time_driver_scores:expected_conversion_pct",
        "driver_conversion_metrics:conv_rate",
    ],
    entity_rows=[
        {"driver_id": 1001, "val_to_add": 2, "val_to_add_2": 15},
        {"driver_id": 1002, "val_to_add": 0, "val_to_add_2": 5},
    ],
).to_dict()

print("--- Online features ---")
for k, v in online_results.items():
    print(f"{k}: {v}")



--- Online features ---
driver_id: [1001, 1002]
conv_rate: [0.949999988079071, None]
safety_risk_score: [1.0799999713897706, None]
expected_conversion_pct: [1.099999988079071, None]


In [25]:
# ### FeatureService: группировка фич для одной версии модели
from feast import FeatureService

driver_activity_realtime = FeatureService(
    name="driver_activity_realtime",
    features=[driver_conversion_metrics, driver_safety_metrics, real_time_driver_scores],
)
fs.apply([driver_activity_realtime])

hist_service_df = fs.get_historical_features(
    entity_df=entity_df_rt,
    features=fs.get_feature_service("driver_activity_realtime"),
).to_df()

print("--- Исторические признаки из FeatureService 'driver_activity_realtime' ---")
hist_service_df.head()



--- Исторические признаки из FeatureService 'driver_activity_realtime' ---


Unnamed: 0,driver_id,event_timestamp,val_to_add,val_to_add_2,conv_rate,avg_daily_trips,acc_rate,safety_risk_score,expected_conversion_pct
0,1001,2021-04-12 10:59:42+00:00,1,10,0.709758,402,0.692957,0.762253,0.809758
1,1002,2021-04-12 08:12:10+00:00,2,20,0.718295,370,0.584081,0.700898,0.918295
2,1003,2021-04-12 16:40:26+00:00,3,30,0.697411,25,0.19768,0.256985,0.997411
