In [1]:
!pip install feast==0.29.0

Collecting feast==0.29.0
  Downloading feast-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.8/29.8 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pandavro~=1.5.0
  Downloading pandavro-1.5.2.tar.gz (3.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting grpcio-reflection<2,>=1.47.0
  Downloading grpcio_reflection-1.51.1-py3-none-any.whl (11 kB)
Collecting httpx>=0.23.3
  Downloading httpx-0.23.3-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting proto-plus<2,>=1.20.0
  Downloading proto_plus-1.22.2-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting toml<

In [4]:
!pip install scikit-learn



In [5]:
# Sanity check: confirm the Feast CLI is on the PATH and reports the SDK
# version that was installed above.
!feast version

Feast is an open source project that collects anonymized error reporting and usage statistics. To opt out or learn more see https://docs.feast.dev/reference/usage
Feast SDK Version: "feast 0.29.0"


In [6]:
# Scaffold a new Feast repository named `driver_feast_project` in the current
# working directory (in this run it was created at /home/jovyan/driver_feast_project).
# NOTE(review): re-running this cell when the directory already exists may
# conflict with the existing repository -- confirm before a Restart & Run All.
!feast init driver_feast_project


Creating a new Feast repository in [1m[32m/home/jovyan/driver_feast_project[0m.



In [8]:
# Inspect the layout of the freshly created project: first the project root,
# then the feature repository inside it.
!ls /home/jovyan/driver_feast_project
!ls /home/jovyan/driver_feast_project/feature_repo

# Print the store configuration (project name, registry, provider, online store).
!echo "###################check feature_store.yaml###################"
!cat /home/jovyan/driver_feast_project/feature_repo/feature_store.yaml

feature_repo  __init__.py  README.md
data			feature_store.yaml  __pycache__
example_driver_repo.py	__init__.py	    test_workflow.py
###################check feature_store.yaml###################
project: driver_feast_project
# By default, the registry is a file (but can be turned into a more scalable SQL-backed registry)
registry: data/registry.db
# The provider primarily specifies default offline / online stores & storing the registry in a given cloud
provider: local
online_store:
    type: sqlite
    path: data/online_store.db
entity_key_serialization_version: 2


# Inspecting the raw data

In [71]:
import pandas as pd
# Load the sample driver statistics stored in the repo's data directory and
# display the frame as the cell output.
driver_stats_path = "/home/jovyan/driver_feast_project/feature_repo/data/driver_stats.parquet"
pd.read_parquet(driver_stats_path)

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,created,string_feature
0,2021-08-31 18:00:00+00:00,1005,0.362754,0.697629,31,2021-09-15 18:01:55.403,test
1,2021-08-31 19:00:00+00:00,1005,0.790768,0.821253,520,2021-09-15 18:01:55.403,test
2,2021-08-31 20:00:00+00:00,1005,0.416623,0.939601,777,2021-09-15 18:01:55.403,test
3,2021-08-31 21:00:00+00:00,1005,0.108623,0.818718,750,2021-09-15 18:01:55.403,test
4,2021-08-31 22:00:00+00:00,1005,0.244842,0.511777,954,2021-09-15 18:01:55.403,test
...,...,...,...,...,...,...,...
1802,2021-09-15 16:00:00+00:00,1001,0.681178,0.449826,906,2021-09-15 18:01:55.403,test
1803,2021-09-15 17:00:00+00:00,1001,0.812357,0.840873,714,2021-09-15 18:01:55.403,test
1804,2021-04-12 07:00:00+00:00,1001,0.701558,0.195824,566,2021-09-15 18:01:55.403,test
1805,2021-09-08 06:00:00+00:00,1003,0.084715,0.615489,523,2021-09-15 18:01:55.403,test


# Apply and deploy feature definitions

In [63]:
# Run `feast apply` from inside the feature repository directory. In this run
# it created the `driver` entity, the `driver_hourly_stats` feature view and
# the matching SQLite table (see the output below).
!cd /home/jovyan/driver_feast_project/feature_repo && feast apply

  schema = ParquetDataset(path).schema.to_arrow_schema()
Created entity [1m[32mdriver[0m
Created feature view [1m[32mdriver_hourly_stats[0m

Created sqlite table [1m[32mdriver_feast_project_driver_hourly_stats[0m



In [64]:
# List the project again after `feast apply`. The data/ directory is listed as
# well so that the registry.db and online_store.db files configured in
# feature_store.yaml can be seen next to driver_stats.parquet.
!ls /home/jovyan/driver_feast_project
!ls /home/jovyan/driver_feast_project/feature_repo
!ls /home/jovyan/driver_feast_project/feature_repo/data

feature_repo  __init__.py  README.md
data			feature_store.yaml  __pycache__
example_driver_repo.py	__init__.py	    test_workflow.py
driver_stats.parquet  online_store.db  registry.db


# Generating training data

In [68]:
import feast
from joblib import dump
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LinearRegression

# Load the driver order data and parse its timestamps.
# `orders` is only printed for reference: when it was passed as `entity_df`
# the resulting training frame had 0 entries, so a hand-built `entity_df`
# is used below instead.
# NOTE(review): presumably the order timestamps (2021-04-16 .. 2021-04-19)
# have no feature rows recent enough before them -- confirm against the
# feature view definition in example_driver_repo.py.
orders = pd.read_csv("driver_orders.csv", sep="\t")
orders["event_timestamp"] = pd.to_datetime(orders["event_timestamp"])
print(orders)

# Entity rows to build training data for: one row per (driver, point in time).
entity_df = pd.DataFrame.from_dict(
    {
        # entity's join key -> entity values
        "driver_id": [1001, 1002, 1003, 1001, 1002],
        # "event_timestamp" (reserved key) -> timestamps
        # NOTE(review): these datetimes are timezone-naive; presumably Feast
        # treats them as UTC -- confirm.
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
            datetime(2021, 4, 12, 8, 12, 10),
            datetime(2021, 4, 12, 16, 40, 26),
            datetime(2021, 4, 12, 4, 29, 28),
            datetime(2021, 4, 12, 15, 40, 26),
        ],
        # (optional) label name -> label values. Feast does not process these
        "trip_completed": [1, 0, 0, 1, 0],
    }
)

# Connect to your feature store provider
fs = feast.FeatureStore(repo_path="/home/jovyan/driver_feast_project/feature_repo")

# Retrieve historical feature values for the rows of `entity_df`
# (pass `entity_df=orders` instead to use the rows from driver_orders.csv).
training_df = fs.get_historical_features(
    entity_df=entity_df,
    features=[
        "driver_hourly_stats:conv_rate",
        "driver_hourly_stats:acc_rate",
        "driver_hourly_stats:avg_daily_trips",
    ],
).to_df()

print("----- Feature schema -----\n")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
training_df.info()

print()
print("----- Example features -----\n")
print(training_df.head())

            event_timestamp  driver_id  trip_completed
0 2021-04-16 20:29:28+00:00       1001               1
1 2021-04-17 04:29:28+00:00       1002               0
2 2021-04-17 12:29:28+00:00       1003               0
3 2021-04-17 20:29:28+00:00       1001               1
4 2021-04-18 04:29:28+00:00       1002               0
5 2021-04-18 12:29:28+00:00       1003               0
6 2021-04-18 20:29:28+00:00       1001               1
7 2021-04-19 04:29:28+00:00       1002               0
8 2021-04-19 12:29:28+00:00       1003               0
9 2021-04-19 20:29:28+00:00       1004               1
----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   driver_id        4 non-null      int64              
 1   event_timestamp  4 non-null      datetime64[ns, UTC]
 2   trip_completed   4 non-null   

# Training the model

In [39]:
# Fit a linear regression that predicts the label from the retrieved features.
target = "trip_completed"

# Inputs are every column except the label and the event timestamp.
train_X = training_df.drop(columns=[target, "event_timestamp"])
train_Y = training_df[target]

# Feed the columns in sorted-name order so the fit does not depend on the
# column order of `training_df`.
feature_order = sorted(train_X.columns)
reg = LinearRegression()
reg.fit(train_X[feature_order], train_Y)

# Persist the fitted model; dump() returns the list of written file names,
# which is shown as the cell output.
dump(reg, "driver_model.bin")

['driver_model.bin']