In [1]:
# Importing dependencies
from sklearn import datasets
import pandas as pd

# Loading a toy dataset into a DataFrame
data = datasets.load_breast_cancer()
data_df = pd.DataFrame(data=data.data, columns=data.feature_names)

In [10]:
# Splitting the dataset into arbitrary sets of features
data_df1 = data_df[data.feature_names[:5]]
data_df2 = data_df[data.feature_names[5:10]]
data_df3 = data_df[data.feature_names[10:17]]
data_df4 = data_df[data.feature_names[17:30]]
target_df = pd.DataFrame(data=data.target, columns=["target"])

In [11]:
# Creating timestamps for the data
timestamps = pd.date_range(
    end=pd.Timestamp.now(), 
    periods=len(data_df), 
    freq='D').to_frame(name="event_timestamp", index=False)

In [12]:
# Adding the timestamp column to each DataFrame
data_df1 = pd.concat(objs=[data_df1, timestamps], axis=1)
data_df2 = pd.concat(objs=[data_df2, timestamps], axis=1)
data_df3 = pd.concat(objs=[data_df3, timestamps], axis=1)
data_df4 = pd.concat(objs=[data_df4, timestamps], axis=1)
target_df = pd.concat(objs=[target_df, timestamps], axis=1)

In [13]:
# Creating a list of arbitrary IDs for feature rows
patient_ids = pd.DataFrame(data=list(range(len(data_df))), columns=["patient_id"])

# Adding the timestamp column to each DataFrame
data_df1 = pd.concat(objs=[data_df1, patient_ids], axis=1)
data_df2 = pd.concat(objs=[data_df2, patient_ids], axis=1)
data_df3 = pd.concat(objs=[data_df3, patient_ids], axis=1)
data_df4 = pd.concat(objs=[data_df4, patient_ids], axis=1)
target_df = pd.concat(objs=[target_df, patient_ids], axis=1)

In [15]:
# Inspecting the feature DataFrames
data_df1.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,event_timestamp,patient_id
564,21.56,22.39,142.0,1479.0,0.111,2024-10-26 00:07:16.430652,564
565,20.13,28.25,131.2,1261.0,0.0978,2024-10-27 00:07:16.430652,565
566,16.6,28.08,108.3,858.1,0.08455,2024-10-28 00:07:16.430652,566
567,20.6,29.33,140.1,1265.0,0.1178,2024-10-29 00:07:16.430652,567
568,7.76,24.54,47.92,181.0,0.05263,2024-10-30 00:07:16.430652,568


In [23]:
data_df2.head()

Unnamed: 0,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,event_timestamp,patient_id
0,0.2776,0.3001,0.1471,0.2419,0.07871,2023-04-11 00:07:16.430652,0
1,0.07864,0.0869,0.07017,0.1812,0.05667,2023-04-12 00:07:16.430652,1
2,0.1599,0.1974,0.1279,0.2069,0.05999,2023-04-13 00:07:16.430652,2
3,0.2839,0.2414,0.1052,0.2597,0.09744,2023-04-14 00:07:16.430652,3
4,0.1328,0.198,0.1043,0.1809,0.05883,2023-04-15 00:07:16.430652,4


In [24]:
data_df3.head()

Unnamed: 0,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,event_timestamp,patient_id
0,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,2023-04-11 00:07:16.430652,0
1,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,2023-04-12 00:07:16.430652,1
2,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,2023-04-13 00:07:16.430652,2
3,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,2023-04-14 00:07:16.430652,3
4,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,2023-04-15 00:07:16.430652,4


In [25]:
data_df4.head()

Unnamed: 0,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,event_timestamp,patient_id
0,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,2023-04-11 00:07:16.430652,0
1,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,2023-04-12 00:07:16.430652,1
2,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,2023-04-13 00:07:16.430652,2
3,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2023-04-14 00:07:16.430652,3
4,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,2023-04-15 00:07:16.430652,4


In [26]:
# Inspecting the target DataFrame
target_df.head()

Unnamed: 0,target,event_timestamp,patient_id
0,0,2023-04-11 00:07:16.430652,0
1,0,2023-04-12 00:07:16.430652,1
2,0,2023-04-13 00:07:16.430652,2
3,0,2023-04-14 00:07:16.430652,3
4,0,2023-04-15 00:07:16.430652,4


In [27]:
# Writing our DataFrames to parquet files
data_df1.to_parquet(path='data_df1.parquet')
data_df2.to_parquet(path='data_df2.parquet')
data_df3.to_parquet(path='data_df3.parquet')
data_df4.to_parquet(path='data_df4.parquet')
target_df.to_parquet(path='target_df.parquet')

In [12]:
data_df1.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,event_timestamp,patient_id
564,21.56,22.39,142.0,1479.0,0.111,2022-03-08 13:01:45.471451,564
565,20.13,28.25,131.2,1261.0,0.0978,2022-03-09 13:01:45.471451,565
566,16.6,28.08,108.3,858.1,0.08455,2022-03-10 13:01:45.471451,566
567,20.6,29.33,140.1,1265.0,0.1178,2022-03-11 13:01:45.471451,567
568,7.76,24.54,47.92,181.0,0.05263,2022-03-12 13:01:45.471451,568


In [1]:
# Importing dependencies
import pandas as pd
from feast import FeatureStore
from feast.infra.offline_stores.file_source import SavedDatasetFileStorage
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
from datetime import datetime, timedelta
os.chdir(os.getcwd() + "/breast_cancer")

In [4]:
# Getting our FeatureStore
store = FeatureStore(repo_path=".")
store

FeatureStore(
    repo_path=WindowsPath('.'),
    config=RepoConfig(project='breast_cancer', provider='local', registry_config='data/registry.db', online_config={'path': 'data/online_store.db', 'type': 'sqlite'}, auth={'type': 'no_auth'}, offline_config='dask', batch_engine_config='local', feature_server=None, flags=None, repo_path=WindowsPath('.'), entity_key_serialization_version=2, coerce_tz_aware=True),
    registry=<feast.infra.registry.registry.Registry object at 0x000002765B48A370>,
    provider=<feast.infra.passthrough_provider.PassthroughProvider object at 0x000002765B4B68E0>
)

In [6]:
entities = store.list_entities()
for entity in entities:
    print(entity)

{
  "spec": {
    "name": "patient_id",
    "joinKey": "patient_id"
  },
  "meta": {
    "createdTimestamp": "2024-11-02T10:45:49.879993Z",
    "lastUpdatedTimestamp": "2024-11-02T10:45:49.879993Z"
  }
}


In [36]:
store.list_all_feature_views()



[<FeatureView(name = df1_Field_view, entities = ['patient_id'], ttl = 3 days, 0:00:00, stream_source = None, batch_source = {
   "type": "BATCH_FILE",
   "timestampField": "event_timestamp",
   "fileOptions": {
     "uri": "data/data_df1.parquet"
   },
   "name": "data/data_df1.parquet"
 }, entity_columns = [patient_id-Int64], features = [mean radius-Float32, mean texture-Float32, mean perimeter-Float32, mean area-Float32, mean smoothness-Float32], description = , tags = {}, owner = , projection = FeatureViewProjection(name='df1_Field_view', name_alias=None, desired_features=[], features=[mean radius-Float32, mean texture-Float32, mean perimeter-Float32, mean area-Float32, mean smoothness-Float32], join_key_map={}), created_timestamp = 2024-11-02 10:45:49.879993, last_updated_timestamp = 2024-11-02 10:53:52.627083, online = True, materialization_intervals = [(datetime.datetime(2024, 11, 1, 16, 22, 37, 270753, tzinfo=<UTC>), datetime.datetime(2024, 11, 2, 16, 22, 37, 270753, tzinfo=<UTC

In [37]:
# Reading our targets as an entity DataFrame
entity_df = pd.read_parquet(path="data/target_df.parquet")    
entity_df

Unnamed: 0,target,event_timestamp,patient_id
0,0,2023-04-11 00:07:16.430652,0
1,0,2023-04-12 00:07:16.430652,1
2,0,2023-04-13 00:07:16.430652,2
3,0,2023-04-14 00:07:16.430652,3
4,0,2023-04-15 00:07:16.430652,4
...,...,...,...
564,0,2024-10-26 00:07:16.430652,564
565,0,2024-10-27 00:07:16.430652,565
566,0,2024-10-28 00:07:16.430652,566
567,0,2024-10-29 00:07:16.430652,567


In [38]:
# Getting the indicated historical features
# and joining them with our entity DataFrame
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "df1_Field_view:mean radius",
        "df1_Field_view:mean texture",
        "df1_Field_view:mean perimeter",
        "df1_Field_view:mean area",
        "df1_Field_view:mean smoothness",
        "df2_Field_view:mean compactness",
        "df2_Field_view:mean concavity",
        "df2_Field_view:mean concave points",
        "df2_Field_view:mean symmetry",
        "df2_Field_view:mean fractal dimension",
        "df3_Field_view:radius error",
        "df3_Field_view:texture error",
        "df3_Field_view:perimeter error",
        "df3_Field_view:area error",
        "df3_Field_view:smoothness error",
        "df3_Field_view:compactness error",
        "df3_Field_view:concavity error",
        "df4_Field_view:concave points error",
        "df4_Field_view:symmetry error",
        "df4_Field_view:fractal dimension error",
        "df4_Field_view:worst radius",
        "df4_Field_view:worst texture",
        "df4_Field_view:worst perimeter",
        "df4_Field_view:worst area",
        "df4_Field_view:worst smoothness",
        "df4_Field_view:worst compactness",
        "df4_Field_view:worst concavity",
        "df4_Field_view:worst concave points",
        "df4_Field_view:worst symmetry",
        "df4_Field_view:worst fractal dimension"
    ]
)




In [39]:
feature_data = training_data.to_df()
feature_data

Unnamed: 0,target,event_timestamp,patient_id,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0,2023-04-11 00:07:16.430652+00:00,0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,0,2023-04-12 00:07:16.430652+00:00,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,0,2023-04-13 00:07:16.430652+00:00,2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,0,2023-04-14 00:07:16.430652+00:00,3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,0,2023-04-15 00:07:16.430652+00:00,4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0,2024-10-26 00:07:16.430652+00:00,564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,0,2024-10-27 00:07:16.430652+00:00,565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,0,2024-10-28 00:07:16.430652+00:00,566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,0,2024-10-29 00:07:16.430652+00:00,567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [41]:
# Storing the dataset as a local file
dataset = store.create_saved_dataset(
    from_=training_data,
    name="breast_cancer_dataset",
    storage=SavedDatasetFileStorage("data/breast_cancer_dataset.parquet")
)

In [42]:
# Retrieving the saved dataset and converting it to a DataFrame
training_df = store.get_saved_dataset(name="breast_cancer_dataset").to_df()



In [43]:
# Separating the features and labels
labels = training_df['target']
features = training_df.drop(
    labels=['target', 'event_timestamp', "patient_id"], 
    axis=1)

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    labels, 
                                                    stratify=labels)

In [44]:
# Creating and training LogisticRegression
reg = LogisticRegression()
reg.fit(X=X_train[sorted(X_train)], y=y_train)

# Saving the model
dump(value=reg, filename="model.joblib")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['model.joblib']

In [46]:
#Code for loading features to online store between two dates
store.materialize(
    end_date=datetime.now(),
    start_date=datetime.now() - timedelta(days=10))



Materializing [1m[32m5[0m feature views from [1m[32m2024-10-24 14:18:50+05:30[0m to [1m[32m2024-11-03 14:18:50+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mdf1_Field_view[0m:


  conn.execute(
100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 208.23it/s]


[1m[32mdf2_Field_view[0m:


  conn.execute(
100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 375.04it/s]


[1m[32mdf3_Field_view[0m:


  conn.execute(
100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 352.88it/s]


[1m[32mtarget_Field_view[0m:


  conn.execute(
100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 374.94it/s]


[1m[32mdf4_Field_view[0m:


  conn.execute(
100%|████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 315.83it/s]


In [73]:
# Loading the latest features after a previous materialize call or from the beginning of time
store.materialize_incremental(end_date=datetime.now())



Materializing [1m[32m5[0m feature views to [1m[32m2024-11-03 15:33:51+05:30[0m into the [1m[32msqlite[0m online store.

[1m[32mdf1_Field_view[0m from [1m[32m2024-11-03 19:58:42+05:30[0m to [1m[32m2024-11-03 15:33:51+05:30[0m:


0it [00:00, ?it/s]


[1m[32mdf2_Field_view[0m from [1m[32m2024-11-03 19:58:42+05:30[0m to [1m[32m2024-11-03 21:03:51+05:30[0m:


0it [00:00, ?it/s]


[1m[32mdf3_Field_view[0m from [1m[32m2024-11-03 19:58:42+05:30[0m to [1m[32m2024-11-03 21:03:51+05:30[0m:


0it [00:00, ?it/s]


[1m[32mtarget_Field_view[0m from [1m[32m2024-11-03 19:58:42+05:30[0m to [1m[32m2024-11-03 21:03:51+05:30[0m:


0it [00:00, ?it/s]


[1m[32mdf4_Field_view[0m from [1m[32m2024-11-03 19:58:42+05:30[0m to [1m[32m2024-11-03 21:03:51+05:30[0m:


0it [00:00, ?it/s]


In [47]:
# Defining our features names
feast_features = [
        "df1_Field_view:mean radius",
        "df1_Field_view:mean texture",
        "df1_Field_view:mean perimeter",
        "df1_Field_view:mean area",
        "df1_Field_view:mean smoothness",
        "df2_Field_view:mean compactness",
        "df2_Field_view:mean concavity",
        "df2_Field_view:mean concave points",
        "df2_Field_view:mean symmetry",
        "df2_Field_view:mean fractal dimension",
        "df3_Field_view:radius error",
        "df3_Field_view:texture error",
        "df3_Field_view:perimeter error",
        "df3_Field_view:area error",
        "df3_Field_view:smoothness error",
        "df3_Field_view:compactness error",
        "df3_Field_view:concavity error",
        "df4_Field_view:concave points error",
        "df4_Field_view:symmetry error",
        "df4_Field_view:fractal dimension error",
        "df4_Field_view:worst radius",
        "df4_Field_view:worst texture",
        "df4_Field_view:worst perimeter",
        "df4_Field_view:worst area",
        "df4_Field_view:worst smoothness",
        "df4_Field_view:worst compactness",
        "df4_Field_view:worst concavity",
        "df4_Field_view:worst concave points",
        "df4_Field_view:worst symmetry",
        "df4_Field_view:worst fractal dimension"
    ]

# Getting the latest features
features = store.get_online_features(
    features=feast_features,    
    entity_rows=[{"patient_id": 568}, {"patient_id": 567}]
).to_dict()


  rows = cur.fetchall()


In [48]:
features

{'patient_id': [568, 567],
 'mean smoothness': [0.052629999816417694, 0.11779999732971191],
 'mean radius': [7.760000228881836, 20.600000381469727],
 'mean area': [181.0, 1265.0],
 'mean texture': [24.540000915527344, 29.329999923706055],
 'mean perimeter': [47.91999816894531, 140.10000610351562],
 'mean fractal dimension': [0.05883999913930893, 0.07016000151634216],
 'mean concave points': [0.0, 0.15199999511241913],
 'mean compactness': [0.04362000152468681, 0.2770000100135803],
 'mean concavity': [0.0, 0.3513999879360199],
 'mean symmetry': [0.15870000422000885, 0.23970000445842743],
 'concavity error': [0.0, 0.07117000222206116],
 'compactness error': [0.004660000093281269, 0.06157999858260155],
 'radius error': [0.385699987411499, 0.7260000109672546],
 'texture error': [1.4279999732971191, 1.5950000286102295],
 'area error': [19.149999618530273, 86.22000122070312],
 'perimeter error': [2.5480000972747803, 5.771999835968018],
 'smoothness error': [0.007189000025391579, 0.0065219998

In [50]:
# Converting the features to a DataFrame
features_df = pd.DataFrame.from_dict(data=features)
features_df

Unnamed: 0,patient_id,mean smoothness,mean radius,mean area,mean texture,mean perimeter,mean fractal dimension,mean concave points,mean compactness,mean concavity,...,worst perimeter,fractal dimension error,worst smoothness,worst radius,symmetry error,worst concavity,worst fractal dimension,worst compactness,worst area,concave points error
0,568,0.05263,7.76,181.0,24.540001,47.919998,0.05884,0.0,0.04362,0.0,...,59.16,0.002783,0.08996,9.456,0.02676,0.0,0.07039,0.06444,268.600006,0.0
1,567,0.1178,20.6,1265.0,29.33,140.100006,0.07016,0.152,0.277,0.3514,...,184.600006,0.006185,0.165,25.74,0.02324,0.9387,0.124,0.8681,1821.0,0.01664


In [51]:
# Loading our model and doing inference
reg = load("model.joblib")
predictions = reg.predict(features_df[sorted(features_df.drop("patient_id", axis=1))])
predictions

array([1, 0])

In [58]:
import sqlite3
con_online = sqlite3.connect("data/online_store.db")
print("\n--- Schema of online store ---")
print(
    pd.read_sql_query(
        "SELECT * FROM breast_cancer_target_Field_view", con_online).columns.tolist())

print("\n--- schema details of online store ---")
print(
    pd.read_sql_query(
        "SELECT * FROM breast_cancer_df4_Field_view", con_online))


con_online.close()


--- Schema of online store ---
['entity_key', 'feature_name', 'value', 'vector_value', 'event_ts', 'created_ts']

--- Data of online store ---
                                           entity_key  \
0   b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
1   b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
2   b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
3   b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
4   b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
..                                                ...   
73  b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
74  b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
75  b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
76  b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   
77  b'\x02\x00\x00\x00patient_id\x04\x00\x00\x00\x...   

               feature_name                 value vector_value  \
0      concave points error  b'5\x00\x00\x00\x00'         None   
1            symmetry error        b'5\

In [72]:
#get features from multiple feature view
feature_service = store.get_feature_service("df_featuresrvc")
combined_features = store.get_historical_features(features=feature_service, entity_df=entity_df)
combined_features.to_df()



Unnamed: 0,target,event_timestamp,patient_id,mean area,area error
0,0,2023-04-11 00:07:16.430652+00:00,0,1001.0,153.40
1,0,2023-04-12 00:07:16.430652+00:00,1,1326.0,74.08
2,0,2023-04-13 00:07:16.430652+00:00,2,1203.0,94.03
3,0,2023-04-14 00:07:16.430652+00:00,3,386.1,27.23
4,0,2023-04-15 00:07:16.430652+00:00,4,1297.0,94.44
...,...,...,...,...,...
564,0,2024-10-26 00:07:16.430652+00:00,564,1479.0,158.70
565,0,2024-10-27 00:07:16.430652+00:00,565,1261.0,99.04
566,0,2024-10-28 00:07:16.430652+00:00,566,858.1,48.55
567,0,2024-10-29 00:07:16.430652+00:00,567,1265.0,86.22


In [75]:
features_new = store.get_online_features(
    features=feature_service,    
    entity_rows=[{"patient_id": 568}, {"patient_id": 567}]
).to_dict()

features_new_df = pd.DataFrame.from_dict(data=features_new)
features_new_df

  rows = cur.fetchall()


Unnamed: 0,patient_id,mean area,area error
0,568,181.0,19.15
1,567,1265.0,86.220001
