In [1]:
import pandas as pd
import os

data = { 
    'credit_card_number': ['1111 2222 3333 4444', '1111 2222 3333 4444','1111 2222 3333 4444',
                           '1111 2222 3333 4444'],
    'trans_datetime': ['2022-01-01 08:44', '2022-01-02 19:44', '2022-01-02 20:44', '2022-01-02 20:55'],
    'amount': [142.34, 12.34, 66.29, 112.33],
    'location': ['Sao Paolo', 'Rio De Janeiro', 'Stockholm', 'Stockholm'],
    'fraud': [False, False, True, True] 
}

df = pd.DataFrame.from_dict(data)
df['trans_datetime']= pd.to_datetime(df['trans_datetime'])
df

Unnamed: 0,credit_card_number,trans_datetime,amount,location,fraud
0,1111 2222 3333 4444,2022-01-01 08:44:00,142.34,Sao Paolo,False
1,1111 2222 3333 4444,2022-01-02 19:44:00,12.34,Rio De Janeiro,False
2,1111 2222 3333 4444,2022-01-02 20:44:00,66.29,Stockholm,True
3,1111 2222 3333 4444,2022-01-02 20:55:00,112.33,Stockholm,True


In [2]:
import hopsworks
proj = hopsworks.login(api_key_value=os.getenv("HOPSWORKS_PYTHON_CLIKEY"))
fs = proj.get_feature_store()

2025-03-05 02:40:17,168 INFO: Initializing external client
2025-03-05 02:40:17,169 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 02:40:17,798 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214673


In [5]:
fg = fs.get_or_create_feature_group(
     name="credit_card_transactions",
     version=1,
     description="Credit Card Transaction data",
     primary_key=['credit_card_number'],
     event_time='trans_datetime'
) 

In [6]:
fg.insert(df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1214673/fs/1202301/fg/1401910


Uploading Dataframe: 100.00% |██████████| Rows 4/4 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: credit_card_transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214673/jobs/named/credit_card_transactions_1_offline_fg_materialization/executions


(Job('credit_card_transactions_1_offline_fg_materialization', 'SPARK'), None)

In [7]:
query = fg.select(["amount", "location", "fraud"])
fv = fs.create_feature_view(
    name="credit_card_transactions",
    version=1,
    description="Features from credit card transactions",
    labels=["fraud"],
    query=query
)
fv

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1214673/fs/1202301/fv/credit_card_transactions/version/1


<hsfs.feature_view.FeatureView at 0x7fc9e408ca70>

In [5]:
fv = fs.get_feature_view(name='credit_card_transactions', version=1)
X_train, X_test, y_train, y_test = fv.train_test_split(0.5)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.32s) 




In [8]:
X_train

Unnamed: 0,amount,location
1,66.29,Stockholm
3,142.34,Sao Paolo


In [9]:
td_version, td_job = fv.create_train_test_split(
    description = 'Transactions fraud batch training dataset',
    data_format = 'csv',
    test_size = 0.5,
    write_options = {'wait_for_job': True},
    coalesce = True,
)

Training dataset job started successfully, you can follow the progress at 
http://c.app.hopsworks.ai/p/1214673/jobs/named/credit_card_transactions_1_create_fv_td_04032025010015/executions
2025-03-04 01:00:25,465 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-03-04 01:00:28,533 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-04 01:00:31,600 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-04 01:01:26,791 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-03-04 01:01:26,848 INFO: Waiting for log aggregation to finish.
2025-03-04 01:01:35,266 INFO: Execution finished successfully.
2025-03-04 01:01:35,267 INFO: Provenance cached data - overwriting last accessed/created training dataset from 3 to 4.




In [15]:
X_train, y_train, X_test, y_test = fv.get_train_test_split(4)
X_train

2025-03-04 01:13:23,759 INFO: Provenance cached data - overwriting last accessed/created training dataset from 1 to 4.


Unnamed: 0,amount,location
0,66.29,Stockholm
1,112.33,Stockholm


In [17]:
df2 = df[["credit_card_number", "amount"]].groupby("credit_card_number").sum()
df2.rename(columns={"amount": "total_spent"}, inplace=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 1111 2222 3333 4444 to 1111 2222 3333 4444
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_spent  1 non-null      float64
dtypes: float64(1)
memory usage: 16.0+ bytes


In [18]:
df2


Unnamed: 0_level_0,total_spent
credit_card_number,Unnamed: 1_level_1
1111 2222 3333 4444,333.3


In [19]:
df2["as_of_datetime"] = df[["credit_card_number", "trans_datetime"]].groupby("credit_card_number").max()
df2

Unnamed: 0_level_0,total_spent,as_of_datetime
credit_card_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1111 2222 3333 4444,333.3,2022-01-02 20:55:00


In [20]:
df2.reset_index(inplace=True)
df2

Unnamed: 0,credit_card_number,total_spent,as_of_datetime
0,1111 2222 3333 4444,333.3,2022-01-02 20:55:00


In [21]:
fg2 = fs.get_or_create_feature_group(
     name="credit_card_spending",
     version=1,
     description="Credit Card Spending",
     primary_key=['credit_card_number'],
     event_time='as_of_datetime'
) 
fg2

<hsfs.feature_group.FeatureGroup at 0x709efe901e20>

In [22]:
fg2.insert(df2, write_options={"wait_for_job": False})
fg2

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1214673/fs/1202301/fg/1403438


Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: credit_card_spending_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214673/jobs/named/credit_card_spending_1_offline_fg_materialization/executions


<hsfs.feature_group.FeatureGroup at 0x709efe901e20>

In [23]:
more_data = { 
    'credit_card_number': ['9999 8888 7777 6666', '9999 8888 7777 6666','9999 8888 7777 6666',
                           '9999 8888 7777 6666'],
    'trans_datetime': ['2022-01-02 04:11', '2022-01-03 07:24', '2022-01-05 10:33', '2022-01-05 11:50'],
    'amount': [55.67, 84, 77.95, 183],
    'location': ['San Francisco', 'San Francisco', 'Dublin', 'Dublin'],
    'fraud': [False, False, False, False] 
}

df3 = pd.DataFrame.from_dict(more_data)
df3['trans_datetime']= pd.to_datetime(df3['trans_datetime'])

fg = fs.get_feature_group(name="credit_card_transactions", version=1)

fg.insert(df3, write_options={"wait_for_job": False})

Uploading Dataframe: 100.00% |██████████| Rows 4/4 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: credit_card_transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214673/jobs/named/credit_card_transactions_1_offline_fg_materialization/executions


(Job('credit_card_transactions_1_offline_fg_materialization', 'SPARK'), None)

In [3]:
fg = fs.get_feature_group(name="credit_card_transactions", version=1)
df5 = fg.read()
df5

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.56s) 


Unnamed: 0,credit_card_number,trans_datetime,amount,location,fraud
0,1111 2222 3333 4444,2022-01-02 19:44:00+00:00,12.34,Rio De Janeiro,False
1,1111 2222 3333 4444,2022-01-02 20:44:00+00:00,66.29,Stockholm,True
2,1111 2222 3333 4444,2022-01-02 20:55:00+00:00,112.33,Stockholm,True
3,1111 2222 3333 4444,2022-01-01 08:44:00+00:00,142.34,Sao Paolo,False
4,9999 8888 7777 6666,2022-01-03 07:24:00+00:00,84.0,San Francisco,False
5,9999 8888 7777 6666,2022-01-05 10:33:00+00:00,77.95,Dublin,False
6,9999 8888 7777 6666,2022-01-05 11:50:00+00:00,183.0,Dublin,False
7,9999 8888 7777 6666,2022-01-02 04:11:00+00:00,55.67,San Francisco,False


In [4]:
df5 = df5.set_index('trans_datetime')

In [5]:
df5 = df5.sort_index()

In [6]:
df5['rolling_max_1d'] = df5.rolling('1D').amount.max()
df5['rolling_min_1d'] = df5.rolling('1D').amount.min()
df5

Unnamed: 0_level_0,credit_card_number,amount,location,fraud,rolling_max_1d,rolling_min_1d
trans_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 08:44:00+00:00,1111 2222 3333 4444,142.34,Sao Paolo,False,142.34,142.34
2022-01-02 04:11:00+00:00,9999 8888 7777 6666,55.67,San Francisco,False,142.34,55.67
2022-01-02 19:44:00+00:00,1111 2222 3333 4444,12.34,Rio De Janeiro,False,55.67,12.34
2022-01-02 20:44:00+00:00,1111 2222 3333 4444,66.29,Stockholm,True,66.29,12.34
2022-01-02 20:55:00+00:00,1111 2222 3333 4444,112.33,Stockholm,True,112.33,12.34
2022-01-03 07:24:00+00:00,9999 8888 7777 6666,84.0,San Francisco,False,112.33,12.34
2022-01-05 10:33:00+00:00,9999 8888 7777 6666,77.95,Dublin,False,77.95,77.95
2022-01-05 11:50:00+00:00,9999 8888 7777 6666,183.0,Dublin,False,183.0,77.95


In [7]:
df5.reset_index(inplace=True)

In [8]:
fg_agg = fs.get_or_create_feature_group(
     name="credit_card_rolling_windows",
     version=1,
     description="Daily Credit Card Spending",
     primary_key=['credit_card_number'],
     event_time='trans_datetime'
) 

In [9]:
fg_agg.insert(df5)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1214673/fs/1202301/fg/1403521


Uploading Dataframe: 100.00% |██████████| Rows 8/8 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: credit_card_rolling_windows_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1214673/jobs/named/credit_card_rolling_windows_1_offline_fg_materialization/executions


(Job('credit_card_rolling_windows_1_offline_fg_materialization', 'SPARK'),
 None)

In [11]:
query = fg.select_all().join(fg_agg.select(['rolling_max_1d', 'rolling_min_1d']))

training_data = query.read()
training_data.head()

2025-03-05 03:02:23,834 ERROR: Peer ServerlessFstore__majidali is not known. Please register client certificates first.. Detail: Python exception: FlyingDuckException. gRPC client debug context: UNKNOWN:Error received from peer ipv4:51.79.26.27:5005 {created_time:"2025-03-05T03:02:23.834713087+00:00", grpc_status:2, grpc_message:"Peer ServerlessFstore__majidali is not known. Please register client certificates first.. Detail: Python exception: FlyingDuckException"}. Client context: IOError: Server never sent a data message. Detail: Internal
Traceback (most recent call last):
  File "/home/codespace/.pyenv/versions/3.9.21/lib/python3.9/site-packages/hsfs/core/arrow_flight_client.py", line 394, in afs_error_handler_wrapper
    return func(instance, *args, **kw)
  File "/home/codespace/.pyenv/versions/3.9.21/lib/python3.9/site-packages/hsfs/core/arrow_flight_client.py", line 459, in read_query
    return self._get_dataset(
  File "/home/codespace/.pyenv/versions/3.9.21/lib/python3.9/site-

Unnamed: 0,credit_card_number,trans_datetime,amount,location,fraud,rolling_max_1d,rolling_min_1d
0,9999 8888 7777 6666,2022-01-02 04:11:00+00:00,55.67,San Francisco,False,142.34,55.67
1,9999 8888 7777 6666,2022-01-03 07:24:00+00:00,84.0,San Francisco,False,112.33,12.34
2,9999 8888 7777 6666,2022-01-05 10:33:00+00:00,77.95,Dublin,False,77.95,77.95
3,9999 8888 7777 6666,2022-01-05 11:50:00+00:00,183.0,Dublin,False,183.0,77.95
4,1111 2222 3333 4444,2022-01-01 08:44:00+00:00,142.34,Sao Paolo,False,142.34,142.34
