In [1]:
import hopsworks
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

In [2]:
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/213141
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# retrieve feature groups
traffic_fg = fs.get_feature_group(name='traffic_flow_data', version=1)
weather_fg = fs.get_feature_group(name='weather_data', version=1)

### Process traffic data

In [4]:
# Rerieve data
query = traffic_fg.select_all()
# query.show(100)
traffic_fv = fs.get_or_create_feature_view(
    name='traffic_trainset',
    query=query,
    version=4,
    labels=['current_speed']
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/213141/fs/213060/fv/traffic_trainset/version/4


In [10]:
# Create train set
# yesterday = (datetime.now()-timedelta(days=1)).day
# today = datetime.now().day
traffic_fv.create_training_data()

Training dataset job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/213141/jobs/named/traffic_trainset_4_create_fv_td_01012024150129/executions




(2, <hsfs.core.job.Job at 0x1dffd627a60>)

In [17]:
traffic_train = traffic_fv.get_training_data(training_dataset_version=2)

In [18]:
traffic_train = pd.concat([traffic_train[0], traffic_train[1]], axis=1)

In [22]:
# Extract day 27, 28 as a trainset
traffic_train  = traffic_train[(traffic_train['day']==27) | (traffic_train['day']==28)]

In [26]:
traffic_train = pd.read_csv('traffic_trainset.csv')

In [27]:
traffic_train

Unnamed: 0,free_flow_speed,confidence,weekend,day,hour,minute,current_speed
0,36,1.0,False,27,0,0,36
1,36,1.0,False,27,0,15,36
2,36,1.0,False,27,0,30,36
3,36,1.0,False,27,0,45,36
4,36,1.0,False,27,1,0,36
...,...,...,...,...,...,...,...
187,36,1.0,False,28,22,45,36
188,36,1.0,False,28,23,0,36
189,36,1.0,False,28,23,15,36
190,36,1.0,False,28,23,30,36


In [43]:
# Make a segment of traffic data every hour (use the first data per hour)
columns = ['free_flow_speed', 'confidence', 'weekend', 'day', 'hour', 'current_speed']
traffic_train_new = pd.DataFrame(columns=columns)
for day in [27, 28]:
    for hour in range(24):
        first_data_per_hour = traffic_train[(traffic_train['day']==day) & (traffic_train['hour']==hour) & (traffic_train['minute']==0)]
        new_data = {}
        for column in columns:
            new_data[f'{column}']=first_data_per_hour[f'{column}'].iloc[0]
        traffic_train_new.loc[len(traffic_train_new)] = new_data

traffic_train_new

Unnamed: 0,free_flow_speed,confidence,weekend,day,hour,current_speed
0,36,1.0,False,27,0,36
1,36,1.0,False,27,1,36
2,36,1.0,False,27,2,36
3,36,1.0,False,27,3,36
4,36,1.0,False,27,4,36
5,36,1.0,False,27,5,36
6,35,0.971429,False,27,6,32
7,35,0.976241,False,27,7,25
8,36,0.996419,False,27,8,25
9,35,0.99433,False,27,9,24


In [45]:
# Save the traffic trainset as a csv file
traffic_train_new.to_csv('traffic_trainset_hour.csv', index=False)

### Process weather data

In [4]:
# Create feature view
query = weather_fg.select_all()
try:
    weather_fv = fs.get_feature_view(name='weather_trainset', version=1)
except:
    weather_fv = fs.get_or_create_feature_view(
        name='weather_trainset',
        query=query,
        version=1
    )

In [12]:
# Create training set
# weather_fv.create_training_data()
weather_train = weather_fv.get_training_data(training_dataset_version=1)[0]

In [13]:
# Extract day 27 and 28 as a trainset
weather_train  = weather_train[(weather_train['day']==27) | (weather_train['day']==28)]

In [14]:
weather_train = weather_train.reset_index(drop=True)

In [15]:
# Fill empty hours (use the nearest data)
for day in [27, 28]:
    for hour in range(24):
        ind = hour + 24*(day%27)
        if weather_train.loc[ind]['hour']!=hour:
            i=1
            hours_of_the_day = weather_train[weather_train['day']==day]['hour'].to_list()
            while hour-i not in hours_of_the_day and \
                hour+i not in hours_of_the_day:
                i+=1
            if hour-i in hours_of_the_day:
                new_data = weather_train[(weather_train['day']==day) & (weather_train['hour']==hour-i)]
                new_data['hour']=hour
                weather_train = pd.concat([weather_train.iloc[:ind], new_data, weather_train.iloc[ind:]]).reset_index(drop=True)
            elif hour+i in hours_of_the_day:
                new_data = weather_train[(weather_train['day']==day) & (weather_train['hour']==hour+i)]
                new_data['hour']=hour
                weather_train = pd.concat([weather_train.iloc[:ind], new_data, weather_train.iloc[ind:]]).reset_index(drop=True)

weather_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,day,hour,temp,wd,ws,prec1h,frsn1h,vis
0,27,0,-5.6,275,4.2,0.0,0.0,60.0
1,27,1,-5.6,275,4.2,0.0,0.0,60.0
2,27,2,-5.6,275,4.2,0.0,0.0,60.0
3,27,3,-5.3,281,4.0,0.0,0.0,58.7
4,27,4,-5.3,281,4.0,0.0,0.0,58.7
5,27,5,-3.5,287,5.1,0.0,0.0,58.9
6,27,6,-3.5,287,5.1,0.0,0.0,58.9
7,27,7,-3.5,287,5.1,0.0,0.0,58.9
8,27,8,-2.7,286,5.1,0.0,0.0,55.8
9,27,9,-2.2,285,5.7,0.0,0.0,49.8


In [16]:
# Save the weather trainset as a csv file
weather_train.to_csv('weather_trainset.csv', index=False)

: 

In [46]:
hopsworks.logout()

Connection closed.
