# Feature pipeline (daily)

explanation here

### Load imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import hopsworks
from datetime import datetime, timedelta, date
from entsoe import EntsoePandasClient
import time

#### Helper functions (timestamp)

In [48]:
# # functions for replacing date and time with timestamp (seconds since 1970-01-01)

# def entsoe_timestamp_2_time(x):
#     dt_obj = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
#     dt_obj = dt_obj.timestamp() * 1000
#     return int(dt_obj)

# def weather_timestamp_2_time(x, i):
#     dt_obj = datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
#     dt_obj = dt_obj + timedelta(hours=i)
#     dt_obj = dt_obj.timestamp() * 1000

#     return int(dt_obj)

## Fetch & Parse data

In [5]:
today = datetime.today().date().strftime('%Y%m%d')
today

'20230114'

In [3]:
## Get current date and time for prediction, prediction and actual data is available 

# a. used to retrieve earlier dates, 
#date_from = "20230112"

# b. retrieve from the past day (default)
date_from = datetime.now() - timedelta(days=1)
date_from = date_from.date().strftime('%Y%m%d')

# 1 day ahead
date_to = (datetime.strptime(date_from, '%Y%m%d') + timedelta(days=1)).strftime('%Y%m%d')

# time
# time = datetime.now().time().strftime('%H')
date_from, date_to

('20230114', '20230115')

In [32]:
date_from = datetime.now() - timedelta(days=3)
date_from = date_from.date().strftime('%Y%m%d')
date_to = (datetime.strptime(date_from, '%Y%m%d') + timedelta(days=4)).strftime('%Y%m%d')

date_from, date_to

('20230111', '20230115')

### Entsoe API

In [42]:
# Client
client = EntsoePandasClient(api_key="cb3a29b2-3276-4a4c-aba3-6507120d99be")

# Date and country
start = pd.Timestamp(date_from, tz='Europe/Stockholm')
end = pd.Timestamp(date_to, tz='Europe/Stockholm')
country_code = 'SE_3'  

In [43]:
## Query entsoe

# Day price
df_day_price = client.query_day_ahead_prices(country_code, start=start,end=end)

# Generation per production type
df_generation_per_prod = client.query_generation(country_code, start=start,end=end, psr_type=None)

# Actual load (consumption)
df_load = client.query_load(country_code, start=start,end=end)

In [44]:
df_generation_per_prod.head()

Unnamed: 0,Fossil Gas,Hydro Water Reservoir,Nuclear,Other,Solar,Wind Onshore
2023-01-03 00:00:00+01:00,0.0,689.0,5798.0,832.0,0.0,814.0
2023-01-03 01:00:00+01:00,0.0,671.0,5797.0,825.0,0.0,839.0
2023-01-03 02:00:00+01:00,0.0,672.0,5797.0,837.0,0.0,893.0
2023-01-03 03:00:00+01:00,0.0,666.0,5797.0,845.0,0.0,926.0
2023-01-03 04:00:00+01:00,0.0,662.0,5798.0,878.0,0.0,935.0


In [45]:
# Combine entsoe data
df_entsoe = df_generation_per_prod.join(df_day_price.rename("day_ahead_price"))
df_entsoe = df_entsoe.join(df_load)


In [46]:
# convert current index (date) into column, rename and convert into timestamp (as int64)
df_entsoe_clean = df_entsoe.reset_index()
df_entsoe_clean = df_entsoe_clean.rename(columns = {'index':'datetime'})
df_entsoe_clean['datetime'] = df_entsoe_clean.DateTime.values.astype('int64') // 10 ** 6  ## divide by 10^6 to convert from ns to ms


In [47]:
df_entsoe_clean # gmt + 1

Unnamed: 0,DateTime,Fossil Gas,Hydro Water Reservoir,Nuclear,Other,Solar,Wind Onshore,day_ahead_price,Actual Load
0,1672700400000,0.0,689.0,5798.0,832.0,0.0,814.0,78.81,10026.0
1,1672704000000,0.0,671.0,5797.0,825.0,0.0,839.0,73.93,9951.0
2,1672707600000,0.0,672.0,5797.0,837.0,0.0,893.0,73.94,9940.0
3,1672711200000,0.0,666.0,5797.0,845.0,0.0,926.0,71.44,9933.0
4,1672714800000,0.0,662.0,5798.0,878.0,0.0,935.0,72.33,9898.0
...,...,...,...,...,...,...,...,...,...
235,1673546400000,0.0,1195.0,5788.0,836.0,0.0,992.0,98.37,11594.0
236,1673550000000,0.0,1154.0,5788.0,800.0,0.0,1007.0,74.08,11074.0
237,1673553600000,0.0,1130.0,5791.0,787.0,0.0,1000.0,74.50,10720.0
238,1673557200000,0.0,1099.0,5796.0,754.0,0.0,1016.0,66.84,10132.0


### SMHI

In [15]:
import json
from urllib.request import urlopen
from pandas import json_normalize

In [18]:
## fetch data
url = "https://opendata-download-metobs.smhi.se/api/version/latest/parameter/1/station/71420/period/latest-months/data.json"
response = urlopen(url)

# convert response to json, to dataframe
data_json = json.loads(response.read())
df_smhi_data = json_normalize(data_json['value']) 

# get timestamps the specified day (or latest)
timeseries_from = df_entsoe_clean["datetime"].iloc[0]
timeseries_to = df_entsoe_clean["datetime"].iloc[-1]

# #extract only the temperature in the time stamp interval
df_smhi_data = df_smhi_data.loc[(df_smhi_data['date'] >= timeseries_from) & (df_smhi_data['date'] <= timeseries_to)]
df_smhi_data = df_smhi_data.reset_index().rename(columns = {'date':'datetime'})

# data_json = json.loads(response.read())
# df_smhi_data = json_normalize(data_json['timeSeries'][0])
# for i in range(10):
#     print(df_smhi_data.parameters[i][10]['values'])# get timestamps the specified day (or latest)


In [13]:
def get_current_date():
    yesterday = datetime.today() - timedelta(days=1)
    yesterday = yesterday.date().strftime('%Y%m%d')
    tomorrow = (datetime.strptime(yesterday, '%Y%m%d') + timedelta(days=2)).strftime('%Y%m%d')

#     date_from = datetime.now() - timedelta(days=1)
# date_from = date_from.date().strftime('%Y%m%d')

# 1 day ahead   
    #date_to = (datetime.strptime(date_from, '%Y%m%d') + timedelta(days=1)).strftime('%Y%m%d')

    return yesterday, tomorrow

def get_entsoe_data(date_from, date_to):
    # Client
    client = EntsoePandasClient(api_key="cb3a29b2-3276-4a4c-aba3-6507120d99be")

    # Date and country
    start = pd.Timestamp(date_from, tz='Europe/Stockholm')
    end = pd.Timestamp(date_to, tz='Europe/Stockholm')
    country_code = 'SE_3'  

    df_day_price = client.query_day_ahead_prices(country_code, start=start,end=end)
    df_generation_per_prod = client.query_generation(country_code, start=start,end=end, psr_type=None)    
    df_load = client.query_load(country_code, start=start,end=end)

    df_entsoe = df_generation_per_prod.join(df_day_price.rename("day_ahead_price"))
    df_entsoe = df_entsoe.join(df_load)

    df_entsoe_clean = df_entsoe.reset_index()
    df_entsoe_clean = df_entsoe_clean.rename(columns = {'index':'DateTime'})
    df_entsoe_clean['DateTime'] = df_entsoe_clean.DateTime.values.astype('int64') // 10 ** 6

    col_list = ["Hydro Water Reservoir", "Nuclear", "Other", "Solar", "Wind Onshore"]
    df_entsoe_clean['total_generation'] = df_entsoe_clean[list(col_list)].sum(axis=1)

    df_entsoe_clean.drop(col_list + ["Fossil Gas"], axis=1, inplace=True)
    df_entsoe_clean.rename(columns={"Actual Load": "total_load", "DateTime":"datetime"}, inplace=True)

    # df_entsoe = df_generation_forecast.join(df_day_price.rename("day_ahead_price"))
    # df_entsoe = df_entsoe.join(df_load_forecast)
    return df_entsoe_clean 

date_from, date_to = get_current_date()
test = get_entsoe_data(date_from, date_to)
df_entsoe_clean = test.tail(24)


In [21]:
df_entsoe_clean

Unnamed: 0,datetime,day_ahead_price,total_load,total_generation
19,1673632800000,91.07,11029.0,9047.0
20,1673636400000,84.94,10302.0,8927.0
21,1673640000000,76.89,9807.0,8789.0
22,1673643600000,76.51,9380.0,8640.0
23,1673647200000,68.3,9102.0,8555.0
24,1673650800000,49.71,8918.0,8334.0
25,1673654400000,44.24,8795.0,8285.0
26,1673658000000,45.41,8627.0,8347.0
27,1673661600000,42.25,8620.0,8297.0
28,1673665200000,41.7,8733.0,8406.0


In [20]:
df_smhi_data

Unnamed: 0,index,datetime,value,quality
0,3111,1673632800000,6.6,G
1,3112,1673636400000,6.4,G
2,3113,1673640000000,6.5,G
3,3114,1673643600000,6.1,G
4,3115,1673647200000,6.3,G
5,3116,1673650800000,6.2,G
6,3117,1673654400000,6.3,G
7,3118,1673658000000,5.9,G
8,3119,1673661600000,5.6,G
9,3120,1673665200000,5.9,G


## Combine & clean final data

In [51]:
# combine Entsoe and SMHI data
df_feature_data = df_entsoe_clean.merge(df_smhi_data, how='inner', on='DateTime')

# create column total_generation, the sum of all production types                       
col_list = ["Hydro Water Reservoir", "Nuclear", "Other", "Solar", "Wind Onshore"]
df_feature_data['total_generation'] = df_feature_data[list(col_list)].sum(axis=1)

# drop redundant/irrelevant columns
df_feature_data.drop(col_list + ["Fossil Gas", "index", "quality"], axis=1, inplace=True)

# Convert into float type
df_feature_data["value"] = df_feature_data["value"].astype(float)

# rename to matching columns names
df_feature_data.rename(columns={"Actual Load": "total_load", "value": "temperature", "DateTime":"datetime"}, inplace=True)

df_feature_data.head()

Unnamed: 0,datetime,day_ahead_price,total_load,temperature,total_generation
0,1672700400000,78.81,10026.0,3.6,8133.0
1,1672704000000,73.93,9951.0,1.9,8132.0
2,1672707600000,73.94,9940.0,0.8,8199.0
3,1672711200000,71.44,9933.0,0.3,8234.0
4,1672714800000,72.33,9898.0,0.6,8273.0


## Add to feature group

In [52]:
df_feature_data

Unnamed: 0,datetime,day_ahead_price,total_load,temperature,total_generation
0,1672700400000,78.81,10026.0,3.6,8133.0
1,1672704000000,73.93,9951.0,1.9,8132.0
2,1672707600000,73.94,9940.0,0.8,8199.0
3,1672711200000,71.44,9933.0,0.3,8234.0
4,1672714800000,72.33,9898.0,0.6,8273.0
...,...,...,...,...,...
235,1673546400000,98.37,11594.0,4.5,8811.0
236,1673550000000,74.08,11074.0,5.1,8749.0
237,1673553600000,74.50,10720.0,5.1,8708.0
238,1673557200000,66.84,10132.0,5.3,8665.0


In [39]:
import hopsworks

project = hopsworks.login() 
fs = project.get_feature_store() 

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/4247




Connected. Call `.close()` to terminate connection gracefully.


In [53]:
new_electricity_data_fg = fs.get_feature_group(name = 'new_electricity_data_fg', version = 2)

In [54]:
new_electricity_data_fg.insert(df_feature_data)

Uploading Dataframe: 100.00% |██████████| Rows 240/240 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/4247/jobs/named/new_electricity_data_fg_2_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7f214c7aa940>, None)