Load the big data file es_forecast.csv via vaex in memory-friendly chunks into several HDF5 files and store those in the directory data/es_forecast:

In [1]:
# Main data packages. 
import numpy as np
import pandas as pd

import os, vaex
from tqdm.notebook import tqdm

import datetime

In [17]:


# Read in the first two lines of data and store the data types
# in a dictionary. 
df_first2 = vaex.from_csv('data/es_forecast.csv', nrows = 30)
column_types = {name: (str(dtype.__name__) if isinstance(dtype, type) else 'float64') for name,dtype in df_first2.dtypes.to_dict().items()}

# Create a new directory in the data folder that holds the 
# converted data chunks for further preprocessing.
!mkdir data/es_forecast

# Read in the data in memory-friendly chunks.
# Export the data in the data format 'HDF5'.
for i, df in tqdm(enumerate(vaex.from_csv('data/es_forecast.csv', chunk_size=200_000, dtype=column_types)), total=185):
    df.export_hdf5(f'data/es_forecast/es_forecast_{i:02}.hdf5')
    
# Count the files exported.
print( f"# HDF5 Files in data/es_forecast: {len(os.listdir('data/es_forecast/'))}")

  0%|          | 0/185 [00:00<?, ?it/s]

# HDF5 Files in data/es_forecast: 185


Check the folder size in data/es_forecast:

In [114]:
# Check the folder size.
!du -h data/es_forecast

du: data/es_forecast: No such file or directory


In [2]:
# Import all .hdf5 chunks into a single dataset and
# export it to a single .hdf5 file we can work on
df = vaex.open('data/es_forecast/es_forecast*')
df.export_hdf5('data/es_forecast_aggregated.hdf5')

# Remove the temporary file directory from the data 
# preprocessing step.
!rm -r data/es_forecast

ERROR:MainThread:vaex:error opening 'data/es_forecast/es_forecast*'


OSError: Could not open file: data/es_forecast/es_forecast*, it does not exist

In [20]:
# Open the HDF5 file and show the dimensions
df = vaex.open('data/es_forecast_aggregated.hdf5')
print(f'Data dimensions: {df.shape}')

Data dimensions: (36876431, 6)


In [21]:
# Preview the data
df.head(20)

#,dt_start_utc,power_mw,carrier,type,area,version_utc
<i style='opacity: 0.6'>0</i>,2016-12-31 23:00:00,6291.0,Gesamt,Load Forecast,50Hertz,2019-09-20 14:00:00
<i style='opacity: 0.6'>1</i>,2016-12-31 23:00:00,40241.0,Gesamt,Load Forecast,DE,2019-09-20 14:00:00
<i style='opacity: 0.6'>2</i>,2016-12-31 23:00:00,2942.0,Gesamt,Load Forecast,DK,2019-09-20 14:00:00
<i style='opacity: 0.6'>3</i>,2016-12-31 23:00:00,1572.0,Gesamt,Load Forecast,DK1,2019-09-20 14:00:00
<i style='opacity: 0.6'>4</i>,2016-12-31 23:00:00,11971.0,Gesamt,Load Forecast,TTG,2019-09-20 14:00:00
...,...,...,...,...,...,...
<i style='opacity: 0.6'>15</i>,2016-12-31 23:00:00,331.0,Wind Offshore,Renewables Forecast,50Hertz,2019-09-20 14:00:00
<i style='opacity: 0.6'>16</i>,2016-12-31 23:00:00,3340.0,Wind Offshore,Renewables Forecast,DE,2019-09-20 14:00:00
<i style='opacity: 0.6'>17</i>,2016-12-31 23:00:00,1004.0,Wind Offshore,Renewables Forecast,DK,2019-09-20 14:00:00
<i style='opacity: 0.6'>18</i>,2016-12-31 23:00:00,652.0,Wind Offshore,Renewables Forecast,DK1,2019-09-20 14:00:00


In [22]:
# Show some summary statistics
df.describe()

Unnamed: 0,dt_start_utc,power_mw,carrier,type,area,version_utc
dtype,str,float64,str,str,str,str
count,36876431,36876431,36876431,36876431,36876431,36876431
,0,0,0,0,0,0
mean,--,8283.930682527276,--,--,--,--
std,--,14112.233684,--,--,--,--
min,--,0.0,--,--,--,--
max,--,76393.0,--,--,--,--


# How to filter via utc timestamps in the hdf5 dataframe

In order to filter via dates in the vaex df, you have to convert the information in the 'dt_start_utc' column to 'datetime64' and the define a filter variable (here: start_date) according to below code. At the beginning of the notebook you can see the necessary imports (import datetime to use the below code).

In [23]:
df['dt_start_utc']=df['dt_start_utc'].astype('datetime64')

In [24]:
start_date=np.datetime64('2018-01-01')
df_date_filter2=df[(df.dt_start_utc >= start_date)]

In [25]:
df_date_filter2.tail(40)

#,dt_start_utc,power_mw,carrier,type,area,version_utc
<i style='opacity: 0.6'>0</i>,2020-08-28 06:00:00,524.0,Wind Offshore,Renewables Forecast,DK1,2020-08-28 20:01:46
<i style='opacity: 0.6'>1</i>,2020-08-28 06:00:00,3798.0,Wind Offshore,Renewables Forecast,TTG,2020-08-27 16:02:00
<i style='opacity: 0.6'>2</i>,2020-08-28 06:00:00,3798.0,Wind Offshore,Renewables Forecast,TTG,2020-08-27 17:02:17
<i style='opacity: 0.6'>3</i>,2020-08-28 06:00:00,3798.0,Wind Offshore,Renewables Forecast,TTG,2020-08-27 18:01:50
<i style='opacity: 0.6'>4</i>,2020-08-28 06:00:00,3798.0,Wind Offshore,Renewables Forecast,TTG,2020-08-27 19:01:49
...,...,...,...,...,...,...
<i style='opacity: 0.6'>35</i>,2020-08-28 06:00:00,4547.0,Wind Onshore,Renewables Forecast,50Hertz,2020-08-27 21:01:44
<i style='opacity: 0.6'>36</i>,2020-08-28 06:00:00,4547.0,Wind Onshore,Renewables Forecast,50Hertz,2020-08-27 22:02:25
<i style='opacity: 0.6'>37</i>,2020-08-28 06:00:00,4547.0,Wind Onshore,Renewables Forecast,50Hertz,2020-08-27 23:01:56
<i style='opacity: 0.6'>38</i>,2020-08-28 06:00:00,4547.0,Wind Onshore,Renewables Forecast,50Hertz,2020-08-28 00:02:11


In [13]:
df_date_filter2.describe()

Unnamed: 0,dt_start_utc,power_mw,carrier,type,area,version_utc
dtype,datetime64[s],float64,str,str,str,str
count,36263209,36263209,36263209,36263209,36263209,36263209
,0,0,0,0,0,0
mean,2020-07-27T04:22:40,8305.525886498352,--,--,--,--
std,18789173.071796,14123.4004,--,--,--,--
min,2018-01-01T00:00:00,0.0,--,--,--,--
max,2021-07-14T21:45:36,76393.0,--,--,--,--
