# How to convert a pandas-written HDF file with multiple keys into a vaex-written HDF file
#### The keys will be stored as strings in a new column

First we will load the necessary packages

In [1]:
import pandas as pd
import numpy as np
import vaex as vx

Next we will inform the code where the original file is and where we would like the new file to go

In [2]:
# Input: the HDF file written by pandas; output: the HDF5 file vaex will write
pandas_path, vaex_path = 'test.h5', 'test_vaex.hdf5'

We now need to load the pandas HDF file as a store using the pandas package

In [3]:
# Open the pandas HDF file in read-only mode
store = pd.HDFStore(path=pandas_path, mode='r')

INFO:MainThread:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:MainThread:numexpr.utils:NumExpr defaulting to 8 threads.


As we will be going through each key in the HDF store, we need an empty list to collect the dataframe from each key
- Why we do this should become clearer once you reach the cell after the loop

In [4]:
# Will hold one data frame per key of the HDF store
dfs = list()

For the next step we need to know the names of the keys in the HDF file; the following line retrieves them

In [5]:
# Display the list of keys held in the HDF store
store.keys()

['/f06',
 '/f07',
 '/f08',
 '/f09',
 '/f12',
 '/f13',
 '/f14',
 '/f15',
 '/f16',
 '/f17',
 '/f18']

The satellite name can be retrieved from the key

In [6]:
# Use the first key of the store as a worked example for the steps below
key= store.keys()[0]
key

'/f06'

In [7]:
# Skip the first character (the leading '/') to leave just the satellite name
key[1:]

'f06'

The next step is to:
- loop through the keys
- load the dataframe for the key using store.select(key)
- convert the dataframe into vaex format
- create a new column that refers to the key the data comes from
- add the dataframe to the list "dfs"

First let's look at each step individually before putting them together in the loop

Here we select the dataframe with the key we chose above

In [8]:
# Read everything stored under the example key into a pandas data frame
data = store.select(key=key)
data

Unnamed: 0,ELE_TOTAL_ENERGY_FLUX,APEX_LAT,APEX_MLT,Kilcommons_ELE_ENERGY_FLUX,aurora,omni_By,omni_Bz
1987-01-01 00:00:00,,40.382637,5.544327,0.000000e+00,False,,
1987-01-01 00:01:40,,45.788013,5.439890,0.000000e+00,False,,
1987-01-01 00:03:20,,51.172146,5.313400,0.000000e+00,False,,
1987-01-01 00:05:00,,56.465580,5.157762,0.000000e+00,False,,
1987-01-01 00:06:40,,61.619465,4.961373,0.000000e+00,False,,
...,...,...,...,...,...,...,...
1987-07-21 23:51:40,8.050353e+07,-66.042656,18.260845,0.000000e+00,False,,
1987-07-21 23:53:20,4.335704e+09,-71.643639,18.200677,1.315906e+10,True,,
1987-07-21 23:55:00,3.078362e+09,-77.254066,18.088217,3.720737e+08,False,,
1987-07-21 23:56:40,0.000000e+00,-82.843079,17.809943,0.000000e+00,False,,


Next we convert into the vaex format

In [9]:
# Convert the pandas data frame for the example key into a vaex data frame.
# The frame is read from the store here rather than converting `data` in place, so the
# cell can be re-run without passing an already converted vaex frame to from_pandas.
# copy_index=True keeps the pandas index as a regular column called "index".
data = vx.from_pandas(store.select(key), copy_index=True)
data

#,ELE_TOTAL_ENERGY_FLUX,APEX_LAT,APEX_MLT,Kilcommons_ELE_ENERGY_FLUX,aurora,omni_By,omni_Bz,index
0,,40.38263702392578,5.544326922848863,0.0,False,,,1987-01-01 00:00:00.000000000
1,,45.78801345825195,5.439889926138547,0.0,False,,,1987-01-01 00:01:40.000000000
2,,51.17214584350586,5.313399980995317,0.0,False,,,1987-01-01 00:03:20.000000000
3,,56.465579986572266,5.157761853469031,0.0,False,,,1987-01-01 00:05:00.000000000
4,,61.61946487426758,4.961372861905423,0.0,False,,,1987-01-01 00:06:40.000000000
...,...,...,...,...,...,...,...,...
83803,80503528.0,-66.04265594482422,18.260845121092423,0.0,False,,,1987-07-21 23:51:40.000000000
83804,4335703552.0,-71.64363861083984,18.200676976056375,13159058432.0,True,,,1987-07-21 23:53:20.000000000
83805,3078361856.0,-77.25406646728516,18.088217058900128,372073664.0,False,,,1987-07-21 23:55:00.000000000
83806,0.0,-82.84307861328125,17.80994257960121,0.0,False,,,1987-07-21 23:56:40.000000000


Now we add a new column holding the satellite name, so we know which key each row came from. You might have to scroll to the right to see the new column

In [10]:
# Add a column that repeats the satellite name (the key without its leading '/') on every row.
# np.full creates the array directly, without first building a Python list containing
# one string per row, and it keeps a string dtype even if the frame has no rows.
data['sat_id'] = np.full(len(data), key[1:])
data

#,ELE_TOTAL_ENERGY_FLUX,APEX_LAT,APEX_MLT,Kilcommons_ELE_ENERGY_FLUX,aurora,omni_By,omni_Bz,index,sat_id
0,,40.38263702392578,5.544326922848863,0.0,False,,,1987-01-01 00:00:00.000000000,'f06'
1,,45.78801345825195,5.439889926138547,0.0,False,,,1987-01-01 00:01:40.000000000,'f06'
2,,51.17214584350586,5.313399980995317,0.0,False,,,1987-01-01 00:03:20.000000000,'f06'
3,,56.465579986572266,5.157761853469031,0.0,False,,,1987-01-01 00:05:00.000000000,'f06'
4,,61.61946487426758,4.961372861905423,0.0,False,,,1987-01-01 00:06:40.000000000,'f06'
...,...,...,...,...,...,...,...,...,...
83803,80503528.0,-66.04265594482422,18.260845121092423,0.0,False,,,1987-07-21 23:51:40.000000000,'f06'
83804,4335703552.0,-71.64363861083984,18.200676976056375,13159058432.0,True,,,1987-07-21 23:53:20.000000000,'f06'
83805,3078361856.0,-77.25406646728516,18.088217058900128,372073664.0,False,,,1987-07-21 23:55:00.000000000,'f06'
83806,0.0,-82.84307861328125,17.80994257960121,0.0,False,,,1987-07-21 23:56:40.000000000,'f06'


You may have noticed that the dates appear in a column called "index". This is because the dates were the index of the pandas dataframe. So we will rename this "index" column to something more sensible

In [11]:
# The rename is applied to the existing vaex data frame, so nothing is reassigned
data.rename(name='index', new_name='Date_UTC')
data

#,ELE_TOTAL_ENERGY_FLUX,APEX_LAT,APEX_MLT,Kilcommons_ELE_ENERGY_FLUX,aurora,omni_By,omni_Bz,Date_UTC,sat_id
0,,40.38263702392578,5.544326922848863,0.0,False,,,1987-01-01 00:00:00.000000000,'f06'
1,,45.78801345825195,5.439889926138547,0.0,False,,,1987-01-01 00:01:40.000000000,'f06'
2,,51.17214584350586,5.313399980995317,0.0,False,,,1987-01-01 00:03:20.000000000,'f06'
3,,56.465579986572266,5.157761853469031,0.0,False,,,1987-01-01 00:05:00.000000000,'f06'
4,,61.61946487426758,4.961372861905423,0.0,False,,,1987-01-01 00:06:40.000000000,'f06'
...,...,...,...,...,...,...,...,...,...
83803,80503528.0,-66.04265594482422,18.260845121092423,0.0,False,,,1987-07-21 23:51:40.000000000,'f06'
83804,4335703552.0,-71.64363861083984,18.200676976056375,13159058432.0,True,,,1987-07-21 23:53:20.000000000,'f06'
83805,3078361856.0,-77.25406646728516,18.088217058900128,372073664.0,False,,,1987-07-21 23:55:00.000000000,'f06'
83806,0.0,-82.84307861328125,17.80994257960121,0.0,False,,,1987-07-21 23:56:40.000000000,'f06'


Now we will apply these steps to all the keys in a for loop that loops through the keys in the hdf file

In [12]:
# Start from an empty list so that re-running this cell does not append every
# satellite a second time (which would duplicate rows in the concatenated result)
dfs = []
for key in store.keys():  # Loop through the keys in the HDF store
    # Load the pandas data frame for this key and convert it to vaex format;
    # copy_index=True preserves the pandas index as a column called "index"
    data = vx.from_pandas(store.select(key), copy_index=True)
    # Create a new column holding the satellite name (the key without its leading '/').
    # np.full avoids building a temporary Python list with one string per row and
    # keeps a string dtype even when a key holds no rows
    data['sat_id'] = np.full(len(data), key[1:])
    # Rename the index column (originally the actual index of the pandas data frame)
    # to something sensible
    data.rename('index', 'Date_UTC')
    # Add the vaex data frame to the list
    dfs.append(data)

Finally we will use the concat function, which takes a list of vaex dataframes and combines them into one by stacking them on top of each other

In [15]:
# Combine the vaex data frames collected in dfs into a single data frame
data= vx.concat(dfs)
data

#,ELE_TOTAL_ENERGY_FLUX,APEX_LAT,APEX_MLT,Kilcommons_ELE_ENERGY_FLUX,aurora,omni_By,omni_Bz,Date_UTC,sat_id
0,,40.38263702392578,5.544326922848863,0.0,False,,,1987-01-01 00:00:00.000000000,'f06'
1,,45.78801345825195,5.439889926138547,0.0,False,,,1987-01-01 00:01:40.000000000,'f06'
2,,51.17214584350586,5.313399980995317,0.0,False,,,1987-01-01 00:03:20.000000000,'f06'
3,,56.465579986572266,5.157761853469031,0.0,False,,,1987-01-01 00:05:00.000000000,'f06'
4,,61.61946487426758,4.961372861905423,0.0,False,,,1987-01-01 00:06:40.000000000,'f06'
...,...,...,...,...,...,...,...,...,...
9613723,161507696.0,29.03656578063965,7.6839360889233,100973448.0,False,-1.309999942779541,2.140000104904175,2014-12-31 23:51:40.000000000,'f18'
9613724,343186880.0,24.836381912231445,7.596553715727309,0.0,False,-1.100000023841858,2.0899999141693115,2014-12-31 23:53:20.000000000,'f18'
9613725,321288672.0,21.413782119750977,7.517882096917265,0.0,False,-2.0899999141693115,1.7699999809265137,2014-12-31 23:55:00.000000000,'f18'
9613726,162384752.0,19.240137100219727,7.447675145180358,0.0,False,-2.2300000190734863,1.7400000095367432,2014-12-31 23:56:40.000000000,'f18'


Last but not least, we will export the new dataframe as an HDF5 file, now in the vaex format so it can be read easily

In [14]:
# Export the combined vaex data frame as an HDF5 file
data.export_hdf5(vaex_path)
# Nothing else needs the pandas file after the export, so release its file handle
store.close()