In [1]:
import time
import pandas as pd
import vaex #can seem a little slow to import\n
# progressbar is an optional dependency: `bar` records whether it is available
# so later cells can skip the progress display when it is not installed.
try:
    import progressbar
except ImportError:
    # Only a missing/unimportable package disables the bar; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate instead of being hidden.
    bar = False
else:
    bar = True
import numpy as np

In [2]:
# Open the pandas-format hdf store in read-only mode.
store = pd.HDFStore(path='pandas.hdf5', mode='r')

INFO:MainThread:numexpr.utils:NumExpr defaulting to 8 threads.


In [3]:
# Assuming the store is too big to load in one go, read it in chunks and
# write each chunk out as its own hdf5 file in vaex format.
if bar:
    prog_bar = progressbar.ProgressBar()
for i, tmp_df in enumerate(store.select(key='main', chunksize=5_000_000)):
    if bar:
        prog_bar.update(i)
    # Convert the pandas data frame into a vaex data frame.
    # copy_index is important if you're using time in your index.
    dfvx = vaex.from_pandas(tmp_df, copy_index=True)
    # Zero-pad the chunk number so the files sort in chunk order by name:
    # with unpadded names 'batch10.hdf5' sorts before 'batch2.hdf5', which
    # would scramble the row order when the files are later opened together
    # through a glob pattern (matches are combined in sorted filename order).
    dfvx.export_hdf5('batch' + str(i).zfill(4) + '.hdf5')
if bar:
    prog_bar.finish()

| |  #                                                | 1 Elapsed Time: 0:00:40


In [4]:
# Load all the per-chunk hdf files that have been created and treat them as one store.
# The pattern requires a digit directly after "batch" so that the combined export
# 'batches_combined.hdf5' (which the looser 'batch*.hdf5' would also match) is not
# loaded on top of the chunks, duplicating every row, when the notebook is re-run.
vxhdf = vaex.open('batch[0-9]*.hdf5')

In [5]:
# Optionally, the combined store can be written back out as a single hdf5 file.
combined_path = 'batches_combined.hdf5'
vxhdf.export_hdf5(combined_path)

In [6]:
# Reading a column out of a vaex store is straightforward: `.values` on the
# `index` column gives the underlying array, which is printed here.
index_values = vxhdf.index.values
print(index_values)

['1999-01-01T01:01:00.000000000' '1999-01-01T01:01:00.000000000'
 '1999-01-01T01:01:00.000000000' ... '1999-04-01T00:00:00.000000000'
 '1999-04-01T00:00:00.000000000' '1999-04-01T00:00:00.000000000']


In [7]:
# Row selection appears much quicker here than through pandas.
# The comparison value is a numpy datetime64: pandas time stamps do not work
# because of the default data type of the date column.
date = np.datetime64('1999-01-01T01:05:00')
rows_at_date = vxhdf.index == date
vxhdf[rows_at_date]

#,glon,glat,Bphi,Btheta,Br,Declination,Site,mlat,mlt,f1_0,f1_1,f2_0,f2_1,dst_Index,index
0,20.42,67.84,-0.76164532187404,1.479829856323824,2.3,5.50105325068072,KIR,64.48617553710938,2.8305669148763037,1.022489309310913,0.14385730028152466,-0.30845537781715393,0.8980472683906555,-4.0,1999-01-01 01:05:00.000000000
1,18.82,54.61,-0.47071630742210235,-1.3778338644143207,1.4,2.916568243327928,HLP,50.47977828979492,2.3218922932942725,1.1003310680389404,0.04359893500804901,-0.19340701401233673,0.9350515604019165,-4.0,1999-01-01 01:05:00.000000000
2,12.68,52.07,-0.10782239093870846,-0.3979627269132881,1.9,1.1232996617467088,NGK,47.811790466308594,1.9250356038411454,1.1347672939300537,0.05207890644669533,-0.1856136918067932,0.9190542101860046,-4.0,1999-01-01 01:05:00.000000000
3,15.83,78.2,-3.7718401774507746,-3.314094397534281,-21.3,2.889196997800079,LYR,75.02986145019531,3.437208557128905,0.9328954815864563,0.3402070701122284,-0.514703094959259,0.8120315074920654,-4.0,1999-01-01 01:05:00.000000000
4,23.53,68.02,1.7967067456994747,1.6036972500936713,2.2,6.615126484780932,MUO,64.50875854492188,3.0032216389973954,1.0163499116897583,0.13149338960647583,-0.3018020689487457,0.9099292159080505,-4.0,1999-01-01 01:05:00.000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,19.2,74.5,1.2432700183566503,-1.632262130129672,2.0,5.729075574348678,BJN,71.24890899658203,3.1954416910807275,0.9774999618530273,0.23927836120128632,-0.4114638566970825,0.8609490990638733,-4.0,1999-01-01 01:05:00.000000000
62,17.83,59.35,-3.216810702990014,-2.1335718645334847,0.9,3.01842961989791,LOV,55.68308639526367,2.3793034871419287,1.0766257047653198,0.08103016763925552,-0.23433928191661835,0.9169272184371948,-4.0,1999-01-01 01:05:00.000000000
63,18.82,68.35,0.748261853632438,1.7435894581003146,3.4,4.7916537507809345,ABK,65.09888458251953,2.7718577067057275,1.0219682455062866,0.15699729323387146,-0.31860607862472534,0.8896398544311523,-4.0,1999-01-01 01:05:00.000000000
64,9.07,53.75,-0.9972442077190006,-2.1013100652142946,2.0,-0.07516476614205599,WNG,49.866294860839844,1.7569829305013016,1.1358120441436768,0.08060669898986816,-0.20573362708091736,0.9010711312294006,-4.0,1999-01-01 01:05:00.000000000


In [8]:
# Select everything after `date`, up to and including one day later.
date2 = date + np.timedelta64(1, 'D')
# Both conditions can be applied at once as long as each comparison is wrapped
# in parentheses: in Python `&` binds more tightly than `>` / `<=`, so the
# unparenthesised form fails (most likely the error seen when the two
# conditions were first combined in a single expression).
index = (vxhdf.index > date) & (vxhdf.index <= date2)
vxhdf[index]

#,glon,glat,Bphi,Btheta,Br,Declination,Site,mlat,mlt,f1_0,f1_1,f2_0,f2_1,dst_Index,index
0,20.42,67.84,-0.9378760059564326,0.6931007123436028,2.1,5.50105325068072,KIR,64.48617553710938,2.8478047688802093,1.022489309310913,0.14385730028152466,-0.30845537781715393,0.8980472683906555,-4.0,1999-01-01 01:06:00.000000000
1,18.82,54.61,-0.37084583887734335,-1.3829220382173968,1.4,2.916568243327928,HLP,50.47977828979492,2.339130147298178,1.1003310680389404,0.04359893500804901,-0.19340701401233673,0.9350515604019165,-4.0,1999-01-01 01:06:00.000000000
2,12.68,52.07,-0.7077070845593652,-0.3862003139103896,1.9,1.1232996617467088,NGK,47.811790466308594,1.942273457845051,1.1347672939300537,0.05207890644669533,-0.1856136918067932,0.9190542101860046,-4.0,1999-01-01 01:06:00.000000000
3,15.83,78.2,-5.639182267232357,-2.6190882682590395,-21.7,2.889196997800079,LYR,75.02986145019531,3.454446411132814,0.9328954815864563,0.3402070701122284,-0.514703094959259,0.8120315074920654,-4.0,1999-01-01 01:06:00.000000000
4,23.53,68.02,1.7045472226499325,0.809023340674669,1.9,6.615126484780932,MUO,64.50875854492188,3.020459493001301,1.0163499116897583,0.13149338960647583,-0.3018020689487457,0.9099292159080505,-4.0,1999-01-01 01:06:00.000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97315,19.2,74.5,30.825704963011244,-63.39381605120001,9.2,5.729075574348678,BJN,71.24897766113281,3.188336690266926,0.9775029420852661,0.2392718493938446,-0.41145622730255127,0.8609535694122314,-2.0,1999-01-02 01:05:00.000000000
97316,17.83,59.35,-0.45718058821405677,0.8252187041992256,-9.9,3.01842961989791,LOV,55.68312072753906,2.3722106933593743,1.0766266584396362,0.081025131046772,-0.23433440923690796,0.9169308543205261,-2.0,1999-01-02 01:05:00.000000000
97317,18.82,68.35,11.53627601954569,-14.414032593304867,-42.4,4.7916537507809345,ABK,65.09893035888672,2.764759318033853,1.0219700336456299,0.1569916158914566,-0.3185998797416687,0.8896437287330627,-2.0,1999-01-02 01:05:00.000000000
97318,9.07,53.75,3.699865628904793,0.10485384136472393,-4.5,-0.07516476614205599,WNG,49.866294860839844,1.7498911539713546,1.1358132362365723,0.08060088753700256,-0.20572905242443085,0.9010748267173767,-2.0,1999-01-02 01:05:00.000000000


In [9]:
# Other functionality: register a selection on the data frame (mlat > 60),
# then compute the same statistic with and without that selection applied.
vxhdf.select(vxhdf.mlat > 60)
mean_br_all = vxhdf.mean(vxhdf.Br, selection=False)
print('average Br of all magnetometers:', mean_br_all)
mean_br_selected = vxhdf.mean(vxhdf.Br, selection=True)
print('average Br of magnetometers above 60 mlat:', mean_br_selected)

average Br of all magnetometers: 2.4802884188803938
average Br of magnetometers above 60 mlat: 2.2848380147040874


- vaex can convert to and from pandas data frames
- vaex can convert to and from dask
- can also read parquet files
- there are some plotting functionalities and a range of functions that incorporate the selection tool
- seems much quicker than pandas and shares most functionality
- also if you can't perform the operation you want you can convert easily to a format that can
- I asked questions on converting from a pandas hdf store to a vaex hdf store in an efficient way and the creators replied very quickly
- seems much quicker than pandas and shares most functionality
- also if you can't perform the operation you want you can convert easily to a format that can
- I asked questions on converting from a pandas hdf store to a vaex hdf store, and one of the creators answered within a couple of hours