# Hands on HDF5
This brief lab session gives an overview of the basic usage of the HDF5 file format in Python applications.

Work through the examples and try to change some of the settings...

### Sources:
* HDF5 documentation https://portal.hdfgroup.org/display/HDF5/HDF5
* h5py API: http://docs.h5py.org/en/stable/



In [1]:
import sys
# Detect Google Colab by checking whether its package is among the loaded modules.
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
# Base directory of the course material; the pandas section builds the CSV path from it.
path='..'
if IN_COLAB:
  # In Colab we need to clone the data from the repo (`!` is the IPython shell escape)
  # and point `path` at the fresh checkout.
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: False


## Creating a Data Set

In [2]:
import numpy as np
import h5py #this is the HDF5 lib 

In [3]:
# Two random float matrices of different shapes, used as example payloads below.
matrix1, matrix2 = (
    np.random.random(size=shape) for shape in ((1000, 1000), (10000, 100))
)

In [4]:
# Store both matrices in one HDF5 file, each as its own named dataset.
# Note the write mode 'w'.
with h5py.File('hdf5_data.h5', mode='w') as hdf:
    for name, array in (('dataset1', matrix1), ('dataset2', matrix2)):
        hdf.create_dataset(name, data=array)

## Reading 

In [5]:
# Open the file read-only, list its top-level entries and load one dataset
# into memory as a NumPy array.
with h5py.File('hdf5_data.h5','r') as hdf:
    ls = list(hdf.keys())
    print('List of datasets in this file: \n', ls)
    data = hdf.get('dataset2') #here data is still some hdf5 object
    # NOTE(review): the entry read above is 'dataset2', yet the variable and the
    # printed label below say "dataset1" -- confirm which dataset was intended.
    dataset1 = np.array(data) #need to convert it into numpy
    print('Shape of dataset1: \n', dataset1.shape)

List of datasets in this file: 
 ['dataset1', 'dataset2']
Shape of dataset1: 
 (10000, 100)


In [6]:
# Display the NumPy array that was loaded from the file in the previous cell.
dataset1

array([[0.91971145, 0.36230974, 0.31624066, ..., 0.00283999, 0.08925693,
        0.57769745],
       [0.23826873, 0.33277627, 0.08914027, ..., 0.6613984 , 0.44588789,
        0.72270579],
       [0.08313031, 0.05226765, 0.13767883, ..., 0.47499835, 0.70893577,
        0.39471245],
       ...,
       [0.97627921, 0.77523194, 0.44976618, ..., 0.91766727, 0.29534689,
        0.66616802],
       [0.82574779, 0.45119996, 0.86465028, ..., 0.96875913, 0.42592237,
        0.3807802 ],
       [0.98316821, 0.83198641, 0.06704393, ..., 0.73154225, 0.27841907,
        0.81031698]])

In [7]:
# List the top-level entries of the file. The context manager closes the
# file even if reading the keys raises, which a manual close() would skip.
with h5py.File('hdf5_data.h5', 'r') as f:
    ls = list(f.keys())

In [8]:
# Display the list of dataset names collected in the previous cell.
ls

['dataset1', 'dataset2']

## Array Slicing
HDF5 supports fancy array slicing, so we do not need to read all of the data just to get a slice: http://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing

In [9]:
# Slice the dataset while the file is open, then close the file again:
# the context manager releases the handle instead of leaving it open.
with h5py.File('hdf5_data.h5', 'r') as f:
    rows = f['dataset1'][100:120, :]  # this notation mostly follows numpy notation -> try different slices!
# Last expression of the cell, so the notebook still displays the slice.
rows

array([[0.92142997, 0.57541022, 0.3885701 , ..., 0.998872  , 0.00903802,
        0.63366094],
       [0.23449488, 0.28482728, 0.97169593, ..., 0.14189487, 0.65587375,
        0.85942152],
       [0.55885495, 0.42931871, 0.05031121, ..., 0.92223714, 0.89190211,
        0.15570669],
       ...,
       [0.33034277, 0.23511998, 0.69677524, ..., 0.666284  , 0.91152655,
        0.12095762],
       [0.21625754, 0.5701883 , 0.35523836, ..., 0.53997096, 0.57559244,
        0.68611711],
       [0.79302093, 0.86823233, 0.81046259, ..., 0.0422652 , 0.07660074,
        0.41612614]])

## Creating Groups
We can organize data in groups, just as a file system organizes files (here: datasets) into folders (here: groups).

In [10]:
# Four independent 1000x1000 random matrices, one per dataset written below.
matrix1, matrix2, matrix3, matrix4 = (
    np.random.random(size=(1000, 1000)) for _ in range(4)
)

In [11]:
# Describe the hierarchy as data: group path -> {dataset name: array}.
# Group1 holds two datasets; Group2 holds two subgroups with one dataset each.
layout = {
    'Group1': {'dataset1': matrix1, 'dataset4': matrix4},
    'Group2/SubGroup1': {'dataset3': matrix3},
    'Group2/SubGroup2': {'dataset2': matrix2},
}

with h5py.File('hdf5_groups.h5', mode='w') as hdf:
    for group_path, members in layout.items():
        group = hdf.create_group(group_path)
        for name, array in members.items():
            group.create_dataset(name, data=array)

## Reading Groups

In [12]:
# Walk the hierarchy top-down and print what each level contains.
with h5py.File('hdf5_groups.h5','r') as hdf:
    base_items = list(hdf.items())  # (name, object) pairs at the file root
    print('Items in the base directory:', base_items)
    G2 = hdf.get('Group2')
    G2_items = list(G2.items())
    print('Items in Group2:', G2_items)
    # The path passed here starts with '/', i.e. it is written as an absolute
    # path even though the lookup is made on G2.
    G21 = G2.get('/Group2/SubGroup1')
    G21_items = list(G21.items())
    print('Items in Group21:', G21_items)
    # Copy the dataset into a NumPy array while the file is still open.
    dataset3 = np.array(G21.get('dataset3'))
    print(dataset3.shape)


Items in the base directory: [('Group1', <HDF5 group "/Group1" (2 members)>), ('Group2', <HDF5 group "/Group2" (2 members)>)]
Items in Group2: [('SubGroup1', <HDF5 group "/Group2/SubGroup1" (1 members)>), ('SubGroup2', <HDF5 group "/Group2/SubGroup2" (1 members)>)]
Items in Group21: [('dataset3', <HDF5 dataset "dataset3": shape (1000, 1000), type "<f8">)]
(1000, 1000)


### What is happening? Interpret the results.

## Compress Data
HDF5 also supports native data compression:

In [13]:
# Fresh 1000x1000 random matrices for the compression example.
matrix1, matrix2, matrix3, matrix4 = [
    np.random.random(size=(1000, 1000)) for _ in range(4)
]

In [14]:
# Same group layout as the uncompressed example, but every dataset is written
# with the gzip filter at level 9. The options are shared via one dict.
gzip_opts = {'compression': "gzip", 'compression_opts': 9}

with h5py.File('hdf5_groups_compressed.h5', mode='w') as hdf:
    G1 = hdf.create_group('Group1')
    G1.create_dataset('dataset1', data=matrix1, **gzip_opts)
    G1.create_dataset('dataset4', data=matrix4, **gzip_opts)

    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3, **gzip_opts)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2, **gzip_opts)

## Attributes
We can add meta information in the form of attributes on files, groups and datasets:

In [None]:
# Example payloads for the attribute demo: a square and a tall random matrix.
matrix1 = np.random.random((1000, 1000))
matrix2 = np.random.random((10000, 100))

In [None]:
# Create the HDF5 file. The context manager closes it even if one of the
# writes below raises, which a trailing hdf.close() would not.
with h5py.File('test.h5', 'w') as hdf:
    # Create the datasets
    dataset1 = hdf.create_dataset('dataset1', data=matrix1)
    dataset2 = hdf.create_dataset('dataset2', data=matrix2)

    # Set attributes: key/value metadata attached to dataset1
    dataset1.attrs['CLASS'] = 'DATA MATRIX'
    dataset1.attrs['VERSION'] = '1.1'

In [None]:
# Read the HDF5 file. The context manager closes it even if a lookup below
# raises, which a trailing hdf.close() would not.
with h5py.File('test.h5', 'r') as hdf:
    ls = list(hdf.keys())
    print('List of datasets in this file: \n', ls)
    data = hdf.get('dataset1')
    dataset1 = np.array(data)
    print('Shape of dataset1: \n', dataset1.shape)
    # Read the attributes: names and values come back in matching order
    k = list(data.attrs.keys())
    v = list(data.attrs.values())
    print(k[0])
    print(v[0])
    # Attributes can also be looked up by name, like dictionary entries
    print(data.attrs[k[0]])

## HDF5 and ***Pandas***

In [None]:
import pandas as pd
# Open the store in append mode ('a', the default): creates the HDF5 file
# or opens an existing one.
hdf = pd.HDFStore('hdf5_pandas.h5', mode='a')

In [None]:
# Load the sample CSV from the course data directory below `path`.
csv_file = path + '/DATA/FL_insurance_sample.csv'
df1 = pd.read_csv(csv_file)
# Put the dataset in the storage under the key 'DF1'.
hdf.put('DF1', df1, format='table', data_columns=True)

In [None]:
# Small hand-written table: one row per entry with a city, a rank label
# and two integer scores.
data = dict(
    city=["Tripoli", "Sydney", "Tripoli", "Rome", "Rome", "Tripoli", "Rome", "Sydney", "Sydney"],
    rank=["1st", "2nd", "1st", "2nd", "1st", "2nd", "1st", "2nd", "1st"],
    score1=[44, 48, 39, 41, 38, 44, 34, 54, 61],
    score2=[67, 63, 55, 70, 64, 77, 45, 66, 72],
)

# Column order follows the insertion order of the dict keys above.
df2 = pd.DataFrame(data, columns=list(data))

In [None]:
# Display the DataFrame built in the previous cell.
df2

In [None]:
# Store the second DataFrame under its own key, with the same
# format='table' / data_columns=True options used for 'DF1'.
hdf.put('DF2Key', df2,format='table', data_columns=True)

In [None]:
# Done writing; the store is reopened read-only in the next section.
hdf.close() # close the hdf5 file

### Now read the data back

In [None]:
# Reopen the HDF5 store written above; mode='r' opens it for reading only.
hdf = pd.HDFStore('hdf5_pandas.h5',mode='r')

In [None]:
# List the keys under which objects are stored in the file.
hdf.keys()

In [None]:
# Load the object stored under the key '/DF1' back from the store.
df1 = hdf.get('/DF1')

In [None]:
# Check the type of the object that was read back from the store.
type(df1)

In [None]:
# Show the first rows of the data that was read back.
df1.head()

In [None]:
# Close the store again now that reading is finished.
hdf.close()