# Handling retrieved data

This tutorial covers basic tasks for handling the datasets stored in the output files produced when acquiring meteorological products from the GFS model.

### 1. Import of packages

In [1]:
import h5py, numpy as np
import pandas as pd

### 2. Loading of h5 File

Useful documentation for managing h5 files with Python can be found [here](https://docs.h5py.org/en/stable/quick.html#).

In [2]:
# Path of the HDF5 file to inspect, relative to the notebook's working directory.
fname = '../data/2020-01-01_2021-01-01_0p25_MDE.h5'
# Open the file read-only. The handle `f` is passed to the reader functions in
# later cells and is not closed in the cells shown here.
f = h5py.File(fname,'r')

### 3. Reading file into a Python Dictionary

Useful documentation for working with Python dictionaries can be found [here](https://docs.python.org/3/tutorial/datastructures.html#dictionaries).

In [3]:
def read2dict(data,f):
    """Recursively copy an h5py-like hierarchy into nested dictionaries.

    Parameters
    ----------
    data : dict
        Dictionary to fill. It is modified in place; keys that already exist
        are extended rather than replaced.
    f : mapping
        Object exposing ``keys()`` and item access (for example an open
        ``h5py.File`` or group). Any child that itself has a ``keys``
        attribute is treated as a group, anything else as a dataset with an
        ``attrs`` mapping that supports ``[:]``.

    Returns
    -------
    dict
        The same ``data`` object. Every level that contains at least one
        dataset gets a ``'units'`` sub-dictionary mapping dataset names to
        their ``'units'`` attribute.

    Raises
    ------
    KeyError
        If a dataset other than latitude/longitude has no ``'units'`` attribute.
    """
    skipped_names = ('latitude', 'longitude')
    for name in f.keys():
        node = f[name]
        if hasattr(node, 'keys'):
            # Group: descend into it, reusing an existing sub-dictionary if present.
            read2dict(data.setdefault(name, {}), node)
            continue
        # Dataset: the units table is created even when the dataset is skipped.
        units = data.setdefault('units', {})
        if name in skipped_names:
            # NOTE(review): latitude/longitude values are not stored at all, so a
            # group holding only these two ends up as {'units': {}} - confirm
            # that this is intended.
            continue
        units[name] = node.attrs['units']
        values = node[:]
        if name in data:
            # Already populated (e.g. by a previous call): append, do not overwrite.
            values = np.append(data[name], values)
        data[name] = values
    return data

In [4]:
# Load every group and dataset of the open file into a fresh nested dictionary.
data  = read2dict({},f)

This is how information can be accessed in the data dictionary:

In [5]:
# Top-level keys of the dictionary (one per group of the file).
print('Forecast runtimes: ', list(data), '\n')
# One line per key stored for the 00 UTC run.
print('Solar variables from GFS Model: ', *data['00UTC'], sep='\n')
# Values of a single variable, addressed as data[run][variable].
print('\nCloud water: ', data['00UTC']['Cloud_water_entire_atmosphere_single_layer'], '\n')
# Units are kept next to the variables under the 'units' key.
print('Units of solar variables: ', data['00UTC']['units'])

Forecast runtimes:  ['00UTC', '06UTC', '12UTC', '18UTC', 'coords'] 

Solar variables from GFS Model: 
units
Cloud_water_entire_atmosphere_single_layer
Downward_Long-Wave_Radp_Flux_surface_
Downward_Short-Wave_Radiation_Flux_surface_
Planetary_Boundary_Layer_Height_surface
Precipitable_water_entire_atmosphere_single_layer
Precipitation_rate_surface
Precipitation_rate_surface_
Temperature_surface
Total_cloud_cover_boundary_layer_cloud_
Total_cloud_cover_convective_cloud
Total_cloud_cover_entire_atmosphere_
timestamp

Cloud water:  [0.86 0.86 1.8  ... 0.02 0.02 0.02] 

Units of solar variables:  {'Cloud_water_entire_atmosphere_single_layer': 'kg.m-2', 'Downward_Long-Wave_Radp_Flux_surface_': 'W.m-2', 'Downward_Short-Wave_Radiation_Flux_surface_': 'W.m-2', 'Planetary_Boundary_Layer_Height_surface': 'm', 'Precipitable_water_entire_atmosphere_single_layer': 'kg.m-2', 'Precipitation_rate_surface': 'kg.m-2.s-1', 'Precipitation_rate_surface_': 'kg.m-2.s-1', 'Temperature_surface': 'K', 'Total_cl

### 4. Reading file into a Pandas Dataframe
In this case, it is better to select a forecast runtime of interest for building the dataframe and operate with it.

For more information on working with pandas DataFrames, see the documentation [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html).

In [6]:
def read2df(runtime,df,f):
    """Load one forecast-runtime group into a pandas DataFrame.

    Parameters
    ----------
    runtime : str
        Key of the group to read from ``f`` (for example ``'00UTC'``).
    df : pandas.DataFrame
        Frame to fill, normally an empty ``pd.DataFrame()``. It is modified
        in place and its ``attrs`` are replaced.
    f : mapping
        Open ``h5py.File`` (or compatible object) such that ``f[runtime]``
        exposes ``keys()`` and datasets supporting ``[:]`` and ``attrs``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with one column per dataset other than
        ``'timestamp'``. If a ``'timestamp'`` dataset exists it becomes the
        index, cast with ``astype('datetime64[s]')``.
        ``df.attrs`` holds ``'forecast_runtime'`` and ``'units'``, a
        dictionary mapping each column to its dataset's ``'units'`` attribute.

    Raises
    ------
    KeyError
        If ``runtime`` is missing from ``f`` or a column dataset has no
        ``'units'`` attribute.
    """
    # Units are recorded on the frame itself: attrs assigned to the Series
    # returned by df[prm] are attached to that returned object, not stored
    # with the frame, so they cannot be relied on afterwards.
    df.attrs = {'forecast_runtime': runtime, 'units': {}}
    group = f[runtime]
    timestamps = None
    for prm in group.keys():
        values = group[prm][:]
        if prm == 'timestamp':
            timestamps = values.astype('datetime64[s]')
        else:
            df[prm] = values
            df.attrs['units'][prm] = group[prm].attrs['units']
    # The index is assigned only after all columns exist. Assigning it inside
    # the loop fails with a length mismatch on an empty frame whenever
    # 'timestamp' is visited before the first column.
    if timestamps is not None:
        df.index = timestamps
    return df

In [7]:
# Build the DataFrame for the '00UTC' group, starting from an empty frame.
df = read2df('00UTC',pd.DataFrame(),f)

In [8]:
# Display the frame built in the previous cell.
df

Unnamed: 0,Cloud_water_entire_atmosphere_single_layer,Downward_Long-Wave_Radp_Flux_surface_,Downward_Short-Wave_Radiation_Flux_surface_,Planetary_Boundary_Layer_Height_surface,Precipitable_water_entire_atmosphere_single_layer,Precipitation_rate_surface,Precipitation_rate_surface_,Temperature_surface,Total_cloud_cover_boundary_layer_cloud_,Total_cloud_cover_convective_cloud,Total_cloud_cover_entire_atmosphere_
2020-01-01 03:00:00,0.86,376.000000,0.0,85.954681,31.700001,0.0002,0.000220,288.120850,43.0,48.0,100.0
2020-01-01 06:00:00,0.86,374.000000,0.0,67.562698,31.100000,0.0005,0.000283,287.633972,62.0,45.0,100.0
2020-01-01 09:00:00,1.80,377.000000,0.0,73.985466,31.900000,0.0014,0.000880,287.500000,92.0,31.0,100.0
2020-01-01 12:00:00,1.36,378.342773,0.0,91.371887,31.299999,0.0005,0.000933,287.495667,94.0,14.0,100.0
2020-01-01 15:00:00,0.36,377.000000,20.0,121.246613,29.799999,0.0001,0.000270,287.865417,91.0,28.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 12:00:00,0.01,330.532104,10.0,66.889061,24.100000,0.0000,0.000030,285.530792,46.0,0.0,88.0
2020-12-31 15:00:00,0.02,316.200714,470.0,828.324158,24.100000,0.0000,0.000000,295.615387,34.0,7.0,100.0
2020-12-31 18:00:00,0.02,346.723877,530.0,1083.211060,24.799999,0.0001,0.000050,294.976959,43.0,27.0,100.0
2020-12-31 21:00:00,0.02,369.915863,500.0,706.621643,25.700001,0.0004,0.000230,292.785889,0.0,64.0,100.0


### 5. Rename of variables

The default variable names can be replaced with names that are more convenient to work with, such as abbreviations or simply shorter names. One way to do this is shown below.

First a dictionary with the new naming has to be defined:

In [9]:
# Short names for the variables, keyed by the original dataset name.
new_naming = {
    'Cloud_water_entire_atmosphere_single_layer': 'total_water_clouds',
    'Downward_Long-Wave_Radp_Flux_surface_': 'ghi_lw_avg',
    'Downward_Short-Wave_Radiation_Flux_surface_': 'ghi_sw_avg',
    'Planetary_Boundary_Layer_Height_surface': 'PBL_height',
    'Precipitable_water_entire_atmosphere_single_layer': 'total_precip_water',
    'Precipitation_rate_surface': 'total_precip_rate',
    'Precipitation_rate_surface_': 'precip_rate_avg',
    'Temperature_surface': 'temp_air',
    'Total_cloud_cover_boundary_layer_cloud_': 'boundary_clouds_avg',
    'Total_cloud_cover_convective_cloud': 'convect_clouds',
    'Total_cloud_cover_entire_atmosphere_': 'total_clouds_avg',
}

In [10]:
def rename(obj,new_naming):
    """Apply a name mapping to DataFrame columns or to nested dictionary keys.

    Parameters
    ----------
    obj : pandas.DataFrame or dict
        A DataFrame gets its columns renamed and a new frame is returned.
        A dictionary is modified in place: every value that has a ``keys``
        attribute is replaced by a copy with renamed keys, recursively. The
        keys of ``obj`` itself are left as they are.
    new_naming : dict
        Mapping from old name to new name. Names without an entry are kept.

    Returns
    -------
    pandas.DataFrame or dict
        The renamed frame, the same (mutated) dictionary, or ``obj``
        unchanged for any other type.
    """
    if isinstance(obj, pd.DataFrame):
        return obj.rename(columns=new_naming)
    if isinstance(obj, dict):
        for name in obj:
            child = obj[name]
            if not hasattr(child, 'keys'):
                continue
            # Rebuild the child with translated keys, keeping the original order.
            renamed = {}
            for old_key, value in child.items():
                renamed[new_naming.get(old_key, old_key)] = value
            obj[name] = renamed
            # Deeper levels (such as a nested units table) get the same treatment.
            rename(renamed, new_naming)
    return obj

In [11]:
# For a DataFrame, rename returns a new frame; df keeps its original column names.
df_ = rename(df,new_naming)
df_

Unnamed: 0,total_water_clouds,ghi_lw_avg,ghi_sw_avg,PBL_height,total_precip_water,total_precip_rate,precip_rate_avg,temp_air,boundary_clouds_avg,convect_clouds,total_clouds_avg
2020-01-01 03:00:00,0.86,376.000000,0.0,85.954681,31.700001,0.0002,0.000220,288.120850,43.0,48.0,100.0
2020-01-01 06:00:00,0.86,374.000000,0.0,67.562698,31.100000,0.0005,0.000283,287.633972,62.0,45.0,100.0
2020-01-01 09:00:00,1.80,377.000000,0.0,73.985466,31.900000,0.0014,0.000880,287.500000,92.0,31.0,100.0
2020-01-01 12:00:00,1.36,378.342773,0.0,91.371887,31.299999,0.0005,0.000933,287.495667,94.0,14.0,100.0
2020-01-01 15:00:00,0.36,377.000000,20.0,121.246613,29.799999,0.0001,0.000270,287.865417,91.0,28.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 12:00:00,0.01,330.532104,10.0,66.889061,24.100000,0.0000,0.000030,285.530792,46.0,0.0,88.0
2020-12-31 15:00:00,0.02,316.200714,470.0,828.324158,24.100000,0.0000,0.000000,295.615387,34.0,7.0,100.0
2020-12-31 18:00:00,0.02,346.723877,530.0,1083.211060,24.799999,0.0001,0.000050,294.976959,43.0,27.0,100.0
2020-12-31 21:00:00,0.02,369.915863,500.0,706.621643,25.700001,0.0004,0.000230,292.785889,0.0,64.0,100.0


In [12]:
# For a dictionary, rename replaces the nested dictionaries in place and returns
# the same object, so data_ and data refer to the same (renamed) dictionary.
data_ = rename(data,new_naming)

In [13]:
# Top-level keys are not affected by the renaming.
print('Forecast runtimes: ', list(data), '\n')
# One line per key stored for the 00 UTC run, now under the short names.
print('Solar variables from GFS Model: ', *data['00UTC'], sep='\n')
# The same variable as before, addressed through its new name.
print('\nCloud water: ', data['00UTC']['total_water_clouds'], '\n')
# The keys of the units table follow the renaming as well.
print('Units of solar variables: ', data['00UTC']['units'])

Forecast runtimes:  ['00UTC', '06UTC', '12UTC', '18UTC', 'coords'] 

Solar variables from GFS Model: 
units
total_water_clouds
ghi_lw_avg
ghi_sw_avg
PBL_height
total_precip_water
total_precip_rate
precip_rate_avg
temp_air
boundary_clouds_avg
convect_clouds
total_clouds_avg
timestamp

Cloud water:  [0.86 0.86 1.8  ... 0.02 0.02 0.02] 

Units of solar variables:  {'total_water_clouds': 'kg.m-2', 'ghi_lw_avg': 'W.m-2', 'ghi_sw_avg': 'W.m-2', 'PBL_height': 'm', 'total_precip_water': 'kg.m-2', 'total_precip_rate': 'kg.m-2.s-1', 'precip_rate_avg': 'kg.m-2.s-1', 'temp_air': 'K', 'boundary_clouds_avg': '%', 'convect_clouds': '%', 'total_clouds_avg': '%', 'timestamp': 'EPOCH'}


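### 6. Correction of 6-hour averages to 3-hour averages

For the averaged variables (names containing `avg` or ending in `_`), the values at timestamps whose hour is a multiple of 6 are converted using the preceding value in the series: `corrected = 2 * value - previous_value`.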


In [18]:
def _is_averaged(name):
    """Return True when a variable name marks an averaged product."""
    # Matches names containing 'avg' as well as names ending in '_'.
    return isinstance(name, str) and ('avg' in name or name.endswith('_'))


def _hour_of_day(stamps):
    """Return the hour of day (0-23) for an array of timestamps."""
    if stamps.dtype.kind == 'M':
        stamps = stamps.astype('datetime64[s]')
    # Numeric stamps are taken as epoch seconds, the same interpretation
    # read2df applies when it casts them with astype('datetime64[s]').
    return (stamps.astype('int64') // 3600) % 24


def _to_three_hour_average(values, six_hourly):
    """Replace the flagged entries of ``values`` by ``2 * value - previous``."""
    values = np.asarray(values)
    # Work in floating point so a missing predecessor can be marked as NaN.
    work = values.astype(np.result_type(values.dtype, np.float32))
    previous = np.full_like(work, np.nan)
    previous[1:] = work[:-1]
    return np.where(six_hourly, 2 * work - previous, work)


def avg_correction(obj):
    """Convert 6-hour averages into 3-hour averages, in place.

    Entries whose timestamp hour is a multiple of 6 are replaced by
    ``2 * value - previous_value``. Only variables whose name contains
    ``'avg'`` or ends with ``'_'`` are touched. A flagged entry without a
    predecessor (the first element) becomes NaN.

    Parameters
    ----------
    obj : pandas.DataFrame or dict
        * DataFrame: must have a datetime index (``obj.index.hour`` is used).
          Matching columns are overwritten.
        * dict: every level holding a numeric or datetime ``'timestamp'``
          array has its matching one-dimensional arrays of the same length
          overwritten. Nested dictionaries and DataFrames are processed
          recursively; the ``'units'`` sub-dictionary is skipped.

    Returns
    -------
    pandas.DataFrame or dict
        The same ``obj`` that was passed in (any other type is returned
        untouched). The operation is not idempotent: applying it twice
        corrects the values twice.

    Raises
    ------
    AttributeError
        If a DataFrame is passed whose index has no ``hour`` attribute.
    """
    if isinstance(obj, pd.DataFrame):
        six_hourly = np.asarray(obj.index.hour % 6 == 0)
        for prm in obj.columns:
            if _is_averaged(prm):
                # Operate on the frame that was passed in, not on a global one.
                obj[prm] = _to_three_hour_average(obj[prm].to_numpy(), six_hourly)
    elif isinstance(obj, dict):
        stamps = obj.get('timestamp')
        if isinstance(stamps, np.ndarray) and stamps.dtype.kind in 'iufM':
            six_hourly = _hour_of_day(stamps) % 6 == 0
            for key, value in list(obj.items()):
                if (_is_averaged(key) and isinstance(value, np.ndarray)
                        and value.shape == six_hourly.shape):
                    obj[key] = _to_three_hour_average(value, six_hourly)
        for key, value in obj.items():
            # 'units' holds strings keyed by variable name (including a
            # 'timestamp' entry), so it must not be treated as a data group.
            if key != 'units' and isinstance(value, (dict, pd.DataFrame)):
                avg_correction(value)
    return obj