In [1]:
from kerchunk.grib2 import parse_grib_idx, build_idx_grib_mapping, map_from_index
import pandas as pd

## Trying out the workflow for **GEFS** data on AWS
---

In [2]:
# Preview the index dataframe that parse_grib_idx returns for the f006 GEFS file.
idxdf = parse_grib_idx(
    "s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af006",
    storage_options={"anon": True},
)
idxdf.head(10)

Unnamed: 0_level_0,offset,date,attrs,length,idx_uri,grib_uri
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,d=2017010106,HGT:10 mb:6 hour fcst:ENS=low-res ctl,47493,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
2,47493,d=2017010106,TMP:10 mb:6 hour fcst:ENS=low-res ctl,19438,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
3,66931,d=2017010106,RH:10 mb:6 hour fcst:ENS=low-res ctl,10835,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
4,77766,d=2017010106,UGRD:10 mb:6 hour fcst:ENS=low-res ctl,22625,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
5,100391,d=2017010106,VGRD:10 mb:6 hour fcst:ENS=low-res ctl,20488,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
6,120879,d=2017010106,HGT:50 mb:6 hour fcst:ENS=low-res ctl,55619,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
7,176498,d=2017010106,TMP:50 mb:6 hour fcst:ENS=low-res ctl,22692,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
8,199190,d=2017010106,RH:50 mb:6 hour fcst:ENS=low-res ctl,22586,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
9,221776,d=2017010106,UGRD:50 mb:6 hour fcst:ENS=low-res ctl,47282,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
10,269058,d=2017010106,VGRD:50 mb:6 hour fcst:ENS=low-res ctl,45653,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...


In [3]:
# Full "attrs" string of the first index row (f006 file).
idxdf.loc[1, "attrs"]

'HGT:10 mb:6 hour fcst:ENS=low-res ctl'

In [4]:
# Build the idx/grib mapping from the f036 file; it is reused by a later cell.
mapping = build_idx_grib_mapping(
    "s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af036",
    storage_options={"anon": True},
    remote_options={"anon": True},
)
mapping.head(10)

The grib hierarchy in s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af036 is not unique for 54 variables: ['gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'u', 'v', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 't', 'r', 'u', 'v', 'gh']


Unnamed: 0_level_0,offset_idx,date,attrs,length_idx,idx_uri,grib_uri,varname,typeOfLevel,stepType,name,level,step,time,valid_time,uri,offset_grib,length_grib,inline_value
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,d=2017010106,HGT:10 mb:36 hour fcst:ENS=low-res ctl,48453,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,gh,isobaricInhPa,instant,Geopotential height,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,0,48453,
2,48453,d=2017010106,TMP:10 mb:36 hour fcst:ENS=low-res ctl,20771,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,t,isobaricInhPa,instant,Temperature,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,48453,20771,
3,69224,d=2017010106,RH:10 mb:36 hour fcst:ENS=low-res ctl,10194,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,r,isobaricInhPa,instant,Relative humidity,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,69224,10194,
4,79418,d=2017010106,UGRD:10 mb:36 hour fcst:ENS=low-res ctl,23858,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,u,isobaricInhPa,instant,U component of wind,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,79418,23858,
5,103276,d=2017010106,VGRD:10 mb:36 hour fcst:ENS=low-res ctl,21523,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,v,isobaricInhPa,instant,V component of wind,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,103276,21523,
6,124799,d=2017010106,HGT:50 mb:36 hour fcst:ENS=low-res ctl,56791,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,gh,isobaricInhPa,instant,Geopotential height,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,124799,56791,
7,181590,d=2017010106,TMP:50 mb:36 hour fcst:ENS=low-res ctl,24273,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,t,isobaricInhPa,instant,Temperature,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,181590,24273,
8,205863,d=2017010106,RH:50 mb:36 hour fcst:ENS=low-res ctl,22976,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,r,isobaricInhPa,instant,Relative humidity,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,205863,22976,
9,228839,d=2017010106,UGRD:50 mb:36 hour fcst:ENS=low-res ctl,49049,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,u,isobaricInhPa,instant,U component of wind,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,228839,49049,
10,277888,d=2017010106,VGRD:50 mb:36 hour fcst:ENS=low-res ctl,46940,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,v,isobaricInhPa,instant,V component of wind,0.0,1 days 12:00:00,2017-01-01 06:00:00,2017-01-02 18:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,277888,46940,


In [5]:
# Full "attrs" string of the first mapping row (f036 file).
mapping.loc[1, "attrs"]

'HGT:10 mb:36 hour fcst:ENS=low-res ctl'

In [6]:
# This step is repeated for every grib/idx pair, reusing the "mapping" dataframe
# created previously. Each frame keeps only the first row per distinct "attrs" value.
mapped_index = map_from_index(
    pd.Timestamp("2017-01-01T06"),
    mapping.drop_duplicates(subset="attrs", keep="first"),  # file : gec00.t06z.pgrb2af036
    idxdf.drop_duplicates(subset="attrs", keep="first"),    # file : gec00.t06z.pgrb2af006
)
mapped_index

Unnamed: 0,varname,typeOfLevel,stepType,name,step,level,time,valid_time,uri,offset,length,inline_value


In [7]:
# The result above is empty: for GEFS data on AWS the "attrs" values in "mapping" (f036 file,
# "36 hour fcst") and in "idxdf" (f006 file, "6 hour fcst") are not the same, so no rows are mapped.

## Mapping GEFS on AWS with a mapping built from the same forecast-hour file

In [8]:
# Parse the index of the f006 file under a separate name for this section.
idxdf2 = parse_grib_idx(
    "s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af006",
    storage_options={"anon": True},
)

In [9]:
# Build the idx/grib mapping from the f006 file, i.e. the same file parsed into idxdf2.
mapping2 = build_idx_grib_mapping(
    "s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af006",
    storage_options={"anon": True},
    remote_options={"anon": True},
)

The grib hierarchy in s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z.pgrb2af006 is not unique for 54 variables: ['gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'u', 'v', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 'gh', 't', 'r', 'u', 'v', 't', 'r', 'u', 'v', 'gh']


In [12]:
# Map the f006 index through the mapping built from the same f006 file.
# Each frame is de-duplicated on its OWN "attrs" column (first occurrence kept), so the
# filter does not depend on the `mapping` / `idxdf` frames from the previous section
# (a different file, f036, in the case of `mapping`) or on their index alignment.
mapped_index = map_from_index(
    pd.Timestamp("2017-01-01T06"),
    mapping2.drop_duplicates(subset="attrs", keep="first"),  # file : gec00.t06z.pgrb2af006
    idxdf2.drop_duplicates(subset="attrs", keep="first"),    # file : gec00.t06z.pgrb2af006
)

In [13]:
# Display the dataframe returned by the preceding map_from_index call.
mapped_index

Unnamed: 0,varname,typeOfLevel,stepType,name,step,level,time,valid_time,uri,offset,length,inline_value
0,gh,isobaricInhPa,instant,Geopotential height,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,0,47493,
1,t,isobaricInhPa,instant,Temperature,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,47493,19438,
2,r,isobaricInhPa,instant,Relative humidity,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,66931,10835,
3,u,isobaricInhPa,instant,U component of wind,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,77766,22625,
4,v,isobaricInhPa,instant,V component of wind,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,100391,20488,
...,...,...,...,...,...,...,...,...,...,...,...,...
78,ulwrf,surface,avg,Upward long-wave radiation flux,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,3885258,39087,
79,ulwrf,nominalTop,avg,Upward long-wave radiation flux,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,3924345,43221,
80,cape,pressureFromGroundLayer,instant,Convective available potential energy,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,3967566,42488,
81,cin,pressureFromGroundLayer,instant,Convective inhibition,0 days 06:00:00,0.0,2017-01-01 06:00:00,2017-01-01 12:00:00,s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...,4010054,43027,
