# Preparing ClimART data into graph

---

### Overview
1. Inspect raw data
2. Process raw data
3. Augment and merge data
4. Training, validation, testing data
---

In [1]:
import hyper
import prep_climart

# Shared configuration object from the project-local `hyper` module; it is
# passed to the `prep_climart` helpers in the cells below.
HYPER = hyper.HyperParameter()

### 1. Inspect raw data


#### 1.1 Input and output data

In [2]:
# Pick a single representative year to inspect.
year = '1994'

# Import that year's inputs and both output variants.
inputs, outputs_clear_sky, outputs_pristine = prep_climart.import_h5_data(HYPER, year)

# Report keys and data shapes for each dataset, in the order they were imported.
for _label, _dataset in (
    ('inputs', inputs),
    ('outputs_clear_sky', outputs_clear_sky),
    ('outputs_pristine', outputs_pristine),
):
    prep_climart.visualize_raw_keys_and_shapes(_dataset, _label)

# Drop the loop temporaries so no extra reference to the last raw dataset
# lingers in the notebook namespace.
del _label, _dataset


Keys of inputs data:
['globals', 'layers', 'levels']

globals: 
(352256, 82)

layers: 
(352256, 49, 45)

levels: 
(352256, 50, 4)

Keys of outputs_clear_sky data:
['hrlc', 'hrsc', 'rldc', 'rluc', 'rsdc', 'rsuc']

hrlc: 
(352256, 49)

hrsc: 
(352256, 49)

rldc: 
(352256, 50)

rluc: 
(352256, 50)

rsdc: 
(352256, 50)

rsuc: 
(352256, 50)

Keys of outputs_pristine data:
['hrlc', 'hrsc', 'rldc', 'rluc', 'rsdc', 'rsuc']

hrlc: 
(352256, 49)

hrsc: 
(352256, 49)

rldc: 
(352256, 50)

rluc: 
(352256, 50)

rsdc: 
(352256, 50)

rsuc: 
(352256, 50)


#### 1.2 Metadata in JSON format

In [3]:
# Import the three metadata objects describing the raw arrays.
input_dims, variables, feature_by_var = prep_climart.import_meta_json(HYPER)

# Show each object under its own title line, in a fixed order.
for _title, _meta in (
    ("Input dimensions", input_dims),
    ("Variables", variables),
    ("Feature by variable", feature_by_var),
):
    print(_title)
    display(_meta)

# Keep the notebook namespace free of loop temporaries.
del _title, _meta

Input dimensions


{'pristine': {'globals': 82, 'layers': 14, 'levels': 4},
 'clear_sky': {'globals': 82, 'layers': 45, 'levels': 4}}

Variables


{'cszrow': {'data_type': 'globals',
  'num_features': 1,
  'only_clear_sky': False},
 'shj': {'data_type': 'layers', 'num_features': 1, 'only_clear_sky': False},
 'shtj': {'data_type': 'levels', 'num_features': 1, 'only_clear_sky': False},
 'gtrow': {'data_type': 'globals', 'num_features': 1, 'only_clear_sky': False},
 'tlayer': {'data_type': 'layers', 'num_features': 1, 'only_clear_sky': False},
 'tfrow': {'data_type': 'levels', 'num_features': 1, 'only_clear_sky': False},
 'pressg': {'data_type': 'globals',
  'num_features': 1,
  'only_clear_sky': False},
 'layer_pressure': {'data_type': 'layers',
  'num_features': 1,
  'only_clear_sky': False},
 'level_pressure': {'data_type': 'levels',
  'num_features': 1,
  'only_clear_sky': False},
 'oztop': {'data_type': 'globals', 'num_features': 1, 'only_clear_sky': False},
 'ozphs': {'data_type': 'layers', 'num_features': 1, 'only_clear_sky': False},
 'qc': {'data_type': 'layers', 'num_features': 1, 'only_clear_sky': False},
 'dz': {'data_typ

Feature by variable


{'globals': {'cszrow': {'start': 0, 'end': 1},
  'gtrow': {'start': 1, 'end': 2},
  'pressg': {'start': 2, 'end': 3},
  'oztop': {'start': 3, 'end': 4},
  'emisrow': {'start': 4, 'end': 5},
  'salbrol': {'start': 5, 'end': 9},
  'csalrol': {'start': 9, 'end': 13},
  'emisrot': {'start': 13, 'end': 19},
  'gtrot': {'start': 19, 'end': 25},
  'farerot': {'start': 25, 'end': 31},
  'salbrot': {'start': 31, 'end': 55},
  'csalrot': {'start': 55, 'end': 79},
  'x_cord': {'start': 79, 'end': 80},
  'y_cord': {'start': 80, 'end': 81},
  'z_cord': {'start': 81, 'end': 82}},
 'levels': {'shtj': {'start': 0, 'end': 1},
  'tfrow': {'start': 1, 'end': 2},
  'level_pressure': {'start': 2, 'end': 3},
  'height': {'start': 3, 'end': 4}},
 'layers': {'shj': {'start': 0, 'end': 1},
  'tlayer': {'start': 1, 'end': 2},
  'layer_pressure': {'start': 2, 'end': 3},
  'ozphs': {'start': 3, 'end': 4},
  'qc': {'start': 4, 'end': 5},
  'dz': {'start': 5, 'end': 6},
  'dshj': {'start': 6, 'end': 7},
  'co2rox':

#### 1.3 Statistics of all data

In [4]:
# Import the dataset statistics through the project helper. In the saved
# output the result is a dict of numpy arrays keyed by names such as
# 'outputs_clear_sky_rluc_std' and 'levels_spatial_std'.
# NOTE(review): the saved output shows 9.96921e+36 for every entry of
# 'outputs_pristine_hrlc_spatial_max', which looks like a fill/missing-value
# sentinel rather than a real maximum -- confirm before relying on it.
stats_dict = prep_climart.import_data_stats(HYPER)

# Bare last expression: the notebook renders the whole dict as the cell output.
stats_dict

{'outputs_clear_sky_rluc_std': array(75.33053593),
 'outputs_pristine_hrlc_spatial_max': array([9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36,
        9.96921e+36, 9.96921e+36, 9.96921e+36, 9.96921e+36], dtype=float32),
 'outputs_clear_sky_hrlc_min': array(-0.00064009, dtype=float32),
 'levels_spatial_std': array([[6.66817359e-05, 1.42028666e+01, 4.55548297e-06, 2.38510352e+03],
        [1.65712

### 2. Process raw data

In [5]:
# Turn the raw data into four tables: inputs and outputs, each in a clear-sky
# and a pristine variant. `feature_by_var` is the metadata mapping imported
# earlier (per-variable start/end feature indices).
(
    df_inputs_clear_sky,
    df_inputs_pristine,
    df_outputs_clear_sky,
    df_outputs_pristine
) = prep_climart.process_raw_data(
    feature_by_var,
    inputs,
    outputs_clear_sky,
    outputs_pristine
)

# Free up memory: the raw data is no longer needed once the tables exist.
# `del` unbinds the names instead of rebinding them to the integer 0, so an
# accidental later use (e.g. re-running this cell without re-importing the
# data) fails with a clear NameError rather than passing 0 into the helpers.
del inputs, outputs_clear_sky, outputs_pristine

# Show the resulting tables.
display(df_inputs_clear_sky)
display(df_inputs_pristine)
display(df_outputs_clear_sky)
display(df_outputs_pristine)

Unnamed: 0,cszrow,gtrow,pressg,oztop,emisrow,salbrol_0,salbrol_1,salbrol_2,salbrol_3,csalrol_0,...,l_47_level_pressure,l_47_height,l_48_shtj,l_48_tfrow,l_48_level_pressure,l_48_height,l_49_shtj,l_49_tfrow,l_49_level_pressure,l_49_height
0,0.061690,229.160004,69374.320312,0.000005,1.000000,0.972808,0.755951,0.153404,0.017170,0.972808,...,68246.718750,111.184998,0.992667,231.787292,68865.601562,49.935001,1.0,229.160004,69374.320312,0.0
1,0.062951,229.410004,69203.640625,0.000005,1.000000,0.972791,0.757484,0.136108,0.014977,0.972791,...,68081.406250,110.780998,0.992684,231.482498,68697.343750,49.750999,1.0,229.410004,69203.640625,0.0
2,0.064145,228.910004,69041.210938,0.000005,1.000000,0.972846,0.756655,0.152407,0.010590,0.972846,...,67924.070312,110.391998,0.992700,231.179092,68537.226562,49.573002,1.0,228.910004,69041.210938,0.0
3,0.065270,229.160004,68886.593750,0.000005,1.000000,0.972872,0.755150,0.131031,0.010182,0.972872,...,67774.312500,110.018997,0.992716,230.889206,68384.820312,49.403000,1.0,229.160004,68886.593750,0.0
4,0.066322,229.029999,68739.132812,0.000005,1.000000,0.972831,0.755400,0.130505,0.012342,0.972831,...,67631.476562,109.660004,0.992731,230.608994,68239.453125,49.238998,1.0,229.035004,68739.132812,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,0.365036,269.279999,102287.968750,0.000004,0.969825,0.951713,0.758219,0.143208,0.010097,0.957311,...,100127.328125,167.057999,0.990445,267.366486,101310.609375,75.252998,1.0,268.157013,102287.968750,0.0
352252,0.366692,269.279999,102310.210938,0.000004,0.969780,0.949400,0.758195,0.145490,0.010420,0.954994,...,100148.875000,167.089996,0.990444,267.390411,101332.523438,75.264999,1.0,268.172607,102310.210938,0.0
352253,0.368364,269.529999,102333.921875,0.000004,0.969782,0.949823,0.757028,0.143886,0.010280,0.955361,...,100171.843750,167.122009,0.990443,267.408600,101355.906250,75.277000,1.0,268.182800,102333.921875,0.0
352254,0.370048,269.559998,102358.906250,0.000004,0.969800,0.951324,0.756152,0.141692,0.010038,0.956763,...,100196.039062,167.149994,0.990442,267.417389,101380.523438,75.287003,1.0,268.187195,102358.906250,0.0


Unnamed: 0,cszrow,gtrow,pressg,oztop,emisrow,salbrol_0,salbrol_1,salbrol_2,salbrol_3,csalrol_0,...,l_47_level_pressure,l_47_height,l_48_shtj,l_48_tfrow,l_48_level_pressure,l_48_height,l_49_shtj,l_49_tfrow,l_49_level_pressure,l_49_height
0,0.061690,229.160004,69374.320312,0.000005,1.000000,0.972808,0.755951,0.153404,0.017170,0.972808,...,68246.718750,111.184998,0.992667,231.787292,68865.601562,49.935001,1.0,229.160004,69374.320312,0.0
1,0.062951,229.410004,69203.640625,0.000005,1.000000,0.972791,0.757484,0.136108,0.014977,0.972791,...,68081.406250,110.780998,0.992684,231.482498,68697.343750,49.750999,1.0,229.410004,69203.640625,0.0
2,0.064145,228.910004,69041.210938,0.000005,1.000000,0.972846,0.756655,0.152407,0.010590,0.972846,...,67924.070312,110.391998,0.992700,231.179092,68537.226562,49.573002,1.0,228.910004,69041.210938,0.0
3,0.065270,229.160004,68886.593750,0.000005,1.000000,0.972872,0.755150,0.131031,0.010182,0.972872,...,67774.312500,110.018997,0.992716,230.889206,68384.820312,49.403000,1.0,229.160004,68886.593750,0.0
4,0.066322,229.029999,68739.132812,0.000005,1.000000,0.972831,0.755400,0.130505,0.012342,0.972831,...,67631.476562,109.660004,0.992731,230.608994,68239.453125,49.238998,1.0,229.035004,68739.132812,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,0.365036,269.279999,102287.968750,0.000004,0.969825,0.951713,0.758219,0.143208,0.010097,0.957311,...,100127.328125,167.057999,0.990445,267.366486,101310.609375,75.252998,1.0,268.157013,102287.968750,0.0
352252,0.366692,269.279999,102310.210938,0.000004,0.969780,0.949400,0.758195,0.145490,0.010420,0.954994,...,100148.875000,167.089996,0.990444,267.390411,101332.523438,75.264999,1.0,268.172607,102310.210938,0.0
352253,0.368364,269.529999,102333.921875,0.000004,0.969782,0.949823,0.757028,0.143886,0.010280,0.955361,...,100171.843750,167.122009,0.990443,267.408600,101355.906250,75.277000,1.0,268.182800,102333.921875,0.0
352254,0.370048,269.559998,102358.906250,0.000004,0.969800,0.951324,0.756152,0.141692,0.010038,0.956763,...,100196.039062,167.149994,0.990442,267.417389,101380.523438,75.287003,1.0,268.187195,102358.906250,0.0


Unnamed: 0,hrlc_0,hrlc_1,hrlc_2,hrlc_3,hrlc_4,hrlc_5,hrlc_6,hrlc_7,hrlc_8,hrlc_9,...,rsuc_40,rsuc_41,rsuc_42,rsuc_43,rsuc_44,rsuc_45,rsuc_46,rsuc_47,rsuc_48,rsuc_49
0,-0.000109,-0.000067,-0.000056,-0.000047,-0.000040,-0.000035,-0.000030,-0.000026,-0.000024,-0.000022,...,39.190990,38.858440,38.579479,38.350433,38.210892,38.080860,37.956921,37.856308,37.774498,37.708302
1,-0.000110,-0.000067,-0.000056,-0.000047,-0.000040,-0.000035,-0.000030,-0.000026,-0.000024,-0.000022,...,39.993397,39.640774,39.357277,39.106567,38.965370,38.834827,38.713020,38.613045,38.532307,38.466969
2,-0.000110,-0.000067,-0.000056,-0.000047,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,40.990307,40.626022,40.323696,40.062160,39.920147,39.789955,39.670635,39.571918,39.492737,39.428665
3,-0.000110,-0.000067,-0.000057,-0.000047,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,41.573193,41.211086,40.899818,40.637363,40.488441,40.357834,40.239902,40.141876,40.063774,40.000561
4,-0.000110,-0.000067,-0.000057,-0.000048,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,42.356976,41.991856,41.680443,41.417767,41.247337,41.116821,41.000614,40.903587,40.826813,40.764668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,-0.000261,-0.000204,-0.000164,-0.000115,-0.000085,-0.000069,-0.000058,-0.000048,-0.000041,-0.000036,...,292.438568,292.359192,292.303345,292.267639,292.250336,292.258759,292.283417,292.197540,292.153381,292.123474
352252,-0.000261,-0.000204,-0.000164,-0.000115,-0.000085,-0.000069,-0.000058,-0.000048,-0.000041,-0.000035,...,293.571533,293.492310,293.436554,293.400879,293.383362,293.391846,293.417877,293.330048,293.284729,293.254211
352253,-0.000261,-0.000204,-0.000164,-0.000114,-0.000085,-0.000070,-0.000058,-0.000048,-0.000041,-0.000035,...,294.843079,294.764435,294.709198,294.673950,294.656219,294.664886,294.692230,294.602356,294.556030,294.524628
352254,-0.000262,-0.000204,-0.000164,-0.000114,-0.000085,-0.000070,-0.000058,-0.000048,-0.000041,-0.000035,...,296.312622,296.235199,296.181030,296.146515,296.129089,296.138519,296.167450,296.075958,296.028748,295.996765


Unnamed: 0,hrlc_0,hrlc_1,hrlc_2,hrlc_3,hrlc_4,hrlc_5,hrlc_6,hrlc_7,hrlc_8,hrlc_9,...,rsuc_40,rsuc_41,rsuc_42,rsuc_43,rsuc_44,rsuc_45,rsuc_46,rsuc_47,rsuc_48,rsuc_49
0,-0.000109,-0.000067,-0.000056,-0.000047,-0.000040,-0.000035,-0.000030,-0.000026,-0.000024,-0.000022,...,40.289005,40.207047,40.139862,40.080505,40.021446,39.962864,39.904751,39.846943,39.799892,39.761494
1,-0.000110,-0.000067,-0.000056,-0.000047,-0.000040,-0.000035,-0.000030,-0.000026,-0.000024,-0.000022,...,41.137012,41.053501,40.984982,40.924435,40.864193,40.804440,40.745155,40.686218,40.638260,40.599136
2,-0.000110,-0.000067,-0.000056,-0.000047,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,42.229313,42.144619,42.075153,42.013832,41.952675,41.891987,41.831760,41.771877,41.723183,41.683437
3,-0.000110,-0.000067,-0.000057,-0.000047,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,42.856510,42.770298,42.699722,42.637291,42.575043,42.513271,42.451962,42.391052,42.341541,42.301128
4,-0.000110,-0.000067,-0.000057,-0.000048,-0.000041,-0.000035,-0.000029,-0.000026,-0.000024,-0.000022,...,43.696312,43.608932,43.537365,43.474052,43.411045,43.348385,43.286201,43.224422,43.174221,43.133259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,-0.000261,-0.000204,-0.000164,-0.000115,-0.000085,-0.000069,-0.000058,-0.000048,-0.000041,-0.000036,...,298.401978,298.354889,298.328430,298.320404,298.334595,298.363159,298.418091,298.502808,298.601990,298.689728
352252,-0.000261,-0.000204,-0.000164,-0.000115,-0.000085,-0.000069,-0.000058,-0.000048,-0.000041,-0.000035,...,299.572876,299.525269,299.498383,299.490204,299.503815,299.533234,299.589294,299.676605,299.777893,299.867096
352253,-0.000261,-0.000204,-0.000164,-0.000114,-0.000085,-0.000070,-0.000058,-0.000048,-0.000041,-0.000035,...,300.894104,300.846283,300.819336,300.811157,300.824097,300.854370,300.911316,301.000946,301.103821,301.194275
352254,-0.000262,-0.000204,-0.000163,-0.000114,-0.000085,-0.000070,-0.000058,-0.000048,-0.000041,-0.000035,...,302.417572,302.369995,302.343414,302.335632,302.348145,302.379486,302.437531,302.529755,302.634552,302.726135


### 3. Augment and merge data

In [6]:
# Combine the per-variant input and output tables into one table per variant.
# In the saved output the merged tables carry additional leading columns
# (lat, lon, year, hour_of_year) ahead of the input and output columns.
# NOTE(review): in the saved output the first rows have lat == 0.0 and lon
# stepping by ~57.2958 (= 180/pi) per row -- verify the coordinate conversion
# inside `augment_and_merge`.
df_clear_sky, df_pristine = prep_climart.augment_and_merge(
    year,
    df_inputs_clear_sky,
    df_inputs_pristine,
    df_outputs_clear_sky,
    df_outputs_pristine,
)

# One variadic display call renders both tables, clear-sky first.
display(df_clear_sky, df_pristine)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,lat,lon,year,hour_of_year,cszrow,gtrow,pressg,oztop,emisrow,salbrol_0,...,rsuc_40,rsuc_41,rsuc_42,rsuc_43,rsuc_44,rsuc_45,rsuc_46,rsuc_47,rsuc_48,rsuc_49
0,0.000000,0.000000,1994,0,0.061690,229.160004,69374.320312,0.000005,1.000000,0.972808,...,39.190990,38.858440,38.579479,38.350433,38.210892,38.080860,37.956921,37.856308,37.774498,37.708302
1,0.000000,57.295776,1994,0,0.062951,229.410004,69203.640625,0.000005,1.000000,0.972791,...,39.993397,39.640774,39.357277,39.106567,38.965370,38.834827,38.713020,38.613045,38.532307,38.466969
2,0.000000,114.591553,1994,0,0.064145,228.910004,69041.210938,0.000005,1.000000,0.972846,...,40.990307,40.626022,40.323696,40.062160,39.920147,39.789955,39.670635,39.571918,39.492737,39.428665
3,0.000000,171.887344,1994,0,0.065270,229.160004,68886.593750,0.000005,1.000000,0.972872,...,41.573193,41.211086,40.899818,40.637363,40.488441,40.357834,40.239902,40.141876,40.063774,40.000561
4,0.000000,130.816864,1994,0,0.066322,229.029999,68739.132812,0.000005,1.000000,0.972831,...,42.356976,41.991856,41.680443,41.417767,41.247337,41.116821,41.000614,40.903587,40.826813,40.764668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,9.634109,152.619110,1994,8610,0.365036,269.279999,102287.968750,0.000004,0.969825,0.951713,...,292.438568,292.359192,292.303345,292.267639,292.250336,292.258759,292.283417,292.197540,292.153381,292.123474
352252,9.634109,95.323334,1994,8610,0.366692,269.279999,102310.210938,0.000004,0.969780,0.949400,...,293.571533,293.492310,293.436554,293.400879,293.383362,293.391846,293.417877,293.330048,293.284729,293.254211
352253,9.634109,38.027557,1994,8610,0.368364,269.529999,102333.921875,0.000004,0.969782,0.949823,...,294.843079,294.764435,294.709198,294.673950,294.656219,294.664886,294.692230,294.602356,294.556030,294.524628
352254,9.634109,19.268215,1994,8610,0.370048,269.559998,102358.906250,0.000004,0.969800,0.951324,...,296.312622,296.235199,296.181030,296.146515,296.129089,296.138519,296.167450,296.075958,296.028748,295.996765


Unnamed: 0,lat,lon,year,hour_of_year,cszrow,gtrow,pressg,oztop,emisrow,salbrol_0,...,rsuc_40,rsuc_41,rsuc_42,rsuc_43,rsuc_44,rsuc_45,rsuc_46,rsuc_47,rsuc_48,rsuc_49
0,0.000000,0.000000,1994,0,0.061690,229.160004,69374.320312,0.000005,1.000000,0.972808,...,40.289005,40.207047,40.139862,40.080505,40.021446,39.962864,39.904751,39.846943,39.799892,39.761494
1,0.000000,57.295776,1994,0,0.062951,229.410004,69203.640625,0.000005,1.000000,0.972791,...,41.137012,41.053501,40.984982,40.924435,40.864193,40.804440,40.745155,40.686218,40.638260,40.599136
2,0.000000,114.591553,1994,0,0.064145,228.910004,69041.210938,0.000005,1.000000,0.972846,...,42.229313,42.144619,42.075153,42.013832,41.952675,41.891987,41.831760,41.771877,41.723183,41.683437
3,0.000000,171.887344,1994,0,0.065270,229.160004,68886.593750,0.000005,1.000000,0.972872,...,42.856510,42.770298,42.699722,42.637291,42.575043,42.513271,42.451962,42.391052,42.341541,42.301128
4,0.000000,130.816864,1994,0,0.066322,229.029999,68739.132812,0.000005,1.000000,0.972831,...,43.696312,43.608932,43.537365,43.474052,43.411045,43.348385,43.286201,43.224422,43.174221,43.133259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352251,9.634109,152.619110,1994,8610,0.365036,269.279999,102287.968750,0.000004,0.969825,0.951713,...,298.401978,298.354889,298.328430,298.320404,298.334595,298.363159,298.418091,298.502808,298.601990,298.689728
352252,9.634109,95.323334,1994,8610,0.366692,269.279999,102310.210938,0.000004,0.969780,0.949400,...,299.572876,299.525269,299.498383,299.490204,299.503815,299.533234,299.589294,299.676605,299.777893,299.867096
352253,9.634109,38.027557,1994,8610,0.368364,269.529999,102333.921875,0.000004,0.969782,0.949823,...,300.894104,300.846283,300.819336,300.811157,300.824097,300.854370,300.911316,301.000946,301.103821,301.194275
352254,9.634109,19.268215,1994,8610,0.370048,269.559998,102358.906250,0.000004,0.969800,0.951324,...,302.417572,302.369995,302.343414,302.335632,302.348145,302.379486,302.437531,302.529755,302.634552,302.726135


### 4. Training, validation, testing data

In [2]:
# Run the project's train/validation/test split helper with the shared config.
# NOTE(review): the saved output of this cell ends in
# "OSError: Unable to open file (truncated file: ...)", i.e. one of the files
# it opens is shorter than its stored end-of-file marker -- the file probably
# needs to be re-downloaded or regenerated before this cell can succeed.
# NOTE(review): the recorded execution count for this cell is [2], out of
# sequence with the cells above, so it was not run as part of the same
# top-to-bottom pass -- confirm the notebook survives Restart & Run All.
prep_climart.train_val_test_split(HYPER)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


OSError: Unable to open file (truncated file: eof = 3342516224, sblock->base_addr = 0, stored_eof = 3504244736)