# Offline diagnostics for the CASTLE single output networks following Rasp et al. (2018) architecture

# Profile Plots

In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# TF_CPP_MIN_LOG_LEVEL values:
# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed
# Set here, before TensorFlow is imported in a later cell.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 

In [3]:
# Put the parent of the current working directory on the import path so the
# project packages imported in later cells can be resolved.
module_path = os.path.abspath('..')
already_listed = any(entry == module_path for entry in sys.path)
if not already_listed:
    sys.path.append(module_path)

In [4]:
import tensorflow as tf

# Report how many GPU devices TensorFlow lists as physically available.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  4


In [5]:
from utils.setup import SetupDiagnostics
from neural_networks.load_models import load_models, get_save_plot_folder
from neural_networks.model_diagnostics import ModelDiagnostics


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [6]:
from pathlib import Path

## Load trained CASTLE models

In [7]:
# Command-line style arguments handed to SetupDiagnostics in the next cell:
# "-c" followed by the YAML configuration file of the training run.
argv  = ["-c", "../output_castle/training_6_normal/cfg_castle_NN_Creation.yml"]
# Base directory under which the profile plots of this notebook are saved.
plot_dir = Path("../output_castle/training_6_normal/plots_offline_evaluation/plot_profiles/")

In [8]:
# Build the diagnostics setup object from the argument list defined above.
castle_setup = SetupDiagnostics(argv)

In [9]:
# Load the trained models for this setup; the output below prints one
# "Load model: <path>" line per model file.
castle_models = load_models(castle_setup)


Load model: /work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/output_castle/training_6_normal/models_castle/castleNN/r1.0-a1.0-b1.0-l1.0/hl_256_256_256_256_256_256_256_256_256-act_ReLU-e_18/1_0_model.h5

Load model: /work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/output_castle/training_6_normal/models_castle/castleNN/r1.0-a1.0-b1.0-l1.0/hl_256_256_256_256_256_256_256_256_256-act_ReLU-e_18/1_1_model.h5

Load model: /work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/output_castle/training_6_normal/models_castle/castleNN/r1.0-a1.0-b1.0-l1.0/hl_256_256_256_256_256_256_256_256_256-act_ReLU-e_18/1_2_model.h5

Load model: /work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/output_castle/training_6_normal/models_castle/castleNN/r1.0-a1.0-b1.0-l1.0/hl_256_256_256_256_256_256_256_256_256-act_ReLU-e_18/1_3_model.h5

Load model: /work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/output_cast

In [10]:
# Number of top-level entries in castle_models.
len(castle_models)

1

In [11]:
# Number of entries stored under the 'castleNN' key.
len(castle_models['castleNN'])

65

In [12]:
# Note: the keys are variable objects, not plain strings, even though their
# printed form below looks like a string. Later cells therefore compare
# against str(key).
castle_models['castleNN'].keys()

dict_keys(['tphystnd-3.64', 'tphystnd-7.59', 'tphystnd-14.36', 'tphystnd-24.61', 'tphystnd-38.27', 'tphystnd-54.6', 'tphystnd-72.01', 'tphystnd-87.82', 'tphystnd-103.32', 'tphystnd-121.55', 'tphystnd-142.99', 'tphystnd-168.23', 'tphystnd-197.91', 'tphystnd-232.83', 'tphystnd-273.91', 'tphystnd-322.24', 'tphystnd-379.1', 'tphystnd-445.99', 'tphystnd-524.69', 'tphystnd-609.78', 'tphystnd-691.39', 'tphystnd-763.4', 'tphystnd-820.86', 'tphystnd-859.53', 'tphystnd-887.02', 'tphystnd-912.64', 'tphystnd-936.2', 'tphystnd-957.49', 'tphystnd-976.33', 'tphystnd-992.56', 'phq-3.64', 'phq-7.59', 'phq-14.36', 'phq-24.61', 'phq-38.27', 'phq-54.6', 'phq-72.01', 'phq-87.82', 'phq-103.32', 'phq-121.55', 'phq-142.99', 'phq-168.23', 'phq-197.91', 'phq-232.83', 'phq-273.91', 'phq-322.24', 'phq-379.1', 'phq-445.99', 'phq-524.69', 'phq-609.78', 'phq-691.39', 'phq-763.4', 'phq-820.86', 'phq-859.53', 'phq-887.02', 'phq-912.64', 'phq-936.2', 'phq-957.49', 'phq-976.33', 'phq-992.56', 'fsnt', 'fsns', 'flnt', 'fl

## Vertical cross-section plots

In [13]:
# This variable does not exist in the project code (but the key nn_type holds
# the same value), so it is attached to the setup object by hand here.
# NOTE(review): presumably ModelDiagnostics reads setup.model_type - confirm.
castle_model_type = "castleNN"
castle_setup.model_type = castle_model_type

In [14]:
# Diagnostics object for the models stored under the selected model type.
castle_md = ModelDiagnostics(setup=castle_setup, 
                             models=castle_models[castle_model_type]) 

In [15]:
# Display the object to confirm that it was created.
castle_md

<neural_networks.model_diagnostics.ModelDiagnostics at 0x7ff9403e8f70>

In [16]:
# Keys of the 'castleNN' models, used below to select the keys of a variable.
dict_keys = castle_models['castleNN'].keys()

### Single Variable

In [17]:
from utils.variable import Variable_Lev_Metadata

In [18]:
# Parse a variable name to obtain the variable object used for the single
# variable plots (the plot output below reports it as "tphystnd").
var = Variable_Lev_Metadata.parse_var_name("tphystnd-0")
# Keep every model key whose string form contains var.var.value.
var_keys = [k for k in dict_keys if var.var.value in str(k)]
# Unit string passed to the plotting call for this variable.
unit = "K/s"
# phq unit "kg/(kg*s)"

In [19]:
# Double-profile plot of `var`: itime="range" with nTime=5, stats="mse",
# save=False and show_plot=True.
i_time, n_time, stats = "range", 5, "mse"
fig = castle_md.plot_double_profile(
    var,
    var_keys,
    itime=i_time,
    nTime=n_time,
    lats=[-90, 90],
    lons=[0., 359.],
    save=False,
    stats=stats,
    show_plot=True,
    unit=unit,
)
fig.show()


Plotting double_profiles for variable tphystnd

Validation batch size = 8192.
Time samples: 5


2023-07-16 16:19:10.658962: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.00MiB (rounded to 8388608)requested by op castleNN/input_sub_layer_92/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-07-16 16:19:10.669283: W tensorflow/tsl/framework/bfc_allocator.cc:497] ****************************************************************************************************
2023-07-16 16:19:10.670223: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at matmul_op_impl.h:730 : RESOURCE_EXHAUSTED: OOM when allocating tensor with shape[8192,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
2023-07-16 16:19:20.680184: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to 

ResourceExhaustedError: Graph execution error:

Detected at node 'castleNN/input_sub_layer_92/MatMul' defined at (most recent call last):
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_1495777/3908021388.py", line 4, in <module>
      fig = castle_md. plot_double_profile(var, var_keys, itime=i_time, nTime=n_time,
    File "/work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/neural_networks/model_diagnostics.py", line 623, in plot_double_profile
      t, p = self.get_truth_pred(itime, varkeys[0], nTime=nTime)
    File "/work/bd1179/b309247/pycharm_projects/iglesias-suarez2yxx_spuriouslinks/neural_networks/model_diagnostics.py", line 137, in get_truth_pred
      p_tmp = model.predict_on_batch(X_tmp[:, inputs])
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 2603, in predict_on_batch
      outputs = self.predict_function(iterator)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 2169, in predict_function
      return step_function(self, iterator)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 2155, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 2143, in run_step
      outputs = model.predict_step(data)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 2111, in predict_step
      return self(x, training=False)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/work/bd1179/b309247/miniconda3/envs/tensorflow_env/lib/python3.9/site-packages/keras/layers/core/dense.py", line 241, in call
      outputs = tf.matmul(a=inputs, b=self.kernel)
Node: 'castleNN/input_sub_layer_92/MatMul'
OOM when allocating tensor with shape[8192,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node castleNN/input_sub_layer_92/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_predict_function_966703]

In [None]:
# Same double-profile plot as the previous cell, but with stats="r2".
i_time, n_time, stats = "range", 5, "r2"
fig = castle_md.plot_double_profile(
    var,
    var_keys,
    itime=i_time,
    nTime=n_time,
    lats=[-90, 90],
    lons=[0., 359.],
    save=False,
    stats=stats,
    show_plot=True,
    unit=unit,
)
fig.show()

In [None]:
# Save (rather than show) the r2 double-profile plot of `var` into plot_dir.
i_time = "range"
n_time = 5
stats = "r2"
# Make sure the target directory exists before saving, in the same way
# run_plot_profiles() creates its save directory before plotting.
plot_dir.mkdir(parents=True, exist_ok=True)
_ = castle_md.plot_double_profile(var, var_keys, itime=i_time, nTime=n_time,
                                  lats=[-90, 90], lons=[0., 359.], save=plot_dir,
                                  stats=stats, show_plot=False, unit=unit)

### All 3D variables

In [None]:
def get_save_str(idx_time, num_time=False, latitudes=False, 
                 longitudes=False, statistics=False):
    """Build a name that encodes one set of plotting options.

    The result is used as a sub-directory name for saved plots.

    Args:
        idx_time: Either an int (a single time step) or a str label such as
            "range".
        num_time: Number of time steps; only used when ``idx_time`` is a str.
            A falsy value yields the suffix "all".
        latitudes: Two-element sequence (first, second bound), or a falsy
            value to leave latitudes out of the name.
        longitudes: Two-element sequence (first, second bound), or a falsy
            value to leave longitudes out of the name.
        statistics: Name of the statistic, or a falsy value to leave it out.

    Returns:
        The concatenated string, e.g.
        ``"range-5_lats_-90_90_lons_0.0_359.0_stats-mse"``.

    Raises:
        ValueError: If ``idx_time`` is neither an int nor a str.
    """
    if type(idx_time) is int:
        idx_time_str = f"step-{idx_time}"
    elif type(idx_time) is str:
        time_suffix = num_time if num_time else "all"
        idx_time_str = f"{idx_time}-{time_suffix}"
    else:
        raise ValueError(f"Unknown value for idx_time: {idx_time}")

    lats_str = f"_lats_{latitudes[0]}_{latitudes[1]}" if latitudes else ""
    # The longitude part used to be labelled "_lats_" with a stray trailing
    # "n"; it now carries its own "_lons_" label.
    lons_str = f"_lons_{longitudes[0]}_{longitudes[1]}" if longitudes else ""
    stats_str = f"_stats-{statistics}" if statistics else ""

    return idx_time_str + lats_str + lons_str + stats_str
        

In [None]:
# Only the 3D variables are listed here, each paired with its unit string:
#   tphystnd -> "K/s"
#   phq      -> "kg/(kg*s)"
var_unit_str_three_d = [
    ("tphystnd-3.64", "K/s"),
    ("phq-3.64", "kg/(kg*s)"),
]
# Turn each variable name into its parsed variable object, keeping the unit.
three_d_keys = [
    (Variable_Lev_Metadata.parse_var_name(name), unit_str)
    for name, unit_str in var_unit_str_three_d
]

dict_keys = castle_models['castleNN'].keys()

In [None]:
# Not function parameters, uses variables that are set in Notebook cells!!
def run_plot_profiles():
    """Save a double-profile plot for every entry of ``three_d_keys``.

    Takes no arguments. It reads these notebook-level names at call time:
    ``i_time``, ``n_time``, ``lats``, ``lons``, ``stats``, ``plot_dir``,
    ``three_d_keys``, ``dict_keys`` and ``castle_md``. Set them in a cell
    before calling.

    The plots go into a sub-directory of ``plot_dir`` whose name comes from
    ``get_save_str``; the directory is created if it does not exist.
    """
    run_label = get_save_str(i_time, num_time=n_time, latitudes=lats,
                             longitudes=lons, statistics=stats)
    save_dir = Path(plot_dir, run_label)
    save_dir.mkdir(parents=True, exist_ok=True)

    for variable, variable_unit in three_d_keys:
        print(variable)
        # Keys whose string form contains this variable's name value.
        matching_keys = [key for key in dict_keys
                         if variable.var.value in str(key)]

        castle_md.plot_double_profile(
            variable,
            matching_keys,
            itime=i_time,
            nTime=n_time,
            lats=lats,
            lons=lons,
            save=save_dir,
            stats=stats,
            show_plot=False,
            unit=variable_unit,
        )

#### Time range, 5 steps, mse

In [None]:
# Settings read as notebook globals by run_plot_profiles():
# time mode "range" with 5 steps, lats -90..90, lons 0..359, stats "mse".
i_time = "range"
n_time = 5
lats=[-90, 90]
lons=[0., 359.]
stats = "mse"

run_plot_profiles()

#### Time range, 5 steps, r2

In [None]:
# Settings read as notebook globals by run_plot_profiles():
# time mode "range" with 5 steps, lats -90..90, lons 0..359, stats "r2".
i_time = "range"
n_time = 5
lats=[-90, 90]
lons=[0., 359.]
stats = "r2"

run_plot_profiles()

#### Time range, 1440 steps, mse

In [None]:
# Time step CAM=30min
# Time step SRM=20sec
# (30*60sec)/20sec = 90 
# Or: 120 as Nando did
i_time = "range"
n_time = 1440
lats=[-90, 90]
lons=[0., 359.]
# stats = "mse" --> does not work with stats turned on
# Switch statistics off explicitly. Without this assignment the cell reuses
# whatever `stats` an earlier cell left behind (e.g. "r2"), so the run would
# not match the note above and would depend on cell execution order.
# NOTE(review): assumes plot_double_profile treats stats=False as "no
# statistics" (get_save_str omits the stats suffix for falsy values) - confirm.
stats = False

run_plot_profiles()

#### Time range, 1440 steps, r2

In [None]:
# Time step CAM=30min
# Time step SRM=20sec
# (30*60sec)/20sec = 90 
# Or: 120 as Nando did
# Settings read as notebook globals by run_plot_profiles():
# time mode "range" with 1440 steps, lats -90..90, lons 0..359, stats "r2".
i_time = "range"
n_time = 1440
lats=[-90, 90]
lons=[0., 359.]
stats = "r2"

run_plot_profiles()