In [None]:
import utils.utils as utils
# Path of the aggregated results file to load. The second assignment
# overrides the first one, so only "cfg_testing.obj" is actually loaded;
# swap the two lines to switch files.
example_aggregated = "./aggregated_results/cfg_single_lon120.obj"
example_aggregated = "./aggregated_results/cfg_testing.obj"
# Later iterated as {variable: {pc_alpha: results}}.
aggregated_results = utils.load_results(example_aggregated)


# Create Neural Networks

In [None]:
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense


def dense_nn(input_shape, output_shape, hidden_layers, activation):
    """
    Build an (uncompiled) sequential, fully connected network.

    Parameters
    ----------
    input_shape
        Shape handed to the `Input` layer.
    output_shape
        Number of units of the final `Dense` layer, which is created
        without an activation argument.
    hidden_layers
        Iterable with the number of units of each hidden `Dense` layer,
        in order. May be empty.
    activation
        Activation passed to every hidden `Dense` layer.

    Returns
    -------
    The `Sequential` model with all layers added.
    """
    # Create the layers first (input, hidden stack, output) ...
    layers = [Input(shape=input_shape)]
    layers.extend(
        Dense(n_units, activation=activation) for n_units in hidden_layers
    )
    layers.append(Dense(output_shape))

    # ... then stack them in that same order.
    network = Sequential()
    for layer in layers:
        network.add(layer)
    return network


def fc_model(
    input_shape,
    output_shape,
    hidden_layers,
    activation,
    conservation_layer=False,
    inp_sub=None,
    inp_div=None,
    norm_q=None,
):
    """
    Build a functional fully connected model, optionally ending in the
    conservation layers instead of a plain `Dense` output.

    Parameters
    ----------
    input_shape
        Size of the flat input; the `Input` layer gets `(input_shape,)`.
    output_shape
        Units of the final `Dense` layer. Only used when
        `conservation_layer` is false.
    hidden_layers
        Sequence with the units of each hidden layer. Must contain at
        least one entry (the first one is indexed directly).
    activation
        Value forwarded to `act_layer` after every hidden `Dense`.
    conservation_layer
        When true, the hidden output goes through `SurRadLayer`,
        `MassConsLayer` and `EntConsLayer` (in that order), each one
        receiving `[input, previous_output]`.
    inp_sub, inp_div, norm_q
        Forwarded unchanged to the three conservation layers.

    NOTE(review): `act_layer`, the three conservation layers and `tf`
    are not defined in the cells before this one; they must exist in the
    notebook namespace when this function is called.
    """
    net_input = Input(shape=(input_shape,))

    # First hidden layer, connected to the input
    hidden = Dense(hidden_layers[0])(net_input)
    hidden = act_layer(activation)(hidden)

    # Remaining hidden layers, chained one after the other
    for n_units in hidden_layers[1:]:
        hidden = Dense(n_units)(hidden)
        hidden = act_layer(activation)(hidden)

    if not conservation_layer:
        plain_output = Dense(output_shape)(hidden)
        return tf.keras.models.Model(net_input, plain_output)

    # Each conservation layer sees the raw input plus the previous output.
    constrained = hidden
    for layer_cls in (SurRadLayer, MassConsLayer, EntConsLayer):
        constrained = layer_cls(inp_sub, inp_div, norm_q)([net_input, constrained])
    return tf.keras.models.Model(net_input, constrained)


In [None]:
from pathlib import Path
from utils.constants import SPCAM_Vars, DATA_FOLDER, ANCIL_FILE
import utils.utils as utils


class Variable:

    """
    A combination of a SPCAM variable and one specific level, so the
    information can be easily referred.
    The variable is expressed as the corresponding SPCAM_Vars object.
    The level is expressed as a pair: [altitude, index]

    Attributes (set in __init__):
      var        the SPCAM_Vars member
      level      the level altitude (None for 2-dimensional variables)
      level_idx  the level index (None for 2-dimensional variables)
    """

    def __init__(self, spcam_var, level_altitude, level_idx):
        self.var = spcam_var
        self.level = level_altitude
        self.level_idx = level_idx

    # Declared static: the function takes no `self`, so without the
    # decorator it could only be called through the class, never through
    # an instance.
    @staticmethod
    def parse_var_name(var_name):
        """
        Parses a var_name string with format "{name}-{altitude}" to a
        Variable object.

        For 2-dimensional variables only "{name}" is used and both level
        fields are None. For 3-dimensional variables the levels are read
        from the ancillaries file and the index returned by
        `utils.find_closest_value` for the parsed altitude is stored.

        Raises:
            KeyError: if "{name}" is not the name of any SPCAM_Vars member.
            ValueError: if the variable has a number of dimensions other
                than 2 or 3 (or the altitude is not a valid float).
        """
        values = var_name.split("-")
        spcam_name = values[0]
        dict_spcam_vars = {v.name: v for v in SPCAM_Vars}
        spcam_var = dict_spcam_vars[spcam_name]

        if spcam_var.dimensions == 2:
            level_altitude = level_idx = None
        elif spcam_var.dimensions == 3:
            levels, _, _ = utils.read_ancilaries(Path(DATA_FOLDER, ANCIL_FILE))
            level_altitude = float(values[1])
            level_idx = utils.find_closest_value(levels, level_altitude)
        else:
            # Previously fell through to an UnboundLocalError below.
            raise ValueError(
                f"Unsupported number of dimensions for {spcam_name}: "
                f"{spcam_var.dimensions}"
            )

        return Variable(spcam_var, level_altitude, level_idx)

    def __str__(self):
        """Inverse of parse_var_name: "{name}" or "{name}-{altitude}"."""
        if self.var.dimensions == 2:
            return f"{self.var.name}"
        elif self.var.dimensions == 3:
            return f"{self.var.name}-{self.level}"
        # Previously returned None, which made str() fail with an
        # unrelated-looking TypeError.
        raise ValueError(
            f"Unsupported number of dimensions for {self.var.name}: "
            f"{self.var.dimensions}"
        )

    def __repr__(self):
        return repr(str(self))


In [None]:
import numpy as np
import tensorflow as tf

# NOTE: This will generate a lot of models, it may be better to put
# them on different folders
# Two alternative naming schemes: the second assignment overrides the
# first, so the "models/a{pc_alpha}-t{threshold}/" folder layout is the
# one in effect. The placeholders are filled with str.format().
MODEL_FILENAME_PATTERN = "model-{variable}-a{pc_alpha}-t{threshold}.h5"
MODEL_FILENAME_PATTERN = "models/a{pc_alpha}-t{threshold}/{variable}.h5" # Folders

class ModelDescription:
    """
    Describes, builds and stores one single-output dense network that
    predicts `variable` from its `parents`.

    Parameters
    ----------
    variable
        Name of the output variable, parsed with Variable.parse_var_name.
    pc_alpha, threshold
        Values identifying the result this model comes from; they are
        only used for the string form and the file name.
    parents
        Iterable of parent variable names (each parsed with
        Variable.parse_var_name); the network has one input per parent.
    hidden_layers
        Units of each hidden layer. Copied into a new list per instance.
    activation
        Activation of the hidden layers.

    Two descriptions compare (and hash) equal when their string forms
    match, i.e. same variable, pc_alpha and threshold.
    """

    def __init__(
        self,
        variable,
        pc_alpha,
        threshold,
        parents,
        hidden_layers = (32, 32, 32),
        activation = "relu"
    ):
        self.variable = Variable.parse_var_name(variable)
        self.pc_alpha = pc_alpha
        self.threshold = threshold
        self.parents = [Variable.parse_var_name(p) for p in parents]
        # Copy, so instances never share one (mutable default) list.
        self.hidden_layers = list(hidden_layers)
        self.activation = activation
        self.model = self._build_model()

    def _build_model(self):
        """Create and compile the dense network for this description."""
        # One input per parent. Use the instance attribute: a bare
        # `parents` would silently pick up whatever notebook global has
        # that name (or fail with NameError on a fresh kernel).
        input_shape = (len(self.parents),)
        model = dense_nn(
            input_shape=input_shape,
            output_shape=1,  # Only one variable
            hidden_layers=self.hidden_layers,
            activation=self.activation,
        )
        model.compile(
            optimizer = "adam", # From train.py (default)
            loss = "mse", # From 006_8col_pnas_exact.yml
            metrics = [tf.keras.losses.mse], # From train.py (default)
        )
        return model

    def generate_filename(self):
        """Fill MODEL_FILENAME_PATTERN with this description's values."""
        return MODEL_FILENAME_PATTERN.format(
            variable = str(self.variable),
            pc_alpha = self.pc_alpha,
            threshold = self.threshold
        )

    def save_model(self):
        """Save the model under the name given by generate_filename()."""
        # Must go through `self`: there is no module-level generate_filename.
        self.model.save(self.generate_filename())

    def __str__(self):
        return f"{self.variable}: a{self.pc_alpha}-t{self.threshold}"

    def __repr__(self):
        return repr(str(self))

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # Compares string forms only (parents and layers are ignored).
        return str(self) == str(other)


In [None]:
# Build one ModelDescription per (variable, pc_alpha, threshold)
# combination found in the aggregated results.
# NOTE: the loop variables (`variable`, `pc_alpha`, `threshold`,
# `parents`, ...) stay bound as notebook-level names after this cell.
model_descriptions = list()
for variable, pc_alpha_dict in aggregated_results.items():
    print(variable)
    if len(pc_alpha_dict) == 0:  # May be empty
        # TODO How to approach this?
        # Nothing is skipped explicitly here: with an empty dict the
        # inner loop below simply does not iterate.
        print("Empty results")
        pass
    for pc_alpha, pc_alpha_results in pc_alpha_dict.items():
        # As an array, so it can be indexed with a list of indices
        var_names = np.array(pc_alpha_results["var_names"])
        for threshold, parent_idxs in pc_alpha_results["parents"].items():
            # Names of the parents selected for this threshold
            parents = var_names[parent_idxs]
            model_description = ModelDescription(variable, pc_alpha, threshold, parents)
            model_descriptions.append(model_description)


# Training

## Data generator

In [None]:
# Which dataset size to use. The second assignment overrides the first,
# so "month" is the value in effect.
DATA_TYPE = "year"
DATA_TYPE = "month" # For testing

# NOTE(review): this rebinds the notebook-level name `DATA_FOLDER` that an
# earlier cell imports from utils.constants; code that reads that global
# after this cell has run (e.g. Variable.parse_var_name) sees this value.
DATA_FOLDER = "/work/bd1083/b309162/preprocessed_data"
# NOTE: for any other DATA_TYPE value TRAIN_FN / VALID_FN are left unset.
if DATA_TYPE == "year":
    TRAIN_FN    = "002_train_1_year.nc"
    VALID_FN    = "005_valid_1_year.nc"
elif DATA_TYPE == "month":
    TRAIN_FN    = "002_train_1_month.nc"
    VALID_FN    = "000_valid_1_month.nc"
# NORM_FN     = "000_norm.nc"
# Normalization file handed to the data generators as `norm_fn`
NORM_FN_PNAS = "/pf/b/b309198/projects/causal_discovery/rasp-et-al/data/001_norm.nc"

# TODO? Improve this, perhaps using my enum
INPUT_VARS  = "QBP TBP VBP PS SOLIN SHFLX LHFLX".split(" ") 
OUTPUT_VARS = "PHQ TPHYSTND FSNT FSNS FLNT FLNS PRECT".split(" ")

# Passed together to the data generators as `input_transform`
INPUT_SUB = "mean"
INPUT_DIV = "maxrs"
# Pickle file from which the output scaling dictionary is loaded
OUT_SCALE_DICT_FN = (
    "/work/bd1179/b309198/causal_discovery/rasp-et-al/CBRAIN-CAM/"
    "nn_config/scale_dicts/002_pnas_scaling.pkl"
)
# Batch size of the training generator
BATCH_SIZE = 1024

# TODO Confirm
# VAR_CUT_OFF = {"QBP": 14, "TBP": 14}
VAR_CUT_OFF = None


In [None]:
from neural_networks.cbrain.utils import load_pickle
# Output scaling, later passed to the data generators as `output_transform`.
# NOTE(review): presumably unpickles the file; only load pickles from a
# trusted source, since unpickling can execute arbitrary code.
out_scale_dict = load_pickle(OUT_SCALE_DICT_FN)


In [None]:
from neural_networks.cbrain.data_generator import DataGenerator
from pathlib import Path

def build_train_generator(input_vars_dict, output_vars_dict):
    """
    Create the training `DataGenerator` for the given input/output
    variable dictionaries.

    It reads Path(DATA_FOLDER, TRAIN_FN), normalizes with NORM_FN_PNAS,
    applies the (INPUT_SUB, INPUT_DIV) input transform and the
    `out_scale_dict` output transform, and uses BATCH_SIZE with
    shuffling requested.
    """
    generator_kwargs = dict(
        data_fn=Path(DATA_FOLDER, TRAIN_FN),
        input_vars_dict=input_vars_dict,
        output_vars_dict=output_vars_dict,
        # Alternative normalization file: Path(DATA_FOLDER, NORM_FN)
        norm_fn=NORM_FN_PNAS,
        input_transform=(INPUT_SUB, INPUT_DIV),
        output_transform=out_scale_dict,
        batch_size=BATCH_SIZE,
        shuffle=True,  # This feature doesn't seem to work
    )
    return DataGenerator(**generator_kwargs)


# Grid dimensions (presumably number of latitudes / longitudes - TODO
# confirm). Their product is used as the validation batch size.
nlat=64
nlon=128
ngeo = nlat * nlon

def build_valid_generator(input_vars_dict, output_vars_dict):
    """
    Create the validation `DataGenerator` for the given input/output
    variable dictionaries.

    Same normalization and transforms as the training generator, but it
    reads Path(DATA_FOLDER, VALID_FN), uses `ngeo` as batch size and
    does not shuffle.
    """
    generator_kwargs = dict(
        data_fn=Path(DATA_FOLDER, VALID_FN),
        input_vars_dict=input_vars_dict,
        output_vars_dict=output_vars_dict,
        norm_fn=NORM_FN_PNAS,
        input_transform=(INPUT_SUB, INPUT_DIV),
        output_transform=out_scale_dict,
        batch_size=ngeo,
        shuffle=False,
        # xarray=True,
    )
    return DataGenerator(**generator_kwargs)


## Fit models

In [None]:
from importlib import reload
import neural_networks.cbrain.data_generator
import neural_networks.cbrain.normalization
import neural_networks.cbrain.utils
import neural_networks.cbrain.learning_rate_schedule
# Re-execute the local cbrain modules so that edits made on disk are
# picked up without restarting the kernel.
# NOTE(review): names bound earlier with `from ... import ...` (e.g.
# DataGenerator, load_pickle) keep pointing at the pre-reload objects
# until their import cells are run again. If data_generator imports from
# utils/normalization, those should probably be reloaded first - confirm.
reload(neural_networks.cbrain.data_generator)
reload(neural_networks.cbrain.utils)
reload(neural_networks.cbrain.normalization)
reload(neural_networks.cbrain.learning_rate_schedule)

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
from neural_networks.cbrain.learning_rate_schedule import LRUpdate

# Train and save every described model (currently stops after the first
# one, see the `break` at the end of the loop body).
for model_description in model_descriptions:
    print(model_description)

    # Inputs: {dataset name: None} for 2D parents,
    #         {dataset name: [level indices]} for 3D parents.
    input_vars_dict = dict()
    for parent in model_description.parents:
        ds_name = parent.var.ds_name
        if parent.var.dimensions == 2:
            input_vars_dict[ds_name] = None
        elif parent.var.dimensions == 3:
            levels = input_vars_dict.setdefault(ds_name, list())
            levels.append(parent.level_idx)

    # Output: the single target variable, with its level index if it is 3D.
    target = model_description.variable
    if target.var.dimensions == 2:
        level = None
    elif target.var.dimensions == 3:
        level = [target.level_idx]
    output_vars_dict = {target.var.ds_name: level}

    # TODO Test that the data is being taken correctly
    with build_train_generator(
        input_vars_dict, output_vars_dict
    ) as train_gen, build_valid_generator(
        input_vars_dict, output_vars_dict
    ) as valid_gen:
        # TODO fit model
        lr_update = LRUpdate(
            init_lr=0.001,  # From train.py (default)
            step=1,  # From 006_8col_pnas_exact.yml
            divide=5,  # From train.py (default)
        )
        lrs = LearningRateScheduler(lr_update)
        # TODO: Decide number of epochs (e.g. 18), maybe use early stopper.
        # `verbose` is left at its default (0 = silent, 2 = summary on epoch).
        model_description.model.fit(
            x=train_gen,
            validation_data=valid_gen,
            epochs=4,  # From 006_8col_pnas_exact.yml
            callbacks=[lrs],
        )
        model_description.save_model()
        break # For testing
