We use mlxtend here because it does not need a scikit-learn estimator as its argument.

Source: https://github.com/rasbt/mlxtend/discussions/777 <br>
See also: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/

Note: no early stopping is implemented.

In [1]:
# 150GB could be enough
# Won't run on a Levante GPU node

# Executed via /home/b/b309170/scripts/run_sfs_nns.sh

In [4]:
import gc
import os
import sys
import json
import time
import datetime
import numpy as np
import mlxtend

from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2
from tensorflow import nn 

# Add path with my_classes to sys.path
sys.path.insert(0, '/home/b/b309170' + '/workspace_icon-ml/cloud_cover_parameterization/')

# Reloading custom file to incorporate changes dynamically
import my_classes
from my_classes import read_mean_and_std

# ---- Settings taken from the command line ----
# argv[1]: random seed, e.g. 10, 20, ...
seed = int(sys.argv[1])

# Seed TensorFlow and NumPy with the same value
tf.random.set_seed(seed)
np.random.seed(seed)

# Select the PDF backend for matplotlib
matplotlib.use('PDF')

# Suffix that distinguishes the output files of different runs.
# An earlier version used the wall-clock time instead:
#   hour_min = '%d_%d'%(datetime.datetime.now().hour, datetime.datetime.now().minute)
hour_min = str(seed)

# ---- Fixed settings ----
# These were read from the command line in an earlier version:
#   output_var = sys.argv[1]
#   subset_size = int(sys.argv[2])
#   floating_bool = bool(int(sys.argv[3]))
#   split_by_cloud_regime = bool(int(sys.argv[4]))
#   regime = int(sys.argv[5])
output_var = 'cl_area'  # 'cl_volume' or 'cl_area'
subset_size = 150000  # Tried 100000 and 150000
floating_bool = False
split_by_cloud_regime = False
regime = 1  # Is only relevant if split_by_cloud_regime

# argv[2]: integer flag (0/1); if set, only condensate-free cells are removed
no_condensate_free_cells = bool(int(sys.argv[2]))

KeyboardInterrupt: 

In [2]:
# Number of training epochs and mini-batch size used when fitting a network
EPOCHS, BATCH_SIZE = 25, 32

In [3]:
# Read data
output_path = '/home/b/b309170/workspace_icon-ml/symbolic_regression/finding_symmetries/seq_feature_selector_dyamond_nns'
folder_data = '/home/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/'

# File that holds the target for each supported output variable.
# Fail early on an unsupported name instead of running into a NameError
# on output_data further down.
output_files = {'cl_volume': 'cloud_cover_output_dyamond.npy',
                'cl_area': 'cloud_area_output_dyamond.npy'}
if output_var not in output_files:
    raise ValueError("output_var must be 'cl_volume' or 'cl_area', got %r" % (output_var,))

input_data = np.load(os.path.join(folder_data, 'cloud_cover_input_dyamond.npy'))
output_data = np.load(os.path.join(folder_data, output_files[output_var]))

features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',
            'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']
no_features = len(features)

samples_total, no_of_features = input_data.shape

# Split into train/valid
# The samples are cut into six consecutive increments. Fold i validates on
# increments i and i+3 and trains on everything else.
# Note that this is a temporal split since time was the first dimension in the original tensor.
# Only the fold with index 1 is used, so only that fold is constructed.
fold = 1
two_week_incr = samples_total//6

# Boolean mask over the samples: True for the validation rows of the fold.
# Boolean indexing keeps the rows in their original order.
is_valid = np.zeros(samples_total, dtype=bool)
is_valid[two_week_incr*fold:two_week_incr*(fold+1)] = True
is_valid[two_week_incr*(fold+3):two_week_incr*(fold+4)] = True
is_train = ~is_valid

input_train = input_data[is_train]
input_valid = input_data[is_valid]
output_train = output_data[is_train]
output_valid = output_data[is_valid]

# Remove the full arrays and the masks to free memory
del input_data, output_data, is_train, is_valid
gc.collect()

20

In [4]:
# To locate variables: maps each feature name to its position in the features list
loc = {name: idx for idx, name in enumerate(features)}

In [5]:
input_train.shape

(190119664, 24)

**Split into cloud regimes**

According to both:
- a*q_i + q_c
- air pressure

--> There is no easy way to specify a, so I choose it to be equal to 1 (alternatively one could think about mean(a qi) = mean(qc)). Then I can interpret qi+qc as the condensate mixing ratio.

So I have four regimes in total: <br>
1) 0 < qi+qc < 1.6e-5 and p < 7.9e4 [High altitude, little condensate]
2) 0 < qi+qc < 1.6e-5 and p > 7.9e4 [Low altitude, little condensate]
3) qi+qc > 1.6e-5 and p < 7.9e4 [High altitude, high condensate]
4) qi+qc > 1.6e-5 and p > 7.9e4 [Low altitude, high condensate]

For $qi + qc = 0$ we simply set $C = 0$.

In every regime there are more than 2.3e6 samples.

In [6]:
# We train SFS NNs per cloud regime if split_by_cloud_regime is True
if split_by_cloud_regime:
    a = 1

    # Condensate measure a*cli + clw; the medians are taken over cells where it is non-zero
    cod_subs = a*input_train[:, loc['cli']] + input_train[:, loc['clw']]
    nonzero_cod = cod_subs != 0
    cod_subs_med = np.median(cod_subs[nonzero_cod])
    pa_med = np.median(input_train[nonzero_cod, loc['pa']])

    # Building blocks of the four regime masks (each one is evaluated only once)
    pa_train = input_train[:, loc['pa']]
    little_cod = (0 < cod_subs) & (cod_subs < cod_subs_med)
    much_cod = cod_subs > cod_subs_med
    below_pa_med = pa_train < pa_med
    above_pa_med = pa_train > pa_med

    mask_reg_1 = little_cod & below_pa_med
    mask_reg_2 = little_cod & above_pa_med
    mask_reg_3 = much_cod & below_pa_med
    mask_reg_4 = much_cod & above_pa_med

    # For the training data
    input_train_reg_1 = input_train[mask_reg_1]
    input_train_reg_2 = input_train[mask_reg_2]
    input_train_reg_3 = input_train[mask_reg_3]
    input_train_reg_4 = input_train[mask_reg_4]

    output_train_reg_1 = output_train[mask_reg_1]
    output_train_reg_2 = output_train[mask_reg_2]
    output_train_reg_3 = output_train[mask_reg_3]
    output_train_reg_4 = output_train[mask_reg_4]

    # Do the regimes have a similar size?
    regime_pairs = ((input_train_reg_1, output_train_reg_1),
                    (input_train_reg_2, output_train_reg_2),
                    (input_train_reg_3, output_train_reg_3),
                    (input_train_reg_4, output_train_reg_4))
    for reg_input, reg_output in regime_pairs:
        print(reg_input.shape)
        print(reg_output.shape)

    # The helper arrays are not needed any longer
    del nonzero_cod, pa_train, little_cod, much_cod, below_pa_med, above_pa_med
    del mask_reg_1, mask_reg_2, mask_reg_3, mask_reg_4
    del regime_pairs, reg_input, reg_output

if no_condensate_free_cells:
    # Keep only the cells whose cli + clw exceeds 1e-20
    cod_subs = input_train[:, loc['cli']] + input_train[:, loc['clw']]
    with_condensate = cod_subs > 1e-20
    input_train = input_train[with_condensate]
    output_train = output_train[with_condensate]
    del with_condensate

(38864092, 24)
(38864092,)
(23611276, 24)
(23611276,)
(23611277, 24)
(23611277,)
(38864096, 24)
(38864096,)


In [7]:
# We train SFS NNs per cloud regime if split_by_cloud_regime is True
if split_by_cloud_regime:
    # Same for the validation data.
    # The thresholds cod_subs_med and pa_med are the ones computed on the training data.
    cod_subs = a*input_valid[:, loc['cli']] + input_valid[:, loc['clw']]

    # Building blocks of the four regime masks (each one is evaluated only once)
    pa_valid = input_valid[:, loc['pa']]
    little_cod = (0 < cod_subs) & (cod_subs < cod_subs_med)
    much_cod = cod_subs > cod_subs_med
    below_pa_med = pa_valid < pa_med
    above_pa_med = pa_valid > pa_med

    mask_reg_1 = little_cod & below_pa_med
    mask_reg_2 = little_cod & above_pa_med
    mask_reg_3 = much_cod & below_pa_med
    mask_reg_4 = much_cod & above_pa_med

    input_valid_reg_1 = input_valid[mask_reg_1]
    input_valid_reg_2 = input_valid[mask_reg_2]
    input_valid_reg_3 = input_valid[mask_reg_3]
    input_valid_reg_4 = input_valid[mask_reg_4]

    output_valid_reg_1 = output_valid[mask_reg_1]
    output_valid_reg_2 = output_valid[mask_reg_2]
    output_valid_reg_3 = output_valid[mask_reg_3]
    output_valid_reg_4 = output_valid[mask_reg_4]

    # Do the regimes have a similar size?
    regime_pairs = ((input_valid_reg_1, output_valid_reg_1),
                    (input_valid_reg_2, output_valid_reg_2),
                    (input_valid_reg_3, output_valid_reg_3),
                    (input_valid_reg_4, output_valid_reg_4))
    for reg_input, reg_output in regime_pairs:
        print(reg_input.shape)
        print(reg_output.shape)

    # The helper arrays are not needed any longer
    del pa_valid, little_cod, much_cod, below_pa_med, above_pa_med
    del mask_reg_1, mask_reg_2, mask_reg_3, mask_reg_4
    del regime_pairs, reg_input, reg_output

(19337328, 24)
(19337328,)
(11581298, 24)
(11581298,)
(11931635, 24)
(11931635,)
(19790540, 24)
(19790540,)


**Choose the appropriate regime**

In [8]:
# Look up the arrays of the chosen regime by their variable name and continue
# with copies of them as the training/validation data
if split_by_cloud_regime:
    regime_suffix = '_reg_%d'%regime

    input_train = locals()['input_train' + regime_suffix].copy()
    output_train = locals()['output_train' + regime_suffix].copy()

    input_valid = locals()['input_valid' + regime_suffix].copy()
    output_valid = locals()['output_valid' + regime_suffix].copy()

**Normalize the data**

In [9]:
# The second fold yields the best model
# Normalize the data acc. to the mean and std associated with the training data
saved_models_dir = '/home/b/b309170/workspace_icon-ml/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND/saved_models'
stats_file = 'cross_validation_neighborhood_based_sr_%s_fold_2.txt'%output_var
mean, std = read_mean_and_std(os.path.join(saved_models_dir, stats_file))

# Standardize one array after the other so that only one extra copy exists at a time
input_train = (input_train - mean)/std
input_valid = (input_valid - mean)/std

# From here on these describe the (possibly filtered) training data
samples_total, no_of_features = input_train.shape

In [10]:
# Work with a random subset of the training data: draw subset_size row indices.
# np.random.randint draws them independently, so a row can be picked more than once.
n_train_rows = input_train.shape[0]
subset = np.random.randint(0, n_train_rows, subset_size)

input_train, output_train = input_train[subset], output_train[subset]

In [11]:
def create_model(input_train):
    """Build and compile the feed-forward network used as regressor.

    The network has three hidden Dense layers with 64 units each (tanh,
    leaky ReLU followed by batch normalization, tanh) and one linear output
    unit. Every Dense kernel carries the same L1/L2 penalty.

    Parameters
    ----------
    input_train : array with a ``shape`` attribute
        Only ``input_train.shape[1]`` is read; it sets the input dimension.

    Returns
    -------
    The Sequential model, compiled with Adadelta and a mean squared error loss.
    """
    def kernel_penalty():
        # A fresh regularizer object per layer, all with the same coefficients
        return l1_l2(l1=0.004749, l2=0.008732)

    n_inputs = input_train.shape[1]

    layers = [
        # First hidden layer
        Dense(units=64, activation='tanh', input_dim=n_inputs,
              kernel_regularizer=kernel_penalty()),
        # Second hidden layer
        Dense(units=64, activation=nn.leaky_relu, kernel_regularizer=kernel_penalty()),
        BatchNormalization(),
        # Third hidden layer
        Dense(units=64, activation='tanh', kernel_regularizer=kernel_penalty()),
        # Output layer
        Dense(1, activation='linear', kernel_regularizer=kernel_penalty()),
    ]
    model = Sequential(layers)

    # Compile the model
    optimizer = tf.keras.optimizers.Adadelta(learning_rate=0.000433, epsilon=0.1)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.MeanSquaredError())

    return model

In [12]:
# Wrap Keras nn and turn it into a scikit-learn estimator
class MakeModel:
    """Minimal estimator with ``fit`` and ``predict`` around the Keras network.

    Every call to ``fit`` wraps ``create_model`` in a new ``KerasRegressor``,
    so the network is sized to the number of columns of the ``X`` it receives.
    """

    def __init__(self, X=None, y=None):
        """Accept ``X`` and ``y`` but ignore them; nothing is set up until ``fit``."""

    def predict(self, X):
        """Return the predictions of the most recently fitted model for ``X``.

        ``fit`` has to be called first, since it is what creates ``self.model``.
        """
        return self.model.predict(X)

    def fit(self, X, y):
        """Create a new ``KerasRegressor``, train it on ``(X, y)`` and return it.

        ``X`` is also handed to ``create_model`` as ``input_train``. Training
        uses the module-level ``EPOCHS`` and ``BATCH_SIZE`` and is silent.
        """
        self.model = KerasRegressor(
            build_fn=create_model,
            input_train=X,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            verbose=0,
        )
        self.model.fit(X, y)
        return self.model

In [None]:
# Upper bound on the size of the feature subsets that are searched
max_features = 10

# Could set floating to either True or False
sfs_settings = dict(
    k_features=(1, max_features),
    floating=floating_bool,  # Adds a check whether it is better to remove a feature from a given subset
    clone_estimator=False,  # Set to False if the estimator doesn't implement scikit-learn's set_params and get_params methods
    cv=0,  # Required if clone_estimator=False
    n_jobs=1,  # Required if clone_estimator=False
    verbose=2,
    scoring='r2',
)

t0 = time.time()
sffs = SFS(MakeModel(), **sfs_settings)

# Apply SFS to identify best feature subset
sffs = sffs.fit(input_train, output_train, custom_feature_names=features)

# Wall-clock duration of the selection in seconds
required_time = time.time() - t0

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   57.9s remaining:    0.0s


**Results**

In [None]:
# Plot the score of the best subset for each subset size
fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_dev')

# plt.ylim([0.8, 1])
plt.title('Sequential Forward Selection')
plt.grid()
plt.ylabel('$R^2$ score on training data')

# Build the target file below output_path instead of repeating the absolute
# directory in every branch; the resulting paths are unchanged.
if split_by_cloud_regime:
    fig_path = os.path.join(output_path, 'split_by_regime',
                            'seq_feat_selector_training_data_regime_%d_%s_%s.pdf'%(regime, output_var, hour_min))
elif no_condensate_free_cells:
    fig_path = os.path.join(output_path, 'no_condensate_free_cells',
                            'seq_feat_selector_training_data_%s_%s.pdf'%(output_var, hour_min))
else:
    fig_path = os.path.join(output_path,
                            'seq_feat_selector_training_data_%s_%s.pdf'%(output_var, hour_min))

# Create the target folder if needed so that a missing directory cannot
# make savefig fail after the whole selection has already run
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path)
# plt.show()

In [None]:
# Collect the selected features and their score for every subset size
out_dict = {}
for k in range(1, max_features + 1):
    # Store plain Python types so that json.dump can serialize them
    out_dict['features_%d'%k] = list(sffs.subsets_[k]['feature_names'])
    out_dict['r2_score_%d'%k] = float(sffs.subsets_[k]['avg_score'])
out_dict['Required time in minutes'] = required_time/60
out_dict['Epochs'] = EPOCHS
out_dict['Subset size'] = subset_size
if floating_bool:
    out_dict['Floating'] = 'True'

# Build the target file below output_path instead of repeating the absolute
# directory in every branch; the resulting paths are unchanged.
if split_by_cloud_regime:
    out_json_path = os.path.join(output_path, 'split_by_regime',
                                 'seq_feat_selector_training_data_regime_%d_%s_%s.json'%(regime, output_var, hour_min))
elif no_condensate_free_cells:
    out_json_path = os.path.join(output_path, 'no_condensate_free_cells',
                                 'seq_feat_selector_training_data_%s_%s.json'%(output_var, hour_min))
else:
    out_json_path = os.path.join(output_path,
                                 'seq_feat_selector_training_data_%s_%s.json'%(output_var, hour_min))

# Create the target folder if needed so that the results of the run are not
# lost because of a missing directory
os.makedirs(os.path.dirname(out_json_path), exist_ok=True)
with open(out_json_path, 'w') as file:
    json.dump(out_dict, file)