### Read results

In [17]:
import numpy as np
import json
import os

In [21]:
# Folder that contains the SFS result files (*.json) of the individual regimes.
# The original location is kept. If it does not exist on this machine, fall back to the
# current working directory (with a trailing separator) so that the notebook can also be
# run from a different checkout of the repository.
path = '/home/b/b309170/my_work/published_code/grundner23james_EquationDiscovery_CloudCover_addressing_reviews/sec3_data-driven_modeling/sec322_neural_networks/sfs_nn_feat_ranking/split_by_regime/'
if not os.path.isdir(path):
    path = os.path.join(os.getcwd(), '')

### Averaged SFS order of variables over 10 seeds

In [26]:
def return_averaged_ranking(folder_abs_path, model_type, reg, max_no_features=10, no_seeds=10):
    '''
        Average the position at which every variable was added by the sequential feature
        selection (SFS) over all result files of one regime.

        folder_abs_path: Folder that contains the relevant files (absolute path)
        model_type: 'nn', 'polynomial_degree_1', 'polynomial_degree_2', 'polynomial_degree_3'
        reg: Regime number. Only files whose name contains 'regime_<reg>' are read
        max_no_features: Number of SFS steps that are read from every file
        no_seeds: Number of runs the average is taken over. A variable that was recorded in fewer
                  than no_seeds files is assigned the position max_no_features + 1 for the missing ones

        Returns a dictionary {variable: average position}, sorted by increasing average position.
        Raises a ValueError if model_type is unknown or if an SFS step does not add exactly one feature.
    '''
    if model_type == 'nn':
        # According to the original notebook, the files of the seeded runs contain '0.json'
        substring = '0.json'
        key_name = 'features_'
    elif model_type.startswith('polynomial'):
        substring = model_type.split('_', 1)[1]
        key_name = 'Number of variables '
    else:
        raise ValueError("Unknown model_type %r: expected 'nn' or 'polynomial_degree_<n>'" % (model_type,))

    def selected_features(d, k):
        # Features that were selected after k SFS steps
        entry = d['%s%d'%(key_name, k)]
        if model_type == 'nn':
            return list(entry)
        # Polynomial files store a dictionary per step. Its last five keys are dropped
        # (presumably they hold fit statistics rather than features -- TODO confirm)
        return list(entry.keys())[:-5]

    # Sorting the file names makes the order of tied variables reproducible (os.listdir has no fixed order)
    reg_files = sorted(file for file in os.listdir(folder_abs_path)
                       if '.json' in file and 'regime_%d'%reg in file and substring in file)

    var_position = {}
    for file_name in reg_files:
        # os.path.join works with and without a trailing separator in folder_abs_path
        with open(os.path.join(folder_abs_path, file_name), 'r') as file:
            d = json.load(file)

        # The first selected feature is at position 1
        first_features = selected_features(d, 1)
        var_position.setdefault(first_features[0], []).append(1)

        features_old = set(first_features)
        for k in range(1, max_no_features):
            features_new = set(selected_features(d, k+1))
            # The feature that was added in step k+1
            added_feature_set = features_new - features_old
            if len(added_feature_set) != 1:
                raise ValueError('%s: expected exactly one new feature in step %d, found %s'
                                 % (file_name, k+1, sorted(added_feature_set)))
            var_position.setdefault(added_feature_set.pop(), []).append(k+1)
            features_old = features_new

    # Set position to max_no_features + 1 if they were not in the top max_no_features for all seeds.
    for positions in var_position.values():
        # A non-positive count leaves the list unchanged
        positions.extend([max_no_features + 1] * (no_seeds - len(positions)))

    # Compute the average positions
    dict_keys = list(var_position.keys())
    dict_values = [np.mean(var_position[key]) for key in dict_keys]

    # Sort the variables according to their average position. A stable sort keeps tied variables
    # in the order in which they were first encountered.
    sorted_inds = np.argsort(dict_values, kind='stable')
    sorted_dict = {dict_keys[ind]: dict_values[ind] for ind in sorted_inds}

    return sorted_dict

In [28]:
# Averaged SFS position of the NN input variables in regime 1
return_averaged_ranking(folder_abs_path=path, model_type='nn', reg=1)

{'cli': 1.0,
 'rh': 2.0,
 'ta': 3.4,
 'rh_z': 4.0,
 'rh_zz': 6.4,
 'ta_z': 7.2,
 'pa_z': 8.9,
 'zg': 9.0,
 'ps': 9.3,
 'clw': 9.5,
 'cli_zz': 9.6,
 'cli_z': 9.7,
 'pa': 9.9,
 'hus': 10.2,
 'fr_land': 10.4,
 'pa_zz': 10.8,
 'U': 10.8,
 'hus_zz': 10.9}

In [29]:
# Averaged SFS position of the NN input variables in regime 2
return_averaged_ranking(folder_abs_path=path, model_type='nn', reg=2)

{'cli': 1.0,
 'clw': 2.0,
 'rh': 3.0,
 'rh_z': 4.5,
 'pa_zz': 5.1,
 'ta': 7.7,
 'pa_z': 8.7,
 'rh_zz': 9.1,
 'hus': 9.1,
 'fr_land': 9.2,
 'zg': 9.6,
 'cli_z': 9.9,
 'ps': 10.3,
 'pa': 10.6,
 'hus_zz': 10.7,
 'ta_zz': 10.8,
 'ta_z': 10.9,
 'clw_z': 10.9,
 'U_zz': 10.9}

In [30]:
# Averaged SFS position of the NN input variables in regime 3
return_averaged_ranking(folder_abs_path=path, model_type='nn', reg=3)

{'rh': 1.0,
 'ta': 2.0,
 'rh_z': 3.0,
 'ps': 5.5,
 'rh_zz': 5.6,
 'pa_z': 6.2,
 'clw': 7.5,
 'cli': 8.0,
 'pa': 9.6,
 'ta_z': 9.7,
 'hus_zz': 10.3,
 'zg': 10.3,
 'cli_z': 10.6,
 'U': 10.6,
 'cli_zz': 10.7,
 'hus': 10.8,
 'pa_zz': 10.9,
 'hus_z': 10.9,
 'U_z': 10.9,
 'U_zz': 10.9}

In [31]:
# Averaged SFS position of the NN input variables in regime 4
return_averaged_ranking(folder_abs_path=path, model_type='nn', reg=4)

{'rh': 1.0,
 'rh_z': 2.0,
 'pa_zz': 3.0,
 'rh_zz': 5.9,
 'clw': 6.3,
 'ta': 7.5,
 'hus': 8.1,
 'pa': 8.3,
 'ps': 8.8,
 'clw_z': 9.4,
 'ta_z': 9.7,
 'hus_z': 10.1,
 'hus_zz': 10.3,
 'zg': 10.5,
 'U': 10.5,
 'ta_zz': 10.5,
 'clw_zz': 10.6,
 'cli': 10.8,
 'pa_z': 10.9,
 'U_z': 10.9,
 'cli_z': 10.9}