### Testing SINDy-PI

**Conclusion:** SINDy-PI is only applicable to differential equations, not to symbolic regression.

In [1]:
import gc
import os
import sys
import json
import time
import sympy as sp
import numpy as np
import pysindy as ps

from sklearn import tree

# Reproducibility: one seed for numpy's global random number generator.
# (12/8/22: the hard-coded 10 was replaced by SEED and loss_exp was added.)
SEED = 10
np.random.seed(SEED)
loss_exp = 2  # Default is 2; other values to try: 3, 4, 5.

# Complexity of addition and multiplication (try 1, 2 or 3).
# The complexity of variables is always set to 1.
# Command-line variant: verylow_ops_complexity = int(sys.argv[1])
verylow_ops_complexity = 3

# Number of regimes (try 1, 2, 3 or 4).
# Command-line variant: no_of_regimes = int(sys.argv[2])
no_of_regimes = 2

# Regime to work with, picked from [1, ..., (no_of_regimes - 1)];
# set it to 1 if no_of_regimes = 1.
# Command-line variant: regime = int(sys.argv[3])
regime = 1

# Number of random training samples to draw (try 500, 1000, 5000 or 10000).
# Command-line variant: subset_size = int(sys.argv[4])
subset_size = 5000

# 221108: Number of input features; 5 by default, 8 is the alternative.
no_features = 5

# 221214: If True, the clw/cli columns are looked up via loc_all instead of loc
# when the condensate-free samples are filtered out.
loc_issue = False

**Load data**

In [2]:
# Names of all 24 candidate features. Their order is used as the column order
# when columns are selected from the stored input array.
all_possible_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps', 'hus_z', 'hus_zz', 'clw_z', 'clw_zz', 'cli_z',
            'cli_zz', 'ta_z', 'ta_zz', 'pa_z', 'pa_zz', 'U_z', 'U_zz', 'rh_z', 'rh_zz']

# Feature name -> position in all_possible_features
loc_all = {name: idx for idx, name in enumerate(all_possible_features)}

# Features
if no_features == 5:
    features = ['rh', 'ta', 'clw', 'cli', 'rh_z']
elif no_features == 8:
    features = ['rh', 'ta', 'clw', 'cli', 'rh_z', 'rh_zz', 'pa_z', 'pa_zz']
else:
    # Fail right here instead of with a NameError on `features` further down.
    raise ValueError('no_features must be 5 or 8, got %r' % (no_features,))

# Feature name -> position in the selected feature list
loc = {name: idx for idx, name in enumerate(features)}

In [3]:
# Data directory below the user's home directory (requires the HOME environment variable).
path_data = os.path.join(os.environ['HOME'], 'my_work', 'icon-ml_data', 'cloud_cover_parameterization',
                         'neighborhood_based_SR_DYAMOND')

# Load the input data and keep only the columns named in `features`
# (rh, ta, clw, cli, rh_z for the default of five features), in that order.
input_data = np.load(os.path.join(path_data, 'cloud_cover_input_dyamond.npy'))
input_data = np.stack([input_data[:, loc_all[sel_var]] for sel_var in features], axis=1)

output_data = np.load(os.path.join(path_data, 'cloud_area_output_dyamond.npy'))

In [4]:
# Number of samples (rows) and number of selected features (columns)
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(285179494, 5)

In [5]:
# Construct training and validation data: three folds built from six equally long slices
training_folds = []
validation_folds = []
two_week_incr = samples_total//6

for i in range(3):
    # Note that this is a temporal split since time was the first dimension in the original tensor.
    # Fold i validates on slice i and slice i+3; everything else is used for training.
    first_incr = np.arange(two_week_incr*i, two_week_incr*(i+1))
    second_incr = np.arange(two_week_incr*(i+3), two_week_incr*(i+4))

    validation_folds.append(np.concatenate((first_incr, second_incr)))
    # The indices equal their positions in arange, so deleting the validation
    # positions leaves exactly the training indices.
    training_folds.append(np.delete(np.arange(samples_total), validation_folds[i]))

In [6]:
# The second fold (index 1) yields the best model, so only this split is kept
flattened_input_train, flattened_input_valid = (
    input_data[training_folds[1]],
    input_data[validation_folds[1]],
)
flattened_output_train, flattened_output_valid = (
    output_data[training_folds[1]],
    output_data[validation_folds[1]],
)

# Drop the full arrays and the fold indices, then trigger a garbage collection
del input_data, output_data
del training_folds, validation_folds
gc.collect()

0

**Split the data into decision tree-based regimes**

In [7]:
if no_of_regimes > 1:
    # Already remove the regime with clw + cli = 0.
    # loc_issue selects the lookup table: loc_all indexes the full feature list,
    # loc only the selected features that form the columns of the input arrays.
    # NOTE(review): loc_issue = True presumably reproduces an earlier indexing
    # mistake on purpose -- confirm before removing the switch.
    col_index = loc_all if loc_issue else loc
    reg_not_0_train = np.where(flattened_input_train[:, col_index['clw']] + flattened_input_train[:, col_index['cli']] > 1e-20)[0]
    reg_not_0_valid = np.where(flattened_input_valid[:, col_index['clw']] + flattened_input_valid[:, col_index['cli']] > 1e-20)[0]

    flattened_input_train = flattened_input_train[reg_not_0_train]
    flattened_output_train = flattened_output_train[reg_not_0_train]
    flattened_input_valid = flattened_input_valid[reg_not_0_valid]
    flattened_output_valid = flattened_output_valid[reg_not_0_valid]

    # We only need to split the regimes further if no_of_regimes > 2
    if no_of_regimes > 2:
        # Take a subset of the data to train the decision tree on.
        # It gets its own name so that the configured subset_size, which is used
        # later to draw the training subset, is no longer overwritten here.
        tree_subset_size = 10**7 # or 10**6

        inds = np.random.randint(0, flattened_input_train.shape[0], tree_subset_size)
        input_subset = flattened_input_train[inds]
        output_subset = flattened_output_train[inds]

        classification_tree = tree.DecisionTreeRegressor(max_depth=3, max_leaf_nodes=(no_of_regimes-1)) # set max_depth to [2,3]
        classification_tree.fit(input_subset, output_subset)
        text_representation = tree.export_text(classification_tree, feature_names=features)
        print(text_representation)

        # Leaf id of every sample; computed once per data set and reused below
        leaf_ids_train = classification_tree.apply(flattened_input_train)
        leaf_ids_valid = classification_tree.apply(flattened_input_valid)

        ind_reg_train = np.where(leaf_ids_train == regime)
        ind_reg_valid = np.where(leaf_ids_valid == regime)

        # Sometimes, the regime is called differently...
        # Check the number of matches: summing the matching indices would also
        # give 0 if sample 0 were the only match.
        if ind_reg_train[0].size == 0:
            print('The regime %d does not exist, switching to regime %d instead.'%(regime, no_of_regimes))
            ind_reg_train = np.where(leaf_ids_train == no_of_regimes)
            ind_reg_valid = np.where(leaf_ids_valid == no_of_regimes)

        # The per-sample leaf ids are not needed any more
        del leaf_ids_train, leaf_ids_valid

        flattened_input_train = flattened_input_train[ind_reg_train]
        flattened_input_valid = flattened_input_valid[ind_reg_valid]

        flattened_output_train = flattened_output_train[ind_reg_train]
        flattened_output_valid = flattened_output_valid[ind_reg_valid]

**Pick the subset**

In [8]:
# Draw subset_size random row indices (with replacement) from the training data
subset = np.random.randint(low=0, high=len(flattened_output_train), size=subset_size)

**Normalize the features**

In [9]:
# Scale the data with fixed per-feature means and standard deviations.
# Both lists have one entry per name in all_possible_features, in the same order.
mean_all = [4.12205844e-03,2.25493498e-05,3.38180032e-06,2.57065512e+02,6.00030443e+04,5.64080139e+03,2.35046400e-01,1.32776682e+01,6.02512234e-01,9.86270417e+04,-1.27545273e-06,-4.02484958e-10,1.65204582e-08,-4.34660202e-11,4.29441131e-10,-1.82817316e-12,-4.68742483e-03,-7.54899040e-07,-7.51544542e+00,-1.06989723e-04,1.65615172e-03,-9.27604679e-06,-4.76200071e-05,-1.32246548e-07]
std_all = [5.07648249e-03,5.69702638e-05,1.01308124e-05,3.00533874e+01,3.12514292e+04,5.66963918e+03,4.11184302e-01,1.11389888e+01,3.32494615e-01,6.24039256e+03,2.03179260e-06,1.17041141e-08,1.33311867e-07,1.42840744e-09,6.73384546e-09,5.07424672e-11,5.82875686e-03,6.34826092e-05,3.53136052e+00,1.13215264e-02,6.62892130e-03,6.08144307e-05,2.58065098e-04,2.49552692e-06]

# Pick the statistics of the selected features, in the order of `features`
mean = np.array([mean_all[loc_all[name]] for name in features])
std = np.array([std_all[loc_all[name]] for name in features])

# Work with scaled training folds (mean and std broadcast over the rows)
train_data_scaled = (flattened_input_train - mean)/std
valid_data_scaled = (flattened_input_valid - mean)/std

**Run SINDy-PI**

In [28]:
# Ten random row indices (with replacement) into the scaled training data
M = np.random.randint(low=0, high=len(train_data_scaled), size=10)

In [46]:
# Fit SINDy-PI on the ten sampled rows, passing the matching targets as second argument.
# NOTE(review): the fit prints one "Model i" line per input column (five in total) and
# the later predict call returns five values per sample instead of one target value,
# so the second argument is presumably not used as a regression target here --
# TODO confirm against the pysindy SINDyPI documentation.
model = ps.SINDyPI()
model.fit(train_data_scaled[M], flattened_output_train[M])

Model  0
Model  1
Model  2
Model  3
Model  4


In [44]:
# Shape of the sampled inputs that were passed to model.fit
train_data_scaled[M].shape

(10, 5)

In [39]:
# Shape of the complete scaled training set
train_data_scaled.shape

(124950762, 5)

In [42]:
# Model output for the first scaled training sample; note that it has one value per input column
model.predict(train_data_scaled[:1])

array([[-0.48581656, -1.23785105, -0.66650038, -0.25137623,  0.00913024]])

In [43]:
# Target value of the first training sample, for comparison with the model output
flattened_output_train[0]

0.0

In [10]:
# Shape of the complete scaled training set
train_data_scaled.shape

(124950762, 5)

In [11]:
# Toy data: two exponentials evaluated at 100 equidistant points in [0, 1]
t = np.linspace(0, 1, num=100)
x = 3 * np.exp(-2 * t)
y = 0.5 * np.exp(t)
X = np.column_stack((x, y))  # Column 0 holds x, column 1 holds y

In [20]:
# Shape of t after appending an axis, i.e. of the 2-D input handed to model.fit below
np.expand_dims(t, 1).shape

(100, 1)

In [19]:
# Shape of the toy target y
y.shape

(100,)

In [21]:
# Repeat the experiment on the toy data: t as a single input column, y as second argument.
# NOTE(review): only one "Model 0" line is printed, matching the single input column --
# whether y enters the fit at all should be confirmed against the pysindy documentation.
model = ps.SINDyPI()
# Alternative call, currently disabled: model.fit(X, t=t)
model.fit(np.expand_dims(t, 1), y)

Model  0


In [25]:
# Coefficients fitted on the toy data
model.coef_

array([[6.65968924e-09]])