tb - 7/8/2022 - Generalizing the data generator for sparse linear regression using stochastic gradient descent. The goal is to select the terms found in [093] using the entire dataset. One of the first steps will be to compare the features selected when training on both datasets at once with those selected when training on the individual (-4K) or (+4K) climates.

# Imports and Initialization

## Imports

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import PolynomialFeatures

from cbrain.cam_constants import *
from cbrain.climate_invariant import *
from cbrain.equation_discovery import *
from cbrain.preprocessing.convert_dataset_20191113 import compute_LHF_nsDELQ
from cbrain.climate_invariant_utils import *
from scipy.integrate import cumtrapz,trapz
from scipy import interpolate,misc

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pickle
import xarray as xr

/nfspool-0/home/tbeucler/CBRAIN-CAM/notebooks/tbeucler_devlog


In [2]:
# Plot-wide constants: `fz` is the font size and `lw` the line width applied below.
fz, lw, siz = 12, 2, 100

# Matplotlib defaults shared by every figure in this notebook.
plt.rc('text', usetex=False)
mpl.rcParams.update({'mathtext.fontset': 'stix',
                     'font.family': 'STIXGeneral'})
plt.rc('font', family='serif', size=fz)  # runs after the update above, so 'font.family' ends up as 'serif'
mpl.rcParams['lines.linewidth'] = lw

## Load data

In [3]:
# Root directory that every data path in this notebook is built from.
path_data = '/DFS-L/DATA/pritchard/tbeucler/SPCAM/SPCAM_PHYS/'

# Labels for the training climates, the dataset splits and the test climates.
climate_str = ['cold','hot','both']
set_str = ['train','valid','test']
test_clim_str = ['cold','hot','both','medium']

# File names per climate, listed in the order of `set_str` (train, valid, test).
_split_files = {
    'cold': ['2021_03_18_O3_TRAIN_M4K_shuffle.nc',
             '2021_03_18_O3_VALID_M4K.nc',
             '2021_03_18_O3_TEST_M4K.nc'],
    'hot': ['2021_03_18_O3_TRAIN_P4K_shuffle.nc',
            '2021_03_18_O3_VALID_P4K.nc',
            '2021_03_18_O3_TEST_P4K.nc'],
    'both': ['2022_04_18_TRAIN_M4K_P4K_shuffle.nc',
             '2022_04_18_VALID_M4K_P4K.nc',
             '2022_04_18_TEST_M4K_P4K.nc'],
    'medium': ['2021_01_24_O3_TRAIN_shuffle.nc',
               '2021_01_24_O3_VALID.nc',
               '2021_01_24_O3_TEST.nc'],
}
# Full paths: path_array[climate] is the list [train, valid, test].
path_array = {climate: [path_data+fname for fname in fnames]
              for climate,fnames in _split_files.items()}

# Normalization files.
path_input_norm = f'{path_data}2021_01_24_NORM_O3_small.nc'
path_norm_RH = f'{path_data}2021_02_01_NORM_O3_RH_small.nc'
path_norm_BMSE = f'{path_data}2021_06_16_NORM_BMSE_small.nc'
path_norm_LHF_nsDELQ = f'{path_data}2021_02_01_NORM_O3_LHF_nsDELQ_small.nc'

# We take the large-scale climate state as inputs
in_vars = ['QBP','TBP','PS','SOLIN','SHFLX','LHFLX']
# and we output the response of clouds/storms to these climate conditions
out_vars = ['PHQ','TPHYSTND','QRL','QRS']
# Scaling dictionary that is later handed to dic_to_array.
# Opened in a `with` block so the file handle is closed as soon as the pickle is read.
# NOTE: unpickling executes arbitrary code; only load files from a trusted location.
with open(path_data+'009_Wm2_scaling.pkl','rb') as scale_file:
    scale_dict = pickle.load(scale_file)

In [4]:
# Files holding the 'RH_train' / 'B_train' variables (training) and their test counterpart.
path_RH_B_train = f'{path_data}2022_07_08_B_RH_TRAIN_both.nc'
path_RH_B_test = f'{path_data}2022_06_26_B_RH_both.nc'

In [5]:
# Test climates; the two lists below hold one file name per climate, in the same order.
climates = ['m4K','ref','p4K']
path_append = [
    '2021_03_18_O3_TEST_M4K.nc',
    '2021_01_24_O3_TEST.nc',
    '2021_03_18_O3_TEST_P4K.nc',
]
path_RH_B_append = ['2022_06_27_B_RH_TEST_'+climate_tag+'.nc' for climate_tag in climates]

# LHF_nsDELQ files (test and training); both names are relative to path_data.
path_LHF_nsDELQ = '2022_06_29_LHF_nsDELQ.nc'
path_LHF_nsDELQ_train = '2022_07_08_LHF_nsDELQ_TRAIN_both.nc'

In [6]:
# Open the LHF_nsDELQ datasets: training file first, then the test file.
LHF_nsDELQ_train, LHF_nsDELQ_test = (
    xr.open_dataset(path_data+fname)
    for fname in (path_LHF_nsDELQ_train, path_LHF_nsDELQ)
)

In [7]:
# Combined training set (first entry of path_array['both']) and its RH/B companion file.
path_train_both = path_array['both'][0]
train_both = xr.open_dataset(path_train_both)
train_both_RH_B = xr.open_dataset(path_RH_B_train)

In [8]:
# Containers filled per test climate by the loading loop below.
test_sets, test_sets_RHB = {}, {}
# Variable names read from the LHF_nsDELQ test file, one per entry of `climates`.
LHFname = ['LHFns_'+tag for tag in ('cold','med','hot')]

In [9]:
# Open, for every test climate, the main test file and its RH/B companion file.
# NOTE: `clim` and `iclim` stay bound after this loop (to the last climate);
# later cells read `clim` outside of any loop.
for iclim,clim in enumerate(climates):
    test_sets[clim] = xr.open_dataset(path_data+path_append[iclim])
    test_sets_RHB[clim] = xr.open_dataset(path_data+path_RH_B_append[iclim])

In [10]:
# Columns of the 'vars' axis used as inputs: 0-59 followed by 90-93 (64 columns in total).
ind_input = np.r_[0:60, 90:94]
# Columns used as outputs: the 60 columns 94-153.
ind_output = np.arange(60) + 94

In [11]:
# Per-climate test containers (inputs, outputs, RH, B, LHFns), filled a few cells below.
xt, yt, xRHt, xBt, xLHFnst = {}, {}, {}, {}, {}

In [12]:
# Variable names inside the RH/B test files, one per entry of `climates`.
_test_climate_tags = ('cold','med','hot')
name_RH = ['RH_test_'+tag for tag in _test_climate_tags]
name_B = ['B_test_'+tag for tag in _test_climate_tags]

In [13]:
# Training arrays: input/output columns of 'vars' plus the RH, B and LHFns variables.
x = train_both['vars'][:,ind_input]
xRH = train_both_RH_B['RH_train']
xB = train_both_RH_B['B_train']
xLHFns = LHF_nsDELQ_train['LHFns']
y = train_both['vars'][:,ind_output]
# Same quantities for every test climate, stored in dictionaries keyed by climate name.
# NOTE: `clim` / `iclim` remain bound to the last climate after this loop, and later
# cells use `xt[clim]` outside of any loop.
for iclim,clim in enumerate(climates):
    xt[clim] = test_sets[clim]['vars'][:,ind_input]
    xRHt[clim] = test_sets_RHB[clim][name_RH[iclim]]
    xBt[clim] = test_sets_RHB[clim][name_B[iclim]]
    xLHFnst[clim] = LHF_nsDELQ_test[LHFname[iclim]]
    yt[clim] = test_sets[clim]['vars'][:,ind_output]

# First attempt using least-squares error (no SGD)

In [14]:
# Feature sets to fit; each name is a key of the KEYS dictionary.
Model_Types = ['localBF','localCI','BF','CI']
# Number of random subsamples, highest polynomial degree, rows drawn per subsample.
N_subsample, N_degree, Nsample = 5, 4, 3000

In [15]:
# Extra keys appended to the "local" feature sets to form the 'BF' and 'CI' sets.
_nonlocal_BF = ['Q_above','Q_below','T_below','T_above']
_nonlocal_CI = ['RH_below','RH_above','B_below','B_above']

# Feature names per model type. The three "local" entries are plain lists;
# 'all', 'BF' and 'CI' are NumPy string arrays produced by np.concatenate.
KEYS = {
    'local': ['ps', 'S0', 'SHF', 'LHF', 'LHFns', 'p',
              'q', 'dq_dp_FD', 'd2q_dp2_FD',
              'T', 'dT_dp_FD', 'd2T_dp2_FD',
              'RH', 'dRH_dp_FD', 'd2RH_dp2_FD',
              'B', 'dB_dp_FD', 'd2B_dp2_FD'],
    'localCI': ['ps', 'S0', 'SHF', 'LHFns', 'p',
                'RH', 'dRH_dp_FD', 'd2RH_dp2_FD',
                'B', 'dB_dp_FD', 'd2B_dp2_FD'],
    'localBF': ['ps', 'S0', 'SHF', 'LHF', 'p',
                'q', 'dq_dp_FD', 'd2q_dp2_FD',
                'T', 'dT_dp_FD', 'd2T_dp2_FD'],
}
KEYS['all'] = np.concatenate((KEYS['local'], _nonlocal_BF + _nonlocal_CI))
KEYS['BF'] = np.concatenate((KEYS['localBF'], _nonlocal_BF))
KEYS['CI'] = np.concatenate((KEYS['localCI'], _nonlocal_CI))

# Key groups passed to range_normalizer as two separate lists.
scalar_keys = ['ps','S0','SHF','LHF','LHFns']
vector_keys = [
    'p',
    'q', 'dq_dp_FD', 'd2q_dp2_FD', 'Q_above', 'Q_below',
    'T', 'dT_dp_FD', 'd2T_dp2_FD', 'T_above', 'T_below',
    'RH', 'dRH_dp_FD', 'd2RH_dp2_FD', 'RH_above', 'RH_below',
    'B', 'dB_dp_FD', 'd2B_dp2_FD', 'B_above', 'B_below',
]
combin_keys = np.concatenate((scalar_keys,vector_keys))

In [16]:
# Settings handed to SFS_poly; entry `degree-1` is the one used for polynomial degree `degree`.
min_features = [1]*5
max_features = [11] + [6]*4
cv = [2]*5

In [17]:
# Result containers for the dQ/dt and dT/dt fits, keyed by subsample index in the loops below.
dict_Q, dict_T = {}, {}

## Second attempt, this time using both the cold and the warm parts of the dataset

In [None]:
# Feature selection on polynomial features, repeated for N_subsample random subsamples.
# For each subsample, each entry of Model_Types and each degree 1..N_degree, SFS_poly is
# called once with the dQ/dt targets and once with the dT/dt targets. The fits of every
# (subsample, model type) pair are pickled under path_data+'Polynomial_Fits/'.
# NOTE(review): `hyam` / `hybm` are not defined in this notebook; presumably they come from
# one of the `from cbrain... import *` lines - confirm.

# Size of the index pool the subsample is drawn from. It is read from the last test climate
# (climates[-1]), which is the value the loop variable `clim` holds after a top-to-bottom
# run; naming it explicitly avoids depending on a variable leaked by an earlier loop.
# NOTE(review): the pool is sized by a *test* set although the indices select rows of the
# training arrays (x, y, xRH, ...); `x.shape[0]` may be what is intended - confirm.
n_pool = xt[climates[-1]].shape[0]

for i_subsample in range(N_subsample):
    print('i_subsample='+str(i_subsample)+'/'+str(N_subsample-1))
    dict_Q[i_subsample] = {}; dict_T[i_subsample] = {}

    # Pre-process & sub-sample the data: Nsample distinct integer indices in [0, n_pool)
    i_random = np.random.choice(n_pool,size=(Nsample,),replace=False).astype('int')
    dict_Q[i_subsample]['Selected_indices'] = i_random
    dict_T[i_subsample]['Selected_indices'] = i_random

    x_train,x_test,y_train,y_test = subsampler(i_random,x,y,xRH,xB,xLHFns,hyam,hybm,variables=KEYS['all'])
    x_train_range,x_test_range,Norm = range_normalizer(x_train,scalar_keys,vector_keys)

    for model_type in Model_Types:
        print('model_type='+model_type)
        dict_Q[i_subsample][model_type] = {}; dict_T[i_subsample][model_type] = {}

        # Transform dictionary into array for regression purposes
        X_train,X_test,dQdt_train,dQdt_test,dTdt_train,dTdt_test = \
        dic_to_array(KEYS[model_type],x_train_range,y_train,scale_dict)

        for degree in np.arange(1,N_degree+1):
            print('degree='+str(degree))
            ideg = degree-1  # index into min_features / max_features / cv

            # Polynomial features
            poly = PolynomialFeatures(degree=degree)
            X_train_poly = poly.fit_transform(X_train)

            # Update the feature names. `get_feature_names` was removed in scikit-learn 1.2;
            # fall back to `get_feature_names_out` (converted to a list of str) when it is gone.
            base_names = np.array(KEYS[model_type])
            if hasattr(poly,'get_feature_names'):
                features = poly.get_feature_names(base_names)
            else:
                features = poly.get_feature_names_out(base_names).tolist()

            # Linear regression on polynomial features - dQ/dt & dT/dt
            dict_Q[i_subsample][model_type][degree] = \
            SFS_poly(features,X_train_poly,dQdt_train,
                     min_features[ideg],max_features[ideg],cv[ideg])

            dict_T[i_subsample][model_type][degree] = \
            SFS_poly(features,X_train_poly,dTdt_train,
                     min_features[ideg],max_features[ideg],cv[ideg])

        # Write the Q and T fits of this (subsample, model type) pair to a pkl file.
        # The `with` block closes the file even if pickling fails.
        save_path = path_data+'Polynomial_Fits/2022_07_18_dicQT_'+model_type+'_isample_'+str(i_subsample+1)+'.pkl'
        with open(save_path,'wb') as fit_file:
            pickle.dump({
              "Qfits": dict_Q[i_subsample][model_type],
              "Tfits": dict_T[i_subsample][model_type],
              'Selected_indices': i_random}, fit_file)

i_subsample=0/4
model_type=localBF
degree=1
{'d2q_dp2_FD': 81.8115371790674, 'LR_Bias': -44.33369190294277, 'mse_train': 933.3850451581445} 

{'LHF': 15.66882448528694, 'd2q_dp2_FD': 76.65701744898779, 'LR_Bias': -44.42018591122975, 'mse_train': 929.7930889124533} 

{'LHF': 17.474299208897666, 'q': -9.998625988901928, 'd2q_dp2_FD': 85.07880152552428, 'LR_Bias': -48.68814066961484, 'mse_train': 928.5386999946804} 

{'LHF': 16.636161946405405, 'q': -39.09367539260514, 'dq_dp_FD': 114.2334331520406, 'd2q_dp2_FD': 49.8825610402677, 'LR_Bias': -78.17102530087627, 'mse_train': 920.4865697937593} 

{'ps': 6.985720156427852, 'LHF': 15.71204940368684, 'q': -38.967970022457486, 'dq_dp_FD': 113.8498208532948, 'd2q_dp2_FD': 50.168472827658825, 'LR_Bias': -82.29429190681124, 'mse_train': 919.7071802557098} 

{'ps': 6.69753836397759, 'SHF': -10.004518083934773, 'LHF': 19.370312282365134, 'q': -39.71921280825016, 'dq_dp_FD': 112.81740470771666, 'd2q_dp2_FD': 51.40995494009371, 'LR_Bias': -80.57156183

{'SHF^2 q^2': -1265.3010755068915, 'SHF p q d2T_dp2_FD': 4744.6676573737695, 'SHF q dT_dp_FD^2': -6191.309983177184, 'LHF p^2 dq_dp_FD': 191.45426027595494, 'LHF dq_dp_FD^3': 288.07049936346493, 'LR_Bias': -4.382389836045628, 'mse_train': 854.2580494069873} 

{'SHF^2 q dT_dp_FD': 564.2677507899431, 'LR_Bias': -1.6316611960085479, 'mse_train': 498.36502307379783} 

{'SHF^2 p q': -9054.091857263624, 'SHF^2 q dT_dp_FD': 12441.556662716526, 'LR_Bias': -2.6216080459376587, 'mse_train': 461.82407007643786} 

{'ps LHF p dq_dp_FD': -255.01192815902473, 'SHF^2 p q': -9840.10177064007, 'SHF^2 q dT_dp_FD': 14587.338431806047, 'LR_Bias': 0.23733057961425574, 'mse_train': 440.2340514870997} 

{'ps LHF p dq_dp_FD': -282.4774675327616, 'SHF^2 p q': -11910.442831608867, 'SHF^2 q dT_dp_FD': 181786.65381925815, 'SHF^2 q d2T_dp2_FD': -179183.45185447214, 'LR_Bias': 0.44313204162562103, 'mse_train': 422.1830380787789} 

{'ps LHF p dq_dp_FD': -301.9416295079322, 'S0 dq_dp_FD^3': 89.20235172027496, 'SHF^2 p

{'S0 SHF d2RH_dp2_FD': 50.52530720451592, 'SHF RH dB_dp_FD': -203.10237961461547, 'LHFns RH^2': 332.53788386704827, 'B^3': -9.590061451435762, 'LR_Bias': 1.2845674631632336, 'mse_train': 450.77257171302205} 

{'S0 SHF d2RH_dp2_FD': 50.753752519442614, 'SHF RH dB_dp_FD': -136.38491654656733, 'LHFns RH^2': 445.9255574597448, 'LHFns RH dB_dp_FD': -162.34081277116235, 'B^3': -7.931408611189564, 'LR_Bias': 1.3125935302215805, 'mse_train': 448.47682537641765} 

degree=4
{'LHFns RH^3': -159.29513724329453, 'LR_Bias': 3.063447385425223, 'mse_train': 916.8559371484154} 

{'LHFns RH^3': -1179.565456471423, 'LHFns RH^2 dB_dp_FD': 933.0874579580048, 'LR_Bias': -0.6675798586156702, 'mse_train': 888.2812427692043} 

{'LHFns RH^3': -1250.8494156920515, 'LHFns RH^2 d2RH_dp2_FD': -437.21198396790345, 'LHFns RH^2 dB_dp_FD': 1391.3649960572411, 'LR_Bias': -0.6315590059055093, 'mse_train': 884.4091853481397} 

{'ps LHFns p^2': 53.76781579178767, 'LHFns RH^3': -1036.1288274959513, 'LHFns RH^2 d2RH_dp2_FD':

## First attempt only using the cold part of the dataset (once again)

In [None]:
# Feature selection on polynomial features with evaluation on the test climates, repeated
# for N_subsample random subsamples. For each subsample, each entry of Model_Types and each
# degree 1..N_degree, SFS_poly is called once with the dQ/dt targets and once with the
# dT/dt targets; the fits of every (subsample, model type) pair are pickled under
# path_data+'Polynomial_Fits/'.
# NOTE(review): this cell calls subsampler / range_normalizer / dic_to_array / SFS_poly
# with train *and* test arguments, unlike the train-only calls in the previous loop;
# both cannot match the same version of those helpers - confirm which one is current.
# NOTE(review): `hyam` / `hybm` are not defined in this notebook; presumably they come from
# one of the `from cbrain... import *` lines - confirm.

# Size of the index pool the subsample is drawn from. It is read from the last test climate
# (climates[-1]), which is the value the loop variable `clim` holds after a top-to-bottom
# run; naming it explicitly avoids depending on a variable leaked by an earlier loop.
# NOTE(review): the pool is sized by a *test* set although the indices select rows of the
# training arrays (x, y, xRH, ...) - confirm this restriction is intended.
n_pool = xt[climates[-1]].shape[0]

for i_subsample in range(N_subsample):
    print('i_subsample='+str(i_subsample)+'/'+str(N_subsample-1))
    dict_Q[i_subsample] = {}; dict_T[i_subsample] = {}

    # Pre-process & sub-sample the data: Nsample distinct integer indices in [0, n_pool)
    i_random = np.random.choice(n_pool,size=(Nsample,),replace=False).astype('int')
    dict_Q[i_subsample]['Selected_indices'] = i_random
    dict_T[i_subsample]['Selected_indices'] = i_random

    x_train,x_test,y_train,y_test = subsampler(i_random,x,y,xRH,xB,xLHFns,
                                               xt,yt,xRHt,xBt,xLHFnst,hyam,hybm,
                                               KEYS['all'])
    x_train_range,x_test_range,Norm = range_normalizer(x_train,x_test,scalar_keys,vector_keys)

    for model_type in Model_Types:
        print('model_type='+model_type)
        dict_Q[i_subsample][model_type] = {}; dict_T[i_subsample][model_type] = {}

        # Transform dictionary into array for regression purposes
        X_train,X_test,dQdt_train,dQdt_test,dTdt_train,dTdt_test = \
        dic_to_array(KEYS[model_type],x_train_range,x_test_range,
                     y_train,y_test,scale_dict)

        for degree in np.arange(1,N_degree+1):
            print('degree='+str(degree))
            ideg = degree-1  # index into min_features / max_features / cv

            # Polynomial features
            poly = PolynomialFeatures(degree=degree)
            X_train_poly = poly.fit_transform(X_train)

            # Update the feature names. `get_feature_names` was removed in scikit-learn 1.2;
            # fall back to `get_feature_names_out` (converted to a list of str) when it is gone.
            base_names = np.array(KEYS[model_type])
            if hasattr(poly,'get_feature_names'):
                features = poly.get_feature_names(base_names)
            else:
                features = poly.get_feature_names_out(base_names).tolist()

            # Expand every test climate with the transformer fitted on the training data.
            # `transform` (not `fit_transform`) keeps the test sets out of the fit, and the
            # comprehension does not rebind the notebook-level `clim` / `iclim`.
            X_test_poly = {test_clim: poly.transform(X_test[test_clim])
                           for test_clim in climates}

            # Linear regression on polynomial features - dQ/dt & dT/dt
            dict_Q[i_subsample][model_type][degree] = \
            SFS_poly(features,X_train_poly,X_test_poly,
                     dQdt_train,dQdt_test,
                     min_features[ideg],max_features[ideg],cv[ideg])

            dict_T[i_subsample][model_type][degree] = \
            SFS_poly(features,X_train_poly,X_test_poly,
                     dTdt_train,dTdt_test,
                     min_features[ideg],max_features[ideg],cv[ideg])

        # Write the Q and T fits of this (subsample, model type) pair to a pkl file.
        # The `with` block closes the file even if pickling fails.
        save_path = path_data+'Polynomial_Fits/2022_07_12_dicQT_'+model_type+'_isample_'+str(i_subsample+1)+'.pkl'
        with open(save_path,'wb') as fit_file:
            pickle.dump({
              "Qfits": dict_Q[i_subsample][model_type],
              "Tfits": dict_T[i_subsample][model_type],
              'Selected_indices': i_random}, fit_file)

In [None]:
# save_path = path_data+'Polynomial_Fits/2022_07_12_dicQT_'+model_type+'_isample_'+str(i_subsample+1)+'.pkl'
# pickle.dump({
#   "Qfits": dict_Q[i_subsample][model_type],
#   "Tfits": dict_T[i_subsample][model_type],
#   'Selected_indices': i_random}, open(save_path,'wb'))

# Second attempt using the full dataset and terms selected by the SFS

## Load most important features calculated using subsamples of the training set

In [None]:
# Directory holding the polynomial-fit pickles; same location as path_data+'Polynomial_Fits/'.
path_Poly = '/DFS-L/DATA/pritchard/tbeucler/SPCAM/SPCAM_PHYS/Polynomial_Fits/'

In [None]:
# Load the term-count pickle; its 'Terms_Moisture' / 'Terms_Temperature' entries are read next.
# Opened in a `with` block so the file handle is closed as soon as the pickle is read.
# NOTE: unpickling executes arbitrary code; only load files from a trusted location.
with open(path_Poly+'2022_07_18_Terms_Count.pkl','rb') as terms_file:
    tmp = pickle.load(terms_file)

In [None]:
# Term counts for the moisture (Q) and temperature (T) fits, indexed by model name below.
termQ, termT = tmp['Terms_Moisture'], tmp['Terms_Temperature']

In [None]:
# Model names used to index termQ / termT; same four names as Model_Types above.
Models = ['localBF','localCI','BF','CI']

## Visualization of most selected terms

In [None]:
# One bar chart per model showing the terms whose selection count exceeds 4,
# with a green reference line at a count of 5.
term = termT

# Iterate over Models directly (instead of a hard-coded range(4)) so the figures stay in
# sync with the list, and draw through the Axes object rather than the pyplot state machine.
for model_name in Models:
    counts = term[model_name]['count']
    frequent = counts>4  # mask of the terms to display
    fig, ax = plt.subplots(1,1,figsize=(15,2.5))
    ax.bar(term[model_name]['variables'][frequent],counts[frequent])
    ax.axhline(y=5,color='g')
    ax.tick_params(axis='x',labelrotation=90,labelsize=fz)
    ax.set_title(model_name,fontsize=fz)

## Train model on the full dataset using the top selected features

### Prototype: LocalBF for subgrid moistening

In [None]:
# Model whose selected terms are used in this prototype section.
model = 'localBF'

In [None]:
# Keep the moisture terms of `model` whose selection count exceeds 4.
_model_terms = termQ[model]
_is_frequent = _model_terms['count']>4
terms = _model_terms['variables'][_is_frequent]

In [None]:
# Display the selected terms.
terms

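Find the "base terms" (bare variable names, exponent stripped) and the "base polynomials" (variables together with their exponent, e.g. `q^2`) that appear in the selected terms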

In [None]:
# Split every selected term (e.g. 'SHF^2 q dT_dp_FD') into its whitespace-separated factors
# and collect two de-duplicated lists:
#   unique_pol   - the factors as written, exponent included (e.g. 'SHF^2')
#   unique_terms - the bare variable names, with any '^exponent' suffix removed (e.g. 'SHF')
# The loop variable is deliberately not called `term`, so the `term` dictionary assigned in
# the plotting cell is not overwritten by a string.
_base_names = set(); _base_pols = set()
for selected_term in terms:
    for factor in selected_term.split():
        caret = factor.find('^')
        # A '^' at position 0 leaves no name in front of it, so such a factor is kept whole
        _base_names.add(factor[:caret] if caret>0 else factor)
        _base_pols.add(factor)
# Sorted so the order is reproducible; iteration order of a set of strings is not.
unique_terms = sorted(_base_names)
unique_pol = sorted(_base_pols)

In [None]:
# Display the bare variable names found in the selected terms.
unique_terms

Caveat: the current implementation of FD has a problem if a second-derivative term appears without its first-derivative counterpart.

In [None]:
# Display the factors (exponent included) found in the selected terms.
unique_pol

In [None]:
# Shape of the training input array.
x.shape

In [None]:
# Histogram (100 bins) of column 59 of the training inputs, over all rows.
plt.hist(x[:,59].values,bins=100);

In [None]:
# Same histogram restricted to the first xt[clim].shape[0] rows, i.e. the index pool the
# subsampling cells draw from (indices 0 .. xt[clim].shape[0]-1 inclusive).
# A slice stop is exclusive, so the stop is the pool size itself rather than size-1.
# NOTE: `clim` is the value left behind by an earlier loop over `climates`.
plt.hist(x[:xt[clim].shape[0],59].values,bins=100);

In [None]:
# Pre-process & sub-sample the data
i_random = np.random.choice(np.linspace(0,xt[clim].shape[0]-1,xt[clim].shape[0]),
                            size=((Nsample,)),replace=False).astype('int')
dict_Q[i_subsample]['Selected_indices'] = i_random;
dict_T[i_subsample]['Selected_indices'] = i_random;

In [None]:
# Build the train/test dictionaries for the variables in unique_terms only.
# None is passed where the loops above pass the random indices; presumably subsampler
# then keeps every training row (this section targets the full dataset) - confirm.
x_train,x_test,y_train,y_test = subsampler(None,x,y,xRH,xB,xLHFns,xt,yt,xRHt,xBt,xLHFnst,hyam,hybm,unique_terms)

In [None]:
# Call range_normalizer with only the scalar / vector keys that occur in unique_terms.
# The keys are filtered in the order of scalar_keys / vector_keys instead of through a set
# intersection, whose iteration order is arbitrary and can differ between kernels.
_selected_names = set(unique_terms)
x_train_range,x_test_range,Norm = range_normalizer(x_train,x_test,
                                                   [key for key in scalar_keys if key in _selected_names],
                                                   [key for key in vector_keys if key in _selected_names])