tb - 7/18/2022 - Adapting equation learning from data to Earth-like situations

# Imports and Initialization

## Libraries

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import PolynomialFeatures

from cbrain.cam_constants import *
from cbrain.climate_invariant import *
from cbrain.equation_discovery import *
from cbrain.preprocessing.convert_dataset_20191113 import compute_LHF_nsDELQ
from cbrain.climate_invariant_utils import *
from scipy.integrate import cumtrapz,trapz
from scipy import interpolate,misc

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pickle
import xarray as xr

/nfspool-0/home/tbeucler/CBRAIN-CAM/notebooks/tbeucler_devlog


## Figure parameters

In [5]:
# Module-level plotting constants: font size (fz), line width (lw) and siz.
fz = 12
lw = 2
siz = 100

# All rcParams are set through plt.rc(group, key=value), which writes
# rcParams['group.key'].
plt.rc('text', usetex=False)
plt.rc('mathtext', fontset='stix')
plt.rc('font', family='STIXGeneral')
# NOTE(review): this call replaces the 'STIXGeneral' family set on the previous
# line with 'serif' -- confirm which font family is actually intended.
plt.rc('font', family='serif', size=fz)
plt.rc('lines', linewidth=lw)

## Paths to data

In [18]:
# Root directory holding every data file referenced in this notebook.
path_data = '/DFS-L/DATA/pritchard/tbeucler/SPCAM/SPCAM_PHYS/'

# Labels used to index climates and data splits.
climate_str = ['cold','hot','both']
set_str = ['train','valid','test']
test_clim_str = ['cold','hot','both','medium']
climates = ['cold','medium','hot']

# For each climate: [train, valid, test] file paths, in the same order as set_str.
# 'both' is a placeholder with empty paths.
path_array = {
    'cold': [path_data + fname for fname in ('2021_04_18_RG_TRAIN_M4K_shuffle.nc',
                                             '2021_04_18_RG_VALID_M4K.nc',
                                             '2021_04_18_RG_TEST_M4K.nc')],
    'hot': [path_data + fname for fname in ('2021_04_18_RG_TRAIN_P4K_shuffle.nc',
                                            '2021_04_18_RG_VALID_P4K.nc',
                                            '2021_04_18_RG_TEST_P4K.nc')],
    'both': ['','',''],
    'medium': [path_data + fname for fname in ('2021_06_03_RG_TRAIN_shuffle.nc',
                                               '2021_06_03_RG_VALID.nc',
                                               '2021_06_06_RG_TEST.nc')],
}

# Normalization file for the raw inputs, then (training file, normalization file)
# pairs for each rescaled input: RH, BMSE and LHF_nsDELQ.
path_input_norm = path_data + '2021_04_18_RG_small.nc'
path_train_RH, path_norm_RH = (path_data + '2021_01_24_O3_small_shuffle.nc',
                               path_data + '2021_02_01_NORM_O3_RH_small.nc')
path_train_BMSE, path_norm_BMSE = (path_data + '2021_06_16_BMSE_small_shuffle.nc',
                                   path_data + '2021_06_16_NORM_BMSE_small.nc')
path_train_LHF_nsDELQ, path_norm_LHF_nsDELQ = (
    path_data + '2021_02_01_O3_LHF_nsQ_small_shuffle.nc',
    path_data + '2021_02_01_NORM_O3_LHF_nsDELQ_small.nc')

# We take the large-scale climate state as inputs ...
in_vars = ['QBP','TBP','PS','SOLIN','SHFLX','LHFLX']
# ... and we output the response of clouds/storms to these climate conditions
out_vars = ['PHQ','TPHYSTND','QRL','QRS']
# Load the output scaling dictionary. The context manager closes the file handle
# deterministically (the previous bare open() left it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(path_data+'009_Wm2_scaling.pkl','rb') as scaling_file:
    scale_dict = pickle.load(scaling_file)
# File names (relative, without path_data) of additional test/training datasets.
path_append = ['2021_03_18_O3_TEST_M4K.nc','2021_01_24_O3_TEST.nc','2021_03_18_O3_TEST_P4K.nc']
path_RH_B_append = ['2022_06_27_B_RH_TEST_m4K.nc','2022_06_27_B_RH_TEST_ref.nc','2022_06_27_B_RH_TEST_p4K.nc']
path_LHF_nsDELQ = '2022_06_29_LHF_nsDELQ.nc'
path_LHF_nsDELQ_train = '2022_07_08_LHF_nsDELQ_TRAIN_both.nc'
# Shallow copy of the scaling dict with an extra entry for 'RH'.
scale_dict_RH = scale_dict.copy()
# Store a scalar: the previous trailing comma turned this value into a 1-tuple.
# Arbitrary 0.01 factor as specific humidity is generally below 2%
# (NOTE(review): the original comment said 0.1 while the code uses 0.01 -- confirm).
scale_dict_RH['RH'] = 0.01*L_S/G

## Load data

In [15]:
# Container for the training datasets, keyed by climate name.
train = dict()

In [16]:
# Open the training file of every climate; each path list is ordered like
# set_str, so the 'train' entry is looked up by name instead of a bare 0.
for clim in climates:
    train[clim] = xr.open_dataset(path_array[clim][set_str.index('train')])

# Calculate input distribution statistics

## Sub-sample all three climates and mix samples

In [None]:
# Named groups of input-variable keys. The three 'local*' groups are plain
# lists; the derived groups below are numpy string arrays.
KEYS = {
    'local': ['ps', 'S0', 'SHF', 'LHF','LHFns','p', 'q', 'dq_dp_FD',
              'd2q_dp2_FD','T', 'dT_dp_FD','d2T_dp2_FD','RH', 'dRH_dp_FD',
              'd2RH_dp2_FD','B', 'dB_dp_FD','d2B_dp2_FD'],
    'localCI': ['ps', 'S0', 'SHF','LHFns','p','RH', 'dRH_dp_FD',
                'd2RH_dp2_FD','B', 'dB_dp_FD','d2B_dp2_FD'],
    'localBF': ['ps', 'S0', 'SHF', 'LHF', 'p', 'q', 'dq_dp_FD',
                'd2q_dp2_FD','T', 'dT_dp_FD','d2T_dp2_FD'],
}

# Each local group extended with its '*_above' / '*_below' keys.
KEYS['all'] = np.concatenate([KEYS['local'],
                              ['Q_above','Q_below','T_below','T_above',
                               'RH_below','RH_above','B_below','B_above']])
KEYS['BF'] = np.concatenate([KEYS['localBF'],
                             ['Q_above','Q_below','T_below','T_above']])
KEYS['CI'] = np.concatenate([KEYS['localCI'],
                             ['RH_below','RH_above','B_below','B_above']])

# The same keys split into scalar_keys and vector_keys, then joined
# scalars first.
scalar_keys = ['ps','S0','SHF','LHF','LHFns']
vector_keys = ['p', 'q', 'dq_dp_FD', 'd2q_dp2_FD', 'Q_above', 'Q_below',
               'T', 'dT_dp_FD', 'd2T_dp2_FD', 'T_above', 'T_below',
               'RH', 'dRH_dp_FD', 'd2RH_dp2_FD', 'RH_above', 'RH_below',
               'B', 'dB_dp_FD', 'd2B_dp2_FD', 'B_above', 'B_below']
combin_keys = np.concatenate([scalar_keys,vector_keys])

In [10]:
# Display the dict of training datasets (one xarray.Dataset per climate).
train

{'cold': <xarray.Dataset>
 Dimensions:    (sample: 143161344, var_names: 184)
 Coordinates:
   * var_names  (var_names) object 'QBP' 'QBP' 'QBP' 'QBP' ... 'QRS' 'QRS' 'QRS'
 Dimensions without coordinates: sample
 Data variables:
     time       (sample) int64 ...
     lat        (sample) float64 ...
     lon        (sample) float64 ...
     vars       (sample, var_names) float32 ...,
 'medium': <xarray.Dataset>
 Dimensions:    (sample: 143161344, var_names: 184)
 Coordinates:
   * var_names  (var_names) object 'QBP' 'QBP' 'QBP' 'QBP' ... 'QRS' 'QRS' 'QRS'
 Dimensions without coordinates: sample
 Data variables:
     time       (sample) int64 ...
     lat        (sample) float64 ...
     lon        (sample) float64 ...
     vars       (sample, var_names) float32 ...,
 'hot': <xarray.Dataset>
 Dimensions:    (sample: 143161344, var_names: 184)
 Coordinates:
   * var_names  (var_names) object 'QBP' 'QBP' 'QBP' 'QBP' ... 'QRS' 'QRS' 'QRS'
 Dimensions without coordinates: sample
 Data vari

## Calculate empirical PDF,CDF

In [29]:
# Number of rows drawn (without replacement) by the sub-sampling step.
Nsample = int(2.5e6)

In [30]:
# Draw Nsample distinct row indices from the first axis of xt[clim].
# Passing the row count directly makes choice sample from range(n_rows); this
# avoids materialising a float linspace over every row index and casting the
# result back to int, while selecting the same indices.
# NOTE(review): no random seed is set in the visible cells, so the selection
# differs between runs -- confirm whether it is seeded elsewhere.
i_random = np.random.choice(xt[clim].shape[0],
                            size=(Nsample,),replace=False).astype('int')

In [31]:
# Sub-sample the rows selected by i_random, requesting every key in KEYS['all'].
x_train,x_test,y_train,y_test = subsampler(
    i_random,x,y,xRH,xB,xLHFns,hyam,hybm,
    variables=KEYS['all'])

In [32]:
# Scalar keys followed by vector keys (the same expression already appears in
# the cell that defines scalar_keys and vector_keys).
combin_keys = np.concatenate([scalar_keys,vector_keys])

In [33]:
def edg2bin(edges):
    """Return the midpoints between consecutive entries of `edges`.

    For an input of length N the result has length N-1; element i is the
    average of edges[i] and edges[i+1].
    """
    lower, upper = edges[:-1], edges[1:]
    return (lower + upper)/2

In [34]:
# Per-variable histogram density (PDF), bin edges (EDG) and cumulative sum (CDF).
PDF, EDG, CDF = {}, {}, {}

In [35]:
# Empirical PDF and CDF of every sub-sampled input variable.
for k in x_train.keys():
    print(k)
    # Density-normalised histogram over all values of the variable (250 bins).
    PDF[k],EDG[k] = np.histogram(x_train[k].flatten(),bins=250,density=True)
    db = np.array(np.diff(EDG[k]), float)  # width of each bin
    # Accumulate the probability mass of each bin (density * width). The previous
    # form db*PDF.cumsum() only equals this when every bin has the same width.
    CDF[k] = (PDF[k]*db).cumsum()

p
q
dq_dp_FD
d2q_dp2_FD
Q_above
Q_below
T
dT_dp_FD
d2T_dp2_FD
T_above
T_below
RH
dRH_dp_FD
d2RH_dp2_FD
RH_above
RH_below
B
dB_dp_FD
d2B_dp2_FD
B_above
B_below
ps
S0
SHF
LHF
LHFns


# Appendix A: Convert q,T,LHF to RH,B,LHFns to form the climate-invariant variables

## Data Generators

In [54]:
# Batch size shared by the data generators and the batch-wise fill loop.
N_batch = 2**13

In [55]:
def train_gen_rescaling(input_rescaling,path_norm,path_train,scale_dict):
    """Build a DataGeneratorCI for one set of (rescaled) input variables.

    Parameters
    ----------
    input_rescaling : list of input variable names passed as `input_vars`.
    path_norm : path passed as `norm_fn`.
    path_train : path passed as `data_fn`.
    scale_dict : object passed as `output_transform`.

    Returns
    -------
    The DataGeneratorCI instance. Outputs are always the module-level
    `out_vars`, and the input transform is fixed to ('mean', 'maxrs').
    """
    generator_kwargs = dict(
        data_fn=path_train,
        input_vars=input_rescaling,
        output_vars=out_vars,
        norm_fn=path_norm,
        input_transform=('mean', 'maxrs'),
        output_transform=scale_dict,
    )
    return DataGeneratorCI(**generator_kwargs)

In [56]:
# One generator per rescaled input. Generator_singleDS reads their
# input_transform.sub / input_transform.div attributes.
train_gen_RH = train_gen_rescaling(
    input_rescaling=['RH','TBP','PS', 'SOLIN', 'SHFLX', 'LHFLX'],
    path_norm=path_norm_RH,
    path_train=path_train_RH,
    scale_dict=scale_dict_RH)
train_gen_BMSE = train_gen_rescaling(
    input_rescaling=['QBP','BMSE','PS', 'SOLIN', 'SHFLX', 'LHFLX'],
    path_norm=path_norm_BMSE,
    path_train=path_train_BMSE,
    scale_dict=scale_dict)
train_gen_LHF_nsDELQ = train_gen_rescaling(
    input_rescaling=['QBP','TBP','PS', 'SOLIN', 'SHFLX', 'LHF_nsDELQ'],
    path_norm=path_norm_LHF_nsDELQ,
    path_train=path_train_LHF_nsDELQ,
    scale_dict=scale_dict)

In [57]:
def Generator_singleDS(path,rescaling=None):
    """Build a DataGeneratorCI reading the single dataset at `path`.

    Parameters
    ----------
    path : file passed to the generator as `data_fn`.
    rescaling : if equal to 'CI', the generator is additionally configured with
        the RH / BMSE / LHF_nsDELQ scalings and the sub/div statistics of the
        module-level train_gen_RH, train_gen_BMSE and train_gen_LHF_nsDELQ
        generators. Any other value (default None) builds the generator with
        the shared arguments only.

    Returns
    -------
    The configured DataGeneratorCI instance.

    Notes
    -----
    Relies on module-level names: path_data, N_batch, and (for 'CI') hyam, hybm
    and the three train_gen_* generators. The local path_input_norm
    intentionally shadows the module-level one and points to a different file.
    """
    in_vars = ['QBP','TBP','PS','SOLIN','SHFLX','LHFLX'] # We take the large-scale climate state as inputs
    out_vars = ['PHQ','TPHYSTND','QRL','QRS'] # and we output the response of clouds/storms to these climate conditions
    path_input_norm = path_data + '2021_01_24_NORM_O3_small.nc'
    # Close the scaling file as soon as it is read; the previous bare open()
    # leaked one file handle per call.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(path_data+'009_Wm2_scaling.pkl','rb') as scaling_file:
        scale_dict = pickle.load(scaling_file)

    # Arguments common to both generator flavours.
    gen_kwargs = dict(
        data_fn = path,
        input_vars = in_vars,
        output_vars = out_vars,
        norm_fn = path_input_norm,
        batch_size = N_batch,
        input_transform = ('mean', 'maxrs'),
        output_transform = scale_dict,
    )

    if rescaling=='CI':
        gen_kwargs.update(
            Qscaling = 'RH',
            Tscaling = 'BMSE',
            LHFscaling = 'LHF_nsDELQ',
            hyam = hyam, hybm = hybm, # Arrays to define mid-levels of hybrid vertical coordinate
            inp_sub_Qscaling = train_gen_RH.input_transform.sub, # What to subtract from RH inputs
            inp_div_Qscaling = train_gen_RH.input_transform.div, # What to divide RH inputs by
            inp_sub_Tscaling = train_gen_BMSE.input_transform.sub,
            inp_div_Tscaling = train_gen_BMSE.input_transform.div,
            inp_sub_LHFscaling = train_gen_LHF_nsDELQ.input_transform.sub,
            inp_div_LHFscaling = train_gen_LHF_nsDELQ.input_transform.div,
        )

    return DataGeneratorCI(**gen_kwargs)

In [58]:
# Generators keyed by [climate][split]: BFgen holds the default generators,
# CIgen those built with rescaling='CI'.
BFgen, CIgen = {}, {}

In [59]:
# Build a default and a rescaling='CI' generator for every climate and split.
for iclimate,clim in enumerate(climates):
    print('Climate = ',clim)
    BFgen[clim], CIgen[clim] = {}, {}

    for iset,st in enumerate(set_str):
        print('Set = ',st)

        # path_array[clim] is ordered like set_str, so iset selects the file of split st.
        BFgen[clim][st], CIgen[clim][st] = (
            Generator_singleDS(path_array[clim][iset]),
            Generator_singleDS(path_array[clim][iset],rescaling='CI'))

Climate =  cold
Set =  train
Set =  valid
Set =  test
Climate =  medium
Set =  train
Set =  valid
Set =  test
Climate =  hot
Set =  train
Set =  valid
Set =  test


## Create array to hold rescaled variables

In [60]:
# Climate and split converted below; iset is the index of set0 within each
# path_array[clim] list.
clim, set0, iset = 'cold', 'train', 0

In [61]:
# Generator built with rescaling='CI' for the selected climate and split.
gen = CIgen[clim][set0]
# Open the dataset file at path_array[clim][iset] for the same climate/split.
train_set = xr.open_dataset(path_array[clim][iset])
# NOTE(review): Dataset.copy() is shallow by default, so train_CI presumably
# shares its underlying data with train_set -- confirm this is acceptable.
train_CI = train_set.copy()
# NOTE(review): .values may expose the coordinate's underlying array rather than
# a copy; in-place edits of var_names_CI could then affect the dataset -- verify.
var_names_CI = train_CI['var_names'].values

In [62]:
# Relabel the input columns in place: entries 0-29 become 'RH', entries 30-59
# become 'BMSE', and entry 63 becomes 'LHF_nsDELQ'.
var_names_CI[:30] = 'RH'
var_names_CI[30:60] = 'BMSE'
var_names_CI[63] = 'LHF_nsDELQ'

In [63]:
# assign_coords returns a new Dataset instead of modifying train_CI in place, so
# the result has to be bound back to the name (it was previously discarded).
train_CI = train_CI.assign_coords({'var_names':var_names_CI})

In [64]:
# Zero-initialised float32 buffer with the same shape as train_CI['vars'];
# it is filled batch by batch afterwards.
newval_train = np.zeros(shape=train_CI['vars'].shape, dtype=np.float32)

## Assign new values using climate-invariant generators

In [None]:
# Number of complete batches, computed once instead of at every iteration.
# NOTE(review): rows beyond n_full_batches*N_batch are never written and keep
# their initial zeros if the row count is not a multiple of N_batch -- confirm.
n_full_batches = (gen.n_samples)//N_batch
for ibatch in range(n_full_batches):
    if ibatch % 10==0:
        print('progress=','%2.2f' % (100*ibatch/n_full_batches),
              '%','               ',end='\r')
    # Rows of the output buffer covered by this batch.
    rows = slice(ibatch*N_batch,(1+ibatch)*N_batch)
    # Multiply the generator inputs by `div` and add `sub` (presumably undoing
    # the generator's input normalization -- confirm).
    gen_pu = (gen[ibatch][0]*gen.input_transform.div+gen.input_transform.sub)
    # First 64 columns come from the generator; the rest are copied from the file.
    newval_train[rows,:] = np.concatenate(
        (gen_pu[:,:64],train_CI['vars'][rows,64:]),axis=1)

progress= 1.72 %                

In [None]:
# Histogram of columns 0-29 (relabelled 'RH'), pooled over all rows.
plt.hist(newval_train[:, 0:30].flatten(), bins=100);

In [None]:
# Histogram of columns 30-59 (relabelled 'BMSE'), pooled over all rows.
plt.hist(newval_train[:, 30:60].flatten(), bins=100);

In [None]:
# Histogram of column 63 (relabelled 'LHF_nsDELQ').
plt.hist(newval_train[:, 63].flatten(), bins=100);

## Extract and save RH, B, and LHF_nsDELQ

In [None]:
# Split the buffer into its three rescaled inputs, cast to float32:
# columns 0-29 -> RH_train, columns 30-59 -> B_train, column 63 -> LHFnsDELQ_train
# (the last one is a single column, hence 1-D).
RH_train, B_train, LHFnsDELQ_train = (
    np.float32(newval_train[:, cols])
    for cols in (slice(None, 30), slice(30, 60), 63)
)

In [None]:
# Data variables of the output dataset, each given as (dimension names, array).
# RH_train and B_train are (sample, level) slices; LHFnsDELQ_train is a single
# column and therefore 1-D, so it only carries the sample dimension (declaring
# two dimensions for it does not match its shape).
data = dict(
        RH_train=(["samples_both","pressure"],RH_train),
        B_train=(["samples_both","pressure"],B_train),
        LHFnsDELQ_train=(["samples_both"],LHFnsDELQ_train))

In [None]:
# Coordinates of the output dataset, each given as (dimension names, values).
# pm and pi are averaged over their first axis; they are defined outside the
# cells shown here (presumably mid-level and interface pressures -- confirm).
coord = {
    "pressure_midlevel": (["pressure"], np.mean(pm, axis=0)),
    "pressure_interfac": (["pressure_interface"], np.mean(pi, axis=0)),
    # One integer label per row of RH_train.
    "samples_mixed": (["samples_both"], np.arange(RH_train.shape[0])),
}

In [None]:
# Assemble the output dataset from the data variables and coordinates above.
da_attrs = {"description": "RH, B, and LHF_nsDELQ calculated using script [098]"}
da = xr.Dataset(data_vars=data, coords=coord, attrs=da_attrs)

In [None]:
# Write the assembled dataset to a netCDF file inside path_data.
da.to_netcdf(path=path_data+'2022_07_21_RG_B_RH_LHFns_TRAIN_m4K.nc')