## Preprocessing Qubicc

Converting the data into npy files makes it possible for us to work with it efficiently; loading the original data requires 500GB of RAM, which is difficult to guarantee. We preprocess QUBICC in a separate ipynb notebook precisely because of this issue.

1) We read the data
2) Reshape variables so that they have equal dimensionality
3) Reshape into data samples fit for the NN and convert into a DataFrame
4) Downsample the data: Remove data above 21 km, remove condensate-free clouds, combat class imbalance
5) Split into input and output
6) Save as npy

Note: We neither scale nor split the data into training/validation/test sets. <br>
The reason is that i) in order to scale we need the entire dataset but this can only be done in conjunction with the Qubicc dataset. Also for cross-validation different scalings will be necessary based on different subsets of the data, ii) The split into subsets will be done by the cross-validation procedure or not at all when training the final model.

In [26]:
# Ran with 900GB

import sys
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import gc
# import importlib
# importlib.reload(my_classes)

# Root directory of the user's project space; all other paths are built from it
base_path = '/pf/b/b309170'
# Directory into which the resulting .npy files are written at the end of the notebook
output_path = base_path + '/my_work/icon-ml_data/cloud_cover_parameterization/region_based_one_nn_R02B05/based_on_var_interpolated_data'

# Add path with my_classes to sys.path (front of the list, so it takes precedence)
sys.path.insert(0, base_path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Which days to load (passed on to load_data as its `days` argument)
days_qubicc = 'all_hcs'

from my_classes import load_data

# Number of vertical layers; overwritten later with the value read from the loaded data
VERT_LAYERS = 31

# Seed numpy's global random generator so that the permutation used for downsampling is reproducible
np.random.seed(10)

# Set output_var to one of {'cl', 'cl_area'}
output_var = 'cl'

## 1) Reading the data
### Input:
- coriolis: Coriolis parameter
- zg: Geometric height at full levels (3D)
- qv: Specific water vapor content (3D)
- qc: Specific cloud water content (3D)
- qi: Specific cloud ice content (3D)
- temp: Temperature (3D)
- pres: Pressure (3D)
- u: Zonal wind (3D)
- v: Meridional wind (3D)

$10$ input nodes

### Output:
- clc: Cloud Cover

$1$ output node

The data above 21km is capped.

In [2]:
# Variables to load, in the order in which they become columns later on.
# For cl_area only the output is needed, since the input already exists;
# 'clw' and 'cli' are still required to detect condensate-free clouds.
# The last two entries (coriolis and the output) get no above/below copies.
order_of_vars_qubicc = [
    'hus',
    'clw',
    'cli',
    'ta',
    'pfull',
    'ua',
    'va',
    'zg',
    'coriolis',
    output_var,
]

In [3]:
# Read the requested QUBICC variables; the result is used as a mapping variable name -> array
data_dict = load_data(
    source='qubicc',
    days=days_qubicc,
    resolution='R02B05',
    order_of_vars=order_of_vars_qubicc,
)

hus
clw
cli
ta
pfull
ua
va
cl


In [4]:
# Cast every variable to single precision as early as possible (presumably to reduce memory)
for var_name in list(data_dict):
    data_dict[var_name] = np.float32(data_dict[var_name])
gc.collect()

7348

In [5]:
# Show the array shape of every loaded variable
for var_name in data_dict:
    print(var_name, data_dict[var_name].shape)

hus (2162, 31, 78069)
clw (2162, 31, 78069)
cli (2162, 31, 78069)
ta (2162, 31, 78069)
pfull (2162, 31, 78069)
ua (2162, 31, 78069)
va (2162, 31, 78069)
zg (31, 78069)
coriolis (78069,)
cl (2162, 31, 78069)


In [6]:
# The output variable is 3D: (time, vertical layer, horizontal field)
TIME_STEPS, VERT_LAYERS, HORIZ_FIELDS = data_dict[output_var].shape

In [7]:
# Bring the lower-dimensional fields to the common (time, vertical, horizontal) shape.
# The vertical axis of the 3D variables themselves is left untouched.
data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'], 0), TIME_STEPS, axis=0)
try:
    # coriolis has only a horizontal axis: add and fill the time axis, then the vertical axis
    coriolis = np.repeat(np.expand_dims(data_dict['coriolis'], 0), TIME_STEPS, axis=0)
    data_dict['coriolis'] = np.repeat(np.expand_dims(coriolis, 1), VERT_LAYERS, axis=1)
    # Surface temperature (Try without?): temperature of the last layer, copied to every layer
    temp_sfc = np.repeat(np.expand_dims(data_dict['ta'][:, -1, :], axis=1), VERT_LAYERS, axis=1)
except KeyError:
    # Only a missing variable is tolerated; any other error (e.g. MemoryError) must surface
    print('There is probably no coriolis or temperature in order_of_vars_qubicc')

In [8]:
# Remove the timesteps of the QUBICC simulations in which clc is 0 across the entire earth
# (these are the first timesteps of the simulations).
# Only layers from index 4 onwards are checked (presumably because the layers above hold no valid values).
remove_steps = [
    step
    for step, field in enumerate(data_dict[output_var])
    if np.all(field[4:, :] == 0)
]

for key in data_dict:
    data_dict[key] = np.delete(data_dict[key], remove_steps, axis=0)

# Read the remaining number of timesteps from the data instead of counting down
TIME_STEPS = data_dict[output_var].shape[0]

try:
    temp_sfc = np.float32(np.delete(temp_sfc, remove_steps, axis=0))
except NameError:
    # temp_sfc was never created; any other error must not be swallowed
    print('There is probably no temperature in order_of_vars_qubicc')

In [9]:
# The neural network was trained with clc in [0, 100], so the output is multiplied by 100.
data_dict[output_var] = np.multiply(data_dict[output_var], 100)
# Maximum over the layers from index 4 onwards, as a quick check of the value range
data_dict[output_var][:, 4:, :].max()

100.000015

In [10]:
# Keep track of the vertical layer (1 .. VERT_LAYERS) of every grid cell.
# int16 is sufficient as there are fewer than 1000 layers.
layer_numbers = np.arange(1, VERT_LAYERS + 1).astype(np.int16)
vert_layers = np.repeat(layer_numbers[np.newaxis, :], TIME_STEPS, axis=0)
vert_layers = np.repeat(vert_layers[:, :, np.newaxis], HORIZ_FIELDS, axis=2)
vert_layers.shape

(2159, 31, 78069)

In [11]:
### Subsample QUBICC data further

# We reduce the data size by keeping only every third time step of the QUBICC data
# (i.e. every three hours, assuming hourly output).
# The reason is that training is almost impossible with a total data size of 3.6 Billion samples (from NARVAL we have 126 Mio samples).
# To make it feasible we would need a training batch size of ~5000.
# Therefore we need to decrease the amount of samples further.
# We decrease the amount of QUBICC samples as they are less reliable than the NARVAL samples.
# Dropping two out of three time steps is acceptable as we assume a relatively high temporal correlation.

for key in order_of_vars_qubicc:
    data_dict[key] = data_dict[key][0::3]
vert_layers = vert_layers[0::3]
try:
    temp_sfc = temp_sfc[0::3]
except NameError:
    # temp_sfc was never created; any other error must not be swallowed
    print('There is probably no (surface) temperature.')

# Adapt time steps (roughly divided by 3)
TIME_STEPS = data_dict[output_var].shape[0]

In [12]:
# Add variables below and above

def add_above_and_below(var_array):
    '''
        Build, for every grid cell, the value of the cell above and of the cell below.

        var_array: 3D tensor with axes (time, vertical layer, horizontal field);
                   vertical index 0 is the top layer, the last index the layer closest to the surface.

        Returns the tuple (above, below), two new arrays with the shape and dtype of var_array:
          above[:, k, :] = var_array[:, k-1, :], with the sentinel 1000 in the top layer (k = 0)
          below[:, k, :] = var_array[:, k+1, :], with the cell's own value in the lowest layer

        The shape is taken from var_array itself, so the function does not depend on
        the global TIME_STEPS / HORIZ_FIELDS being in sync with the data.
    '''
    var_array = np.asarray(var_array)

    above = np.empty_like(var_array)
    # 1000 marks "there is no cell above" at the model top
    above[:, 0, :] = 1000
    above[:, 1:, :] = var_array[:, :-1, :]

    # Replace by the entry from the same cell if the one above is nan.
    # It is a bit suboptimal that the ones above can be nan.
    # But in QUBICC this only pertains the cli, clw and hus which are zero anyways at 21kms.
    # Reading from var_array also works for the lowest layer, where shifting the index by one would be out of bounds.
    nan_mask = np.isnan(above)
    above[nan_mask] = var_array[nan_mask]

    # Below is the same value as the grid cell for surface-closest layer
    below = np.concatenate((var_array[:, 1:, :], var_array[:, -1:, :]), axis=1)
    return above, below

# Per-variable fields holding the value of the neighbouring cell above / below.
# At the model top the 'above' field holds 1000, a value that cannot be attained physically
# and that serves as our way of checking whether the grid cell is at the model top.
# At the lowest layer the 'below' field repeats the value of the cell itself.
# (Note that these values won't stay the same after normalization.)
# The NN could get around these values that are not really physical by weighing the influence from below with a zg-factor.
# Alternatively we would have to remove the variable from below altogether.
above, below = {}, {}

# The last two entries of order_of_vars_qubicc are skipped
for var_name in order_of_vars_qubicc[:-2]:
    from_above, from_below = add_above_and_below(data_dict[var_name])
    above[var_name] = from_above
    below[var_name] = from_below

In [13]:
# Reshaping into 1D-arrays and converting dict into a DataFrame-object (the following is based on Aurelien Geron)
for key in data_dict:
    data_dict[key] = np.reshape(data_dict[key], -1)

# Flatten the layer information once, in the same order as the variables
vert_layers = np.reshape(vert_layers, -1)

# Add the neighbouring-cell fields as additional columns
for key in order_of_vars_qubicc[:-2]:
    data_dict['%s_below'%key] = np.reshape(below[key], -1)
    data_dict['%s_above'%key] = np.reshape(above[key], -1)

try:
    data_dict['temp_sfc'] = np.reshape(temp_sfc, -1)
except NameError:
    # temp_sfc was never created; any other error must not be swallowed
    print('There is probably no (surface) temperature.')

# One row per grid cell and time step, one column per variable
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,hus,clw,cli,ta,pfull,ua,va,zg,coriolis,cl,...,ta_above,pfull_below,pfull_above,ua_below,ua_above,va_below,va_above,zg_below,zg_above,temp_sfc
0,3e-06,,,214.038208,1370.786011,27.121922,19.595692,28193.783203,0.000116,,...,1000.0,1885.024536,1000.0,23.64312,1000.0,11.679565,1000.0,26201.697266,1000.0,268.771423
1,3e-06,,,214.383835,1370.754395,27.276657,20.395845,28193.783203,0.000116,,...,1000.0,1884.386597,1000.0,23.998238,1000.0,12.613994,1000.0,26201.697266,1000.0,269.241333
2,3e-06,,,213.572433,1365.707642,26.677664,19.175291,28193.783203,0.000116,,...,1000.0,1878.876709,1000.0,22.96813,1000.0,10.617148,1000.0,26201.697266,1000.0,268.296692
3,3e-06,,,214.446808,1377.688721,26.759027,19.512516,28193.783203,0.000115,,...,1000.0,1892.658691,1000.0,24.536003,1000.0,12.112512,1000.0,26201.697266,1000.0,267.593109
4,3e-06,,,213.310684,1359.823975,27.173082,19.345314,28193.783203,0.000118,,...,1000.0,1871.216309,1000.0,22.446886,1000.0,11.548752,1000.0,26201.697266,1000.0,266.535187


In [14]:
# The dictionary is not needed anymore now that df has been built from it:
# drop the reference and run the garbage collector explicitly.
import gc
del data_dict
gc.collect()

22

**Downsampling the data (majority class: clc = 0)**

In [15]:
# Keep only the samples whose height zg lies below 21000 (i.e. drop the data above 21 km)
df = df[df['zg'] < 21000]

In [16]:
# After cutting off the upper levels no NaN may be left in any column
assert not np.isnan(df).values.any()

In [17]:
# A few quick plausibility checks of the input columns
if output_var == 'cl':
    # Temperature and pressure must both exceed 150 everywhere
    assert (df['ta'] > 150).all() and (df['pfull'] > 150).all()

    # The upper levels have been cut off, so the model-top marker 1000 must not occur anymore
    assert not (df['ta_above'] == 1000).any()

In [18]:
# Remove condensate-free clouds (7.3% of clouds): samples with positive cloud cover
# although both cloud water and cloud ice are zero.
# The cloud cover column is addressed via output_var so that the cell also runs for output_var = 'cl_area'.
df = df.loc[~((df[output_var] > 0) & (df['clw'] == 0) & (df['cli'] == 0))]

In [19]:
# Balance the two classes: every sample with clc != 0 is kept and at most equally many
# samples with clc = 0 are drawn at random. The original order of the samples is kept intact.
# The cloud cover column is addressed via output_var so that the cell also runs for output_var = 'cl_area'.
no_cloud_mask = (df[output_var] == 0).values
noclc_index = df.index[no_cloud_mask]
cloudy_index = df.index[~no_cloud_mask]
# Number of clc = 0 samples before downsampling
print(len(noclc_index))

# Number of clc = 0 samples that remain. Computed with integers: a float ratio could
# lose a sample through rounding and would fail if there were no clc = 0 samples at all.
size_noclc = min(len(noclc_index), len(cloudy_index))
shuffled_indices = np.random.permutation(noclc_index)
downsample_indices = shuffled_indices[:size_noclc]

# Concatenate the indices of the cloudy samples and downsample_indices.
# Sort them so that we can more or less recover the timesteps.
final_indices = np.sort(np.concatenate((downsample_indices, cloudy_index)))

# Label-based (loc) not positional-based
df = df.loc[final_indices]

920713771


In [20]:
# How many samples are left once the classes are balanced
df.shape[0]

1176638142

In [21]:
# Note: works in place, the passed DataFrame loses its output column
def split_input_output(dataset):
    '''
        Remove the column named by the global output_var from dataset and return it.

        dataset: DataFrame that contains the output column; it is modified in place.
    '''
    return dataset.pop(output_var)

In [22]:
# Separate the output column from df; afterwards df holds only the input columns
output_df = split_input_output(df)

In [27]:
# Write the results as float32 .npy files
if output_var == 'cl':
    np.save(output_path + '/cloud_cover_input_qubicc.npy', np.float32(df))
    np.save(output_path + '/cloud_cover_output_qubicc.npy', np.float32(output_df))
    # Vertical layer of every remaining sample (int16 is sufficient for layers < 1000);
    # the labels in df.index are used as positions in the flattened vert_layers
    np.save(output_path + '/samples_vertical_layers_qubicc.npy', vert_layers[df.index])
elif output_var == 'cl_area':
    # The input is not written in this case, only the output
    np.save(output_path + '/cloud_area_output_qubicc.npy', np.float32(output_df))

Some tests of the cloud area output

In [None]:
if output_var == 'cl_area':
    # Test: compare column 2 of the previously saved input with the current 'cli' column
    old_input = np.load(output_path + '/cloud_cover_input_qubicc.npy')
    matches = old_input[:, 2] == df['cli']
    # If this yields True then we're done
    print(np.all(matches))

In [None]:
# Load both saved outputs and take their difference (cloud area minus cloud cover)
clc = np.load(output_path + '/cloud_cover_output_qubicc.npy')
cl_area = np.load(output_path + '/cloud_area_output_qubicc.npy')

diff = np.subtract(cl_area, clc)

In [None]:
# Histogram of the differences (100 bins, linear count axis)
plt.hist(diff, bins = 100)
plt.show()

In [None]:
# Same histogram with a logarithmic count axis, so that rare differences become visible
plt.hist(diff, bins = 100, log=True)
plt.show()

In [None]:
# Number of samples with a negative difference.
# These should be anomalies existing due to differences in coarse-graining
np.count_nonzero(diff < 0)

In [None]:
# Number of samples with a positive difference
np.count_nonzero(diff > 0)

In [None]:
# Number of samples with a non-negative difference
np.count_nonzero(diff >= 0)

In [None]:
# Share of negative differences among all samples
np.count_nonzero(diff < 0) / len(diff)  # 1.17% of the data

In [None]:
# Share of negative differences among the samples with a non-zero difference
np.count_nonzero(diff < 0) / np.count_nonzero(diff != 0)  # 2.36% of cloudy data