## Preprocessing Dyamond data

1) We read the data
2) Reshape variables so that they have equal dimensionality
3) Reshape into data samples fit for the NN and convert into a DataFrame
4) Downsample the data: Remove data above 21 km, remove condensate-free clouds, combat class imbalance
5) Split into input and output
6) Save as npy

Note: In this notebook, we neither scale the data nor split it into training/validation/test sets. <br>
The reasons are: i) Scaling requires the entire dataset, so it can only be done in conjunction with the dyamond dataset. Moreover, cross-validation requires different scalings based on different subsets of the data. ii) The split into subsets is done by the cross-validation procedure, or not at all when training the final model.

*To compute the derivatives, I had to run eight duplicates of this notebook, only computing the derivatives for one variable at a time (to not run OOM). And at the end I had to piece together the resulting npy-files.*

In [3]:
# Ran with 900GB
import gc
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
# import importlib
# importlib.reload(my_classes)

sys.path.insert(0, '..')
from functions import add_derivatives
from my_classes import load_data

# Directory that receives the npy files. np.save and np.load hand the path to open() unchanged,
# i.e. a leading '~' is NOT expanded by them, so the home directory is resolved once here.
output_path = os.path.expanduser('~/my_work/icon-ml_data/cloud_cover_parameterization/neighborhood_based_SR_DYAMOND')

# Number of vertical layers (re-assigned further below from the shape of the loaded data)
VERT_LAYERS = 31

## Parameters for the notebook

# Fix the state of numpy's global random number generator,
# so that the random permutation drawn later on is reproducible.
np.random.seed(seed=10)

## 1) Reading the data

The data above 21km is capped.

In [2]:
# Names of the variables that are requested from load_data, in this order
order_of_vars_dyamond = [
    'hus', 'clw', 'cli', 'ta', 'pa', 'ua', 'va',  # fields that enter the input features
    'zg', 'fr_land',                              # fields that are broadcast to the common shape later on
    'clc', 'cl_area',                             # cloud cover fields that are split off as outputs
]

In [3]:
# Read the horizontally and vertically coarse-grained dyamond data.
# The result is a dict that maps variable names to arrays.
data_dict = load_data(
    source='split_by_var_name',
    days='discard_spinup',
    resolution='R02B05',
    order_of_vars=order_of_vars_dyamond,
    path='~/bd1179_work/DYAMOND/hvcg_data',
)

clc
cl_area


In [4]:
# Overview: name and shape of every loaded variable
for key in data_dict:
    print(key, np.shape(data_dict[key]))

clc (467, 31, 79342)
cl_area (467, 31, 79342)


In [5]:
# Read the three dimension sizes off the cloud cover field
TIME_STEPS, VERT_LAYERS, HORIZ_FIELDS = data_dict['clc'].shape

In [6]:
# Reshaping into nd-arrays of equaling shapes (don't reshape in the vertical).
# Each field is only touched if it was actually loaded. A missing variable is therefore skipped
# explicitly instead of being hidden by a bare `except`, which would also have swallowed genuine
# errors such as a MemoryError or a KeyboardInterrupt.
if 'zg' in data_dict:
    # Prepend a time axis and repeat the field for every time step
    data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'], 0), TIME_STEPS, axis=0)

if 'fr_land' in data_dict:
    # Prepend a time axis first, then insert and fill a vertical axis
    data_dict['fr_land'] = np.repeat(np.expand_dims(data_dict['fr_land'], 0), TIME_STEPS, axis=0)
    data_dict['fr_land'] = np.repeat(np.expand_dims(data_dict['fr_land'], 1), VERT_LAYERS, axis=1)

In [7]:
# Store every variable in single precision (float32)
for key in data_dict:
    data_dict[key] = np.float32(data_dict[key])

In [8]:
# The neural network was trained with cloud cover in [0, 100], so both cloud cover fields are scaled by 100.
for cloud_var in ('clc', 'cl_area'):
    data_dict[cloud_var] = 100*data_dict[cloud_var]

# Maximum of both fields, ignoring the first four vertical layers
for cloud_var in ('clc', 'cl_area'):
    print(data_dict[cloud_var][:, 4:, :].max())

100.000015
100.0


In [None]:
# Keep track of the vertical layer (numbered 1, ..., VERT_LAYERS) of every grid cell.
# int16 is sufficient for < 1000.
layer_numbers = np.arange(1, VERT_LAYERS+1)
# Repeat the layer numbers along a new leading (time) axis ...
vert_layers = np.int16(np.repeat(layer_numbers[np.newaxis, :], TIME_STEPS, axis=0))
# ... and along a new trailing (horizontal) axis
vert_layers = np.repeat(vert_layers[:, :, np.newaxis], HORIZ_FIELDS, axis=2)
vert_layers.shape

In [None]:
# Replace the two horizontal wind components by the magnitude U of the horizontal wind
data_dict['U'] = np.sqrt(np.square(data_dict['ua']) + np.square(data_dict['va']))
for wind_component in ('ua', 'va'):
    del data_dict[wind_component]

In [None]:
# Add the relative humidity 'rh', computed from pressure (pa), specific humidity (hus) and temperature (ta)
T0 = 273.15
# Argument of the exponential in the denominator
exponent = (17.67*(data_dict['ta']-T0))/(data_dict['ta']-29.65)
r = 0.00263*data_dict['pa']*data_dict['hus']*np.exp(exponent)**(-1)
# Do not keep the large intermediate array alive
del exponent

data_dict['rh'] = r

In [None]:
# Add ps: the pressure at the last vertical index, repeated for every vertical layer
# (presumably the last index is the layer closest to the surface -- TODO confirm)
ps = np.repeat(data_dict['pa'][:, -1][:, np.newaxis], VERT_LAYERS, axis=1)
data_dict['ps'] = ps

In [9]:
# Remove data above 21 km: drop the first four vertical layers of every variable
for key in data_dict:
    data_dict[key] = data_dict[key][:, 4:, :]

# vert_layers = vert_layers[:, 4:, :]

In [None]:
# Add derivatives

# Required time per variable
# On JupyterHub, sequential loop: 129.2 hours
# Via SLURM job, sequential loop: 105 hours
# 128 processes, parallel loop: 11.6 hours --> Actually, it only seems to take 1.1 hours

from contextlib import contextmanager
import multiprocessing as mlp
import gc

@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager that provides a multiprocessing pool and terminates it on exit.

    All positional and keyword arguments are forwarded to `multiprocessing.Pool`.
    The pool is terminated when the `with` block is left, also if the block raises
    an exception, so that no worker processes are left behind after a failure.

    Yields:
        The created pool.
    """
    pool = mlp.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs on normal exit as well as on an exception inside the with-block
        pool.terminate()
    
def add_derivatives_par(data_dict):
    """Worker function for the process pool.

    Passes the given (partial) data_dict on to add_derivatives together with the
    fixed list of base variables ('zg' is listed last) and returns its result.
    """
    return add_derivatives(data_dict, ['hus', 'clw', 'cli', 'ta', 'pa', 'U', 'rh', 'zg'])

procs = 128

def _time_chunk(k):
    """Return a dict with the k-th of `procs` consecutive slices (along the first axis) of every variable."""
    start, stop = k*TIME_STEPS//procs, (k+1)*TIME_STEPS//procs
    return {key: data_dict[key][start:stop] for key in data_dict.keys()}

with poolcontext(processes=procs) as pool:
    # Every process receives a part of data_dict
    results = pool.map(add_derivatives_par, [_time_chunk(k) for k in range(procs)])

# Stitch the partial results back together, variable by variable.
# data_dict is emptied first so that the old arrays are not referenced by it any longer.
data_dict = {}
for key in results[0].keys():
    data_dict[key] = np.concatenate([partial_result[key] for partial_result in results])

del results
gc.collect()

In [10]:
# Flatten every variable into a 1D-array (one entry per sample) in preparation for the DataFrame
for key in data_dict:
    data_dict[key] = data_dict[key].reshape(-1)

# vert_layers = np.reshape(vert_layers, -1)

In [11]:
# Overview: name and shape of every flattened variable
for key in data_dict:
    print(key, np.shape(data_dict[key]))

clc (1000423278,)
cl_area (1000423278,)


In [12]:
# One column per variable, one row per sample
df = pd.DataFrame(data_dict)

# Number of samples/rows
df.shape[0]

1000423278

In [14]:
# del data_dict
# gc.collect()

**Downsampling the data (the over-represented class clc = 0 is subsampled)**

In [15]:
# There are no nans left.
# A single boolean mask is reduced directly (no additional `== False` copy of the mask),
# and the assertion carries a message in case it fails.
assert not df.isna().to_numpy().any(), 'df still contains NaN values'

In [18]:
# Balance the two classes: the cloud-free samples (clc = 0) are randomly subsampled so that at most as many
# of them remain as there are cloudy samples (clc != 0). The original order of the samples is kept intact.
# Only a boolean mask and index arrays are used, so the cloud-free rows of df are never copied.
noclc_mask = (df['clc'] == 0).to_numpy()
num_noclc = int(noclc_mask.sum())
num_cloudy = len(df) - num_noclc
print(num_noclc)

# Ratio of cloudy to cloud-free samples (kept for reference; guarded against the absence of cloud-free samples)
downsample_ratio = num_cloudy/num_noclc if num_noclc > 0 else 0.0

# Number of cloud-free samples that remain. Computed with exact integer arithmetic instead of
# int(num_noclc*downsample_ratio), which may be off by one due to floating point rounding.
size_noclc = min(num_cloudy, num_noclc)

# Randomly pick size_noclc of the cloud-free samples
shuffled_indices = np.random.permutation(df.index[noclc_mask].to_numpy())
downsample_indices = shuffled_indices[:size_noclc]

# Concatenate the indices of the remaining cloud-free samples and of all cloudy samples
final_indices = np.concatenate((downsample_indices, df.index[~noclc_mask].to_numpy()))

del shuffled_indices, downsample_indices, noclc_mask
gc.collect()

# Sort final_indices so that we can more or less recover the timesteps
final_indices = np.sort(final_indices)

# Label-based (loc) not positional-based
df = df.loc[final_indices]

In [19]:
# Number of samples after downsampling
len(df)

774757958

In [20]:
def split_input_output(dataset):
    """Split the two output columns off the given DataFrame.

    The columns 'clc' and 'cl_area' are removed from `dataset` itself (the frame
    passed in is modified in place) and returned.

    Returns:
        Tuple (output_clc, output_cl_area) of the two removed columns.
    """
    # pop returns the column and deletes it from the frame
    output_clc = dataset.pop('clc')
    output_cl_area = dataset.pop('cl_area')
    return output_clc, output_cl_area

In [21]:
# Afterwards df only holds the input features; both outputs are kept separately
(output_clc, output_cl_area) = split_input_output(dataset=df)

In [22]:
# Save the data (only the lines that are not commented out are written in this run)
# np.save(output_path + '/cloud_cover_input_dyamond.npy', df)
output_clc_file = output_path + '/cloud_cover_output_dyamond.npy'
np.save(output_clc_file, output_clc)
# np.save(output_path + '/cloud_area_output_dyamond.npy', output_cl_area)
# np.save(output_path + '/samples_vertical_layers_dyamond.npy', vert_layers[df.index])

In [None]:
# CHECK: the cloud area output stored on disk has to agree with the one in memory
test_output = np.load(output_path + '/cloud_area_output_dyamond.npy')

abs_deviation = np.abs(test_output - output_cl_area)
assert np.all(abs_deviation < 1e-10)

### Piece together the npy files containing the derivatives

In [None]:
# The ten features without derivatives ...
final_features = ['hus', 'clw', 'cli', 'ta', 'pa', 'zg', 'fr_land', 'U', 'rh', 'ps']
# ... followed by the first (_z) and second (_zz) derivative of seven of them
final_features += [var + suffix
                   for var in ('hus', 'clw', 'cli', 'ta', 'pa', 'U', 'rh')
                   for suffix in ('_z', '_zz')]
len(final_features)

In [None]:
# Load the input files written by the duplicates of this notebook (one file per variable)
(input_cli, input_ta, input_clw, input_rh, input_hus, input_U, input_pa) = [
    np.load(output_path + file_name) for file_name in (
        '/cloud_cover_input_dyamond_cli_only.npy',
        '/cloud_cover_input_dyamond_ta_only.npy',
        '/cloud_cover_input_dyamond_clw_only.npy',
        '/cloud_cover_input_dyamond_rh_only.npy',
        '/cloud_cover_input_dyamond_hus_only.npy',
        '/cloud_cover_input_dyamond_U_only.npy',
        '/cloud_cover_input_dyamond_pa_only.npy',
    )
]

In [None]:
# All files are compared against the cli file
other_inputs = (input_ta, input_clw, input_rh, input_hus, input_U, input_pa)

# The first ten variables should coincide
for other_input in other_inputs:
    assert np.all(input_cli[:, :10] == other_input[:, :10])

# These should be the derivatives (so the last two columns must not coincide)
for other_input in other_inputs:
    assert not np.all(input_cli[:, -2:] == other_input[:, -2:])

In [None]:
# Start from the complete hus file and append the last two columns (the derivatives) of the other files
derivative_columns = [partial_input[:, -2:] for partial_input in (input_clw, input_cli, input_ta, input_pa, input_U, input_rh)]
final_input = np.concatenate([input_hus] + derivative_columns, axis=1)
np.save(output_path + '/cloud_cover_input_dyamond.npy', final_input)