## NN parameterization of cloud cover 

In [2]:
import sys
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
# import sherpa
import pandas as pd
import os
import importlib
import for_preprocessing

# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
importlib.reload(for_preprocessing)
# importlib.reload(my_classes)
from for_preprocessing import load_day

# Add path with my_classes to sys.path
sys.path.insert(0, '/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/')

from my_classes import write_infofile

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Directory with one sub-folder per variable (e.g. 'temp'); the input files are read from here
path = "/pf/b/b309170/my_work/NARVAL/data_var_vertinterp/"
# Directory into which the preprocessed .npy arrays are saved
output_path = "/pf/b/b309170/my_work/icon-ml_data/cloud_cover_parameterization/region_based/based_on_var_interpolated_data"
# Directory that receives the scaler statistics and the model info file
model_path = "/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/region_based/saved_models"

# Run identifier: used as suffix of all written files and as the random seed
NUM = 1
# NOTE(review): not referenced in the visible part of this notebook -- confirm whether it is still needed
VERT_LAYERS = 31
# Number of vertical layers for which a dataframe (and later a NN) is created
no_NNs = 27

# Seed the global NumPy random state so that the random splits below are repeatable
np.random.seed(NUM)
%config Completer.use_jedi = False # Faster autocompletion?

## 1) Reading the data

We train one NN for every vertical layer i
### Input:
- fr_lake: Fraction of open water in a grid box, for seas and lakes
- zg[i-2], zg[i-1], zg[i], zg[i+1], zg[i+2]: Geometric height at full levels (3D)
- qv[i-2], qv[i-1], qv[i], qv[i+1], qv[i+2]: Specific water vapor content (3D)
- qc[i-2], qc[i-1], qc[i], qc[i+1], qc[i+2]: Specific cloud water content (3D)
- qi[i-2], qi[i-1], qi[i], qi[i+1], qi[i+2]: Specific cloud ice content (3D)
- temp[i-2], temp[i-1], temp[i], temp[i+1], temp[i+2]: Temperature (3D)
- pres[i-2], pres[i-1], pres[i], pres[i+1], pres[i+2]: Pressure (3D)
- rho[i-2], rho[i-1], rho[i], rho[i+1], rho[i+2]: Air density (3D)

Additionally, we'll use clc[i] from the previous time step. <br>
$37$ ($= 5\cdot 7+1+1$) input nodes

### Output:
- clc[i]: Cloud Cover

$1$ output node

=> Train 27 NNs (no need to train uppermost layers) with 37 input and 1 output node each

In [2]:
# clc from the previous time step is part of the input, so the data has to be loaded differently:
# We load the data day by day, not utilizing consecutive days
# (since the simulations were only run for around a day and in general we don't have consecutive days!)

# 1) Load a day
# 2) Create one dataset for each layer (below 21km)
# 3) Put data into corresponding dataset
# 4) Do 1)-3) for all days
# 5) Split dataset into train/valid/test
# 6) Combine test-datasets => save
#            training-datasets => save
#            validation-datasets => save

# Collect every distinct day that occurs in the file names
ls = os.listdir(os.path.join(path, 'temp')) # Temperature serves as an arbitrary variable
# The day is the sixth '_'-separated field of a file name
days = {file_name.split(sep='_')[5] for file_name in ls}

In [3]:
# Store all days in a list of dataframes, one per vertical layer
# (each row is a training sample for the NN).
#
# The days are visited in sorted order: set.pop() returns elements in arbitrary order, which
# made the row order -- and therefore the seeded random splits -- differ between runs.
# `days` is no longer emptied, so this cell can be re-run.
frames_per_layer = [[] for _ in range(no_NNs)]
for day in sorted(days):
    day_frames = load_day(day, no_NNs, path)
    for i in range(no_NNs):
        frames_per_layer[i].append(day_frames[i])

# Concatenate once per layer. DataFrame.append copied the whole accumulated frame for every
# additional day and does not exist any more in pandas >= 2.0.
dfs = []
for i in range(no_NNs):
    dfs.append(pd.concat(frames_per_layer[i], ignore_index=True))
    frames_per_layer[i] = None # Release the per-day pieces of this layer

In [4]:
# Sanity checks on the loaded data
# 1) There is exactly one dataframe per vertical layer
print('For every vertical layer up to no_NNs many we have one dataframe, i.e. %d in total'%len(dfs))
assert len(dfs) == no_NNs
# 2) Expected number of columns and rows
n_rows, n_columns = dfs[0].shape
print('There are %d input + output variables'%n_columns)
assert n_columns == 38
# NOTE(review): the message mentions 1131 horizontal entries while the formula uses 1024
# (presumably the count after removing nans) -- confirm
print('Removing the first timestep from 67 days, with 1699 timesteps in total and 1131 horizontal entries:')
print('(1699-67)*1024 = %d'%n_rows)
assert n_rows == (1699-67)*1024
# 3) All dataframes have the same shape (comparing each one to the first is sufficient)
first_shape = dfs[0].shape
assert all(df.shape == first_shape for df in dfs)
print('The shapes of all dataframes are equal.')
# 4) Checking some arbitrary data sample
# dfs[16].values[664103] matches
# day: 2016072800, vertical layer: 20, horizontal index (after removing nans): 206

For every vertical layer up to no_NNs many we have one dataframe, i.e. 27 in total
There are 38 input + output variables
Removing the first timestep from 67 days, with 1699 timesteps in total and 1131 horizontal entries:
(1699-67)*1024 = 1671168
The shapes of all dataframes are equal.


In [5]:
# Column names, non-null counts and dtypes of the first layer's dataframe
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1671168 entries, 0 to 1671167
Data columns (total 38 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   qv_i-2    1671168 non-null  float64
 1   qv_i-1    1671168 non-null  float64
 2   qv_i      1671168 non-null  float64
 3   qv_i+1    1671168 non-null  float64
 4   qv_i+2    1671168 non-null  float64
 5   qc_i-2    1671168 non-null  float64
 6   qc_i-1    1671168 non-null  float64
 7   qc_i      1671168 non-null  float64
 8   qc_i+1    1671168 non-null  float64
 9   qc_i+2    1671168 non-null  float64
 10  qi_i-2    1671168 non-null  float64
 11  qi_i-1    1671168 non-null  float64
 12  qi_i      1671168 non-null  float64
 13  qi_i+1    1671168 non-null  float64
 14  qi_i+2    1671168 non-null  float64
 15  temp_i-2  1671168 non-null  float64
 16  temp_i-1  1671168 non-null  float64
 17  temp_i    1671168 non-null  float64
 18  temp_i+1  1671168 non-null  float64
 19  temp_i+2  1671168 non

In [6]:
# Format floats with '%g' in all dataframe displays from here on
pd.set_option('display.float_format', lambda x: '%g' % x)
# Summary statistics of every column of the dataframe at index 20
dfs[20].describe()

Unnamed: 0,qv_i-2,qv_i-1,qv_i,qv_i+1,qv_i+2,qc_i-2,qc_i-1,qc_i,qc_i+1,qc_i+2,...,rho_i+1,rho_i+2,zg_i-2,zg_i-1,zg_i,zg_i+1,zg_i+2,fr_lake,clc_prev,clc
count,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,...,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0,1671170.0
mean,0.00737286,0.00888582,0.0105352,0.0120102,0.0132247,1.62484e-05,2.54916e-05,3.59945e-05,3.26906e-05,1.48808e-05,...,1.06726,1.09814,2217.44,1764.81,1365.57,1019.29,725.788,0.00250321,10.8091,10.7657
std,0.00278859,0.00296276,0.00297614,0.00287546,0.00289272,3.80472e-05,4.96917e-05,6.64943e-05,6.32496e-05,4.01862e-05,...,0.0207723,0.0228537,93.7835,107.026,119.739,131.405,141.547,0.0110212,12.5613,12.5311
min,0.000104811,0.00013219,0.000259667,0.000370459,0.000555851,0.0,0.0,0.0,0.0,0.0,...,0.955472,0.978894,2168.33,1710.0,1305.51,954.582,657.176,0.0,0.0,0.0
25%,0.00517002,0.00688104,0.00927526,0.0110916,0.0123455,0.0,0.0,1.47803e-26,1.4492e-07,1.34621e-09,...,1.06185,1.09398,2168.38,1710.04,1305.55,954.611,657.196,0.0,0.0942267,0.101541
50%,0.0077521,0.00958351,0.0113457,0.0127074,0.0138855,1.60923e-12,3.77945e-06,1.20674e-05,9.50654e-06,9.84288e-07,...,1.07252,1.10405,2170.5,1711.94,1307.12,955.819,658.041,0.0,7.0439,7.00473
75%,0.00978913,0.011327,0.0127262,0.013973,0.0151277,1.38625e-05,3.10972e-05,4.23167e-05,3.49793e-05,1.05947e-05,...,1.08016,1.11236,2210.53,1752.73,1348.08,997.056,698.658,1.59524e-06,17.0376,16.9344
max,0.0135576,0.0150132,0.0165432,0.0178242,0.0185974,0.0010945,0.00116312,0.00118059,0.000990079,0.000873018,...,1.11864,1.14857,2805.78,2445.79,2136.51,1873.51,1652.89,0.151835,98.9596,98.9596


**Splitting the data into a learning and a test set**

In [7]:
#Splitting the data into a learning and a test set

#Should we use StratifiedShuffleSplit instead to make sure that the test set is representative of the whole dataset?
#E.g. define categories of specific water vapor and make sure those categories are present in the test set as well
#-> Geron, p.69

def split_train_test(df, test_ratio):
    """Randomly partition the rows of `df` into a training and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are split.
    test_ratio : float
        Fraction of the rows that goes into the test part; the resulting
        row count is truncated to an integer.

    Returns
    -------
    tuple
        (training part, test part), both selected by position with iloc.

    The permutation is drawn from the global NumPy random state.
    """
    n_rows = len(df)
    order = np.random.permutation(n_rows)
    # The first `cut` shuffled positions form the test part, the rest the training part
    cut = int(n_rows*test_ratio)
    return df.iloc[order[cut:]], df.iloc[order[:cut]]
    
# Hold out 20% of every layer's samples as test set; the remainder is the learning set
layer_splits = [split_train_test(dfs[layer], 0.2) for layer in range(no_NNs)]
learning_sets = [learning_part for learning_part, _ in layer_splits]
test_sets = [test_part for _, test_part in layer_splits]
print(len(learning_sets[0]), 'training samples, ', len(test_sets[0]), 'test samples')

1336935 training samples,  334233 test samples


In [8]:
# One scaler object; the following cell fits it anew on the training inputs of every layer
scaler = StandardScaler()

In [None]:
# Split every learning set into a training and a validation set, separate the target
# column 'clc' from the inputs and standardize the inputs of all three sets with the
# mean/variance of the training inputs of the respective layer.
train_sets = []
valid_sets = []
input_train_sets = []
input_valid_sets = []
input_test_sets = []
output_valid_sets = []
output_train_sets = []
output_test_sets = []

# The scaler statistics are written with a single open() in write mode: appending ('a')
# in every iteration duplicated the file content whenever this cell was run again
# with the same NUM.
with open(model_path + '/scaler_%d.txt'%NUM, 'w') as file:
    for i in range(no_NNs):
        # split_train_test selects new frames via iloc, so 'clc' can be popped from them in place
        train_df, valid_df = split_train_test(learning_sets[i], 0.1)
        output_valid_sets.append(valid_df.pop('clc'))
        output_train_sets.append(train_df.pop('clc'))
        train_sets.append(train_df)
        valid_sets.append(valid_df)
        # test_sets stems from an earlier cell and is therefore not modified here;
        # deleting its 'clc' column in place made a second run of this cell fail with a KeyError
        output_test_sets.append(test_sets[i]['clc'])
        test_inputs = test_sets[i].drop(columns='clc')
        # Fit on the training inputs only, then apply the same transformation to all three sets
        scaler.fit(train_df)
        input_train_sets.append(scaler.transform(train_df))
        input_valid_sets.append(scaler.transform(valid_df))
        input_test_sets.append(scaler.transform(test_inputs))
        file.write('The mean values of the %d-th Standard Scaler: \n %s'%(i, str(scaler.mean_)))
        file.write('\nThe standard deviation values of the %d-th Standard Scaler: \n %s \n'
                   %(i,str(np.sqrt(scaler.var_))))

In [10]:
# We reduce the number of saved files by stacking the per-layer arrays along the sample axis
(input_train, input_valid, input_test,
 output_train, output_valid, output_test) = [
    np.concatenate([per_layer[layer] for layer in range(no_NNs)])
    for per_layer in (input_train_sets, input_valid_sets, input_test_sets,
                      output_train_sets, output_valid_sets, output_test_sets)
]

In [11]:
# Save the data: one .npy file per array, the file name carries the run identifier NUM
arrays_to_save = [
    ('/cloud_cover_input_train_%d.npy', input_train),
    ('/cloud_cover_input_valid_%d.npy', input_valid),
    ('/cloud_cover_output_train_%d.npy', output_train),
    ('/cloud_cover_output_valid_%d.npy', output_valid),
    ('/cloud_cover_input_test_%d.npy', input_test),
    ('/cloud_cover_output_test_%d.npy', output_test),
]
for name_template, array in arrays_to_save:
    np.save(output_path + name_template%NUM, array)

In [12]:
# Write the accompanying info-file
info_file_name = model_path + '/model_region_based_final_%d.txt'%NUM
with open(info_file_name, 'w') as file:
    all_columns = learning_sets[0].columns
    # All columns except the last one are passed separately
    # (the last column is 'clc' according to the describe() output above)
    leading_columns = all_columns[:-1]
    write_infofile(file, str(all_columns), str(leading_columns),
                   model_path, output_path, NUM)