## Preprocessing

<!-- Was used to generate: <br>
*preprocessed_data/cloud_cover_all_days_input_train.npy <br>
preprocessed_data/cloud_cover_all_days_input_valid.npy <br>
preprocessed_data/cloud_cover_all_days_output_train.npy <br>
preprocessed_data/cloud_cover_all_days_output_valid.npy* -->

In [None]:
import sys
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
# import importlib
# importlib.reload(my_classes)

# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping

# Root directory that the paths below are built from
base_path = '/pf/b/b309170'
# Directory of the vertically interpolated NARVAL data (not referenced by the cells shown in this notebook)
path = base_path + '/my_work/NARVAL/data_var_vertinterp/'
# Directory that receives the preprocessed .npy arrays
output_path = base_path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based/based_on_var_interpolated_data'
# Directory that receives the scaler statistics and the model info-file
model_path = "/pf/b/b309170/workspace_icon-ml/cloud_cover_parameterization/grid_column_based/saved_models"

# Add path with my_classes to sys.path
sys.path.insert(0, base_path + '/workspace_icon-ml/cloud_cover_parameterization/')

# These imports only resolve after the sys.path entry above has been added
from my_classes import write_infofile
from my_classes import load_data

# NUM is used both as the random seed and as the suffix of every file written by this notebook
NUM = 1
# Size of the vertical axis; fields whose second axis has this length are split into one column per layer
VERT_LAYERS = 31

# Seed NumPy's global random state so that the shuffled splits are reproducible
np.random.seed(NUM)

## 1) Reading the data
### Input:
- fr_lake: Fraction of open water in a grid box, for seas and lakes
- zg: Geometric height at full levels (3D)
- qv: Specific water vapor content (3D)
- qc: Specific cloud water content (3D)
- qi: Specific cloud ice content (3D)
- temp: Temperature (3D)
- pres: Pressure (3D)
- rho: Air density (3D)

$186$ $(= 1 + 24\,[zg] + 26\,[q_c] + 27 \cdot 5)$ input nodes

### Output:
- clc: Cloud Cover

$27$ output nodes

The data above 21km is capped.

In [2]:
# Read the requested NARVAL variables into the data_dict dictionary.
# fr_land is requested from load_data but removed again right below.
order_of_vars = [
    'qv', 'qc', 'qi', 'temp', 'pres', 'rho',
    'zg', 'fr_lake', 'fr_land',
    'clc',
]
data_dict = load_data(
    source='narval',
    days='all',
    vert_interp=True,
    order_of_vars=order_of_vars,
)

# fr_land is not used any further
del data_dict['fr_land']

In [3]:
# Reshaping into nd-arrays of equaling shapes
# zg and fr_lake lack the leading axis of the other fields, so they are repeated along a new
# first axis. The number of repetitions is taken from clc instead of being hard-coded (it used
# to be the literal 1635), so the cell keeps working when a different set of days is loaded.
# NOTE(review): the leading axis of clc is presumably time -- confirm against load_data.
n_leading = data_dict['clc'].shape[0]
data_dict['zg'] = np.repeat(np.expand_dims(data_dict['zg'], 0), n_leading, axis=0)
data_dict['fr_lake'] = np.repeat(np.expand_dims(data_dict['fr_lake'], 0), n_leading, axis=0)

In [4]:
# One sample should contain a column of information:
# every field with a vertical axis is split into one flat array per layer (layers 4 and up),
# every other field is flattened as a whole.
data_dict_reshaped = {}
for var_name, var_values in data_dict.items():
    if var_values.shape[1] != VERT_LAYERS:
        # No vertical axis of length VERT_LAYERS: keep the field as a single column
        data_dict_reshaped[var_name] = np.reshape(var_values, -1)
        continue
    for layer in range(4, VERT_LAYERS):
        layer_key = '%s_%d' % (var_name, layer)
        data_dict_reshaped[layer_key] = np.reshape(var_values[:, layer, :], -1)

# Remove constant fields
for constant_field in ('zg_4', 'zg_5', 'zg_6', 'qc_4'):
    del data_dict_reshaped[constant_field]

In [5]:
# Turn the dictionary of flat arrays into a DataFrame with one column per key
df = pd.DataFrame(data_dict_reshaped)
df.head()

Unnamed: 0,qv_4,qv_5,qv_6,qv_7,qv_8,qv_9,qv_10,qv_11,qv_12,qv_13,...,clc_21,clc_22,clc_23,clc_24,clc_25,clc_26,clc_27,clc_28,clc_29,clc_30
0,3e-06,3e-06,3e-06,3e-06,5e-06,1e-05,2.8e-05,8.6e-05,0.000101,0.000112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3e-06,3e-06,3e-06,3e-06,5e-06,1e-05,3.3e-05,0.00011,0.000138,0.000232,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3e-06,3e-06,3e-06,3e-06,4e-06,1.1e-05,3.7e-05,9.3e-05,0.000118,0.000195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3e-06,3e-06,3e-06,3e-06,5e-06,9e-06,3.3e-05,0.000131,0.000177,0.000299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3e-06,3e-06,3e-06,3e-06,6e-06,1.1e-05,1.6e-05,5.6e-05,8.4e-05,9e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Splitting the data into a learning and a test set**

In [6]:
# Splitting the data into a learning and a test set

# TODO: Should we use StratifiedShuffleSplit instead to make sure that the test set is
# representative of the whole dataset? E.g. define categories of specific water vapor and
# make sure those categories are present in the test set as well (-> Geron, p.69).

def split_train_test(df, test_ratio):
    """Randomly split the rows of ``df`` into a training part and a test part.

    The row order is shuffled with NumPy's global random state, so call
    ``np.random.seed`` beforehand to obtain a reproducible split.

    Parameters
    ----------
    df : pandas.DataFrame
        The samples to split, one sample per row.
    test_ratio : float
        Fraction of the rows that goes into the test part, between 0 and 1.
        The test part holds ``int(len(df) * test_ratio)`` rows (rounded down).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; together they contain every row of ``df`` exactly once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within [0, 1].
    """
    # A negative ratio would make the two slices below overlap and a ratio above 1 would
    # silently leave the training part empty, so reject both (NaN is rejected as well).
    if not 0 <= test_ratio <= 1:
        raise ValueError('test_ratio must be between 0 and 1, got %r' % (test_ratio,))
    shuffled_indices = np.random.permutation(len(df))
    test_set_size = int(len(df)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]
    
# Hold out 20% of all samples as the test set.
# Note: the first number printed below is the size of the learning set, which is split
# further into the actual training and validation sets in a later cell.
learning_set, test_set = split_train_test(df, 0.2)
print(len(learning_set), 'training samples, ', len(test_set), 'test samples')

1339392 training samples,  334848 test samples


In [7]:
# Scaler for the input features; it is fitted on the training inputs in a later cell and
# then reused unchanged for the validation and test inputs
scaler = StandardScaler()

In [8]:
# Split the learning set into a training set (90%) and a validation set (10%) and rescale
train_set, valid_set = split_train_test(learning_set, 0.1)

# Names of the cloud cover columns, i.e. the outputs of the network
clc_columns = ['clc_%d' % layer for layer in range(4, VERT_LAYERS)]

# Separate the outputs from the inputs of both sets
output_valid = valid_set[clc_columns]
valid_set = valid_set.drop(columns=clc_columns)
output_train = train_set[clc_columns]
train_set = train_set.drop(columns=clc_columns)

# The scaler is fitted on the training inputs only and then applied to both sets
scaler.fit(train_set)
input_train = scaler.transform(train_set)
input_valid = scaler.transform(valid_set)

In [9]:
# Separate the cloud cover outputs of the test set from its inputs and
# scale the inputs with the scaler that was fitted on the training set
test_clc_columns = ['clc_%d' % layer for layer in range(4, VERT_LAYERS)]
output_test = test_set[test_clc_columns]
test_set = test_set.drop(columns=test_clc_columns)
input_test = scaler.transform(test_set)

In [10]:
# Save the data: inputs and outputs of every split, one .npy file each
arrays_by_filename = [
    ('/cloud_cover_input_train_%d.npy', input_train),
    ('/cloud_cover_input_valid_%d.npy', input_valid),
    ('/cloud_cover_output_train_%d.npy', output_train),
    ('/cloud_cover_output_valid_%d.npy', output_valid),
    ('/cloud_cover_input_test_%d.npy', input_test),
    ('/cloud_cover_output_test_%d.npy', output_test),
]
for filename_template, array in arrays_by_filename:
    np.save(output_path + filename_template % NUM, array)

# Store the statistics of the fitted scaler next to the models as plain text
scaler_summary = (
    'Standard Scaler mean values:\n'
    + str(scaler.mean_)
    + '\nStandard Scaler standard deviation:\n'
    + str(np.sqrt(scaler.var_))
)
with open(model_path+'/scaler_%d.txt'%NUM, 'w') as scaler_file:
    scaler_file.write(scaler_summary)

In [11]:
# Write the accompanying info-file
# The column names are converted to a numpy array before being turned into a string:
# str() of a pandas Index elides long indexes with '...' (more than display.max_seq_items
# entries), which would drop most of the variable names from the info-file.
all_columns = np.array(learning_set.columns)
# The trailing columns are the clc outputs, one per vertical layer from 4 to VERT_LAYERS - 1
n_output_columns = VERT_LAYERS - 4
with open(model_path + '/model_capped_grid_column_based_final_%d.txt'%NUM, 'w') as file:
    write_infofile(file, str(all_columns), str(all_columns[:-n_output_columns]),
                   model_path, output_path, NUM)