## Preprocessing

<!-- Was used to generate: <br>
*preprocessed_data/cloud_cover_all_days_input_train_1.npy <br>
preprocessed_data/cloud_cover_all_days_input_valid_1.npy <br>
preprocessed_data/cloud_cover_all_days_output_train_1.npy <br>
preprocessed_data/cloud_cover_all_days_output_valid_1.npy* -->

In [1]:
import sys
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import importlib

# from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.callbacks import EarlyStopping

# Root directory on the cluster; every other location below is derived from it
base_path = '/pf/b/b309170'
# Location of the vertically interpolated NARVAL data
path = f'{base_path}/my_work/NARVAL/data_var_vertinterp/'
# Destination of the preprocessed numpy arrays
output_path = f'{base_path}/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_v3/based_on_var_interpolated_data'
# Destination of the scaler parameters and the model info-file
model_path = f'{base_path}/workspace_icon-ml/cloud_cover_parameterization/grid_cell_based_v3/saved_models'

# my_classes lives in the workspace, so its directory has to be on sys.path before importing from it
sys.path.insert(0, f'{base_path}/workspace_icon-ml/cloud_cover_parameterization/')

from my_classes import load_data, write_infofile

# NUM tags all files written by this notebook and also seeds numpy's global random generator
NUM = 1
np.random.seed(NUM)

## Reading the data
### Input:
- fr_land: Fraction of land
- zg: Geometric height at full levels
- qv: Specific water vapor content
- qi: Specific cloud ice content
- temp: Temperature
- pres: Pressure

### Output:
- clc: Cloud Cover

When adding 2D variables, be careful with the NARVAL file-naming convention regarding timestamps.

In [2]:
# Loads the NARVAL data into the data_dict dictionary
# load_data is a project helper from my_classes; the keys of the returned dict
# appear in the order given by order_of_vars (see the keys() output further down)
order_of_vars = ['qv', 'qi', 'temp', 'pres', 'zg', 'fr_land', 'clc']
data_dict = load_data(source='narval', days='all', vert_interp=True, order_of_vars=order_of_vars)

In [3]:
# Bring zg and fr_land to the same shape as the other fields (timesteps x vert x hor)
n_time, n_vert = data_dict['qv'].shape[:2]

# zg: prepend a time axis and copy the field once per timestep
zg_with_time = np.expand_dims(data_dict['zg'], 0)
data_dict['zg'] = np.repeat(zg_with_time, n_time, axis=0)

# fr_land: prepend a time axis, then insert a vertical axis, copying along each new axis
fr_land_with_time = np.repeat(np.expand_dims(data_dict['fr_land'], 0), n_time, axis=0)
data_dict['fr_land'] = np.repeat(np.expand_dims(fr_land_with_time, 1), n_vert, axis=1)

# All three fields must now agree in shape
assert data_dict['fr_land'].shape == data_dict['qv'].shape
assert data_dict['qv'].shape == data_dict['zg'].shape

In [4]:
# Display the names of the variables held in data_dict
data_dict.keys()

odict_keys(['qv', 'qi', 'temp', 'pres', 'zg', 'fr_land', 'clc'])

In [5]:
# Flatten every variable to a 1D-array and turn the dict into a DataFrame with one column
# per variable (the following is based on Aurelien Geron)
flattened = {key: np.reshape(data_dict[key], -1)
             for key in ('qv', 'qi', 'temp', 'pres', 'zg', 'fr_land', 'clc')}
data_dict.update(flattened)

df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,qv,qi,temp,pres,zg,fr_land,clc
0,2e-06,0.0,226.366128,1530.231208,28193.783559,1.0,0.0
1,2e-06,0.0,227.968134,1528.399878,28193.783559,1.0,0.0
2,2e-06,0.0,226.825919,1528.486878,28193.783559,1.0,0.0
3,2e-06,0.0,228.243447,1521.956216,28193.783559,1.0,0.0
4,2e-06,0.0,228.072678,1525.309351,28193.783559,1.0,0.0


**Downsampling the data (majority class: clc = 0)**

In [6]:
# Highest geometric height (zg) at which any cloud cover occurs.
# Select the column first and reduce only that Series: np.max on a whole DataFrame
# forwards axis=None to DataFrame.max, which yields a scalar in pandas >= 2.0 and
# would make a subsequent ['zg'] lookup fail.
df.loc[df['clc'] > 0, 'zg'].max()

20784.62706137544

In [7]:
# Discard all samples at or above 21000 m. The threshold lies above the highest level with clouds.
df = df.loc[df['zg'] < 21000] # There are days with clc > 0 above 20500 meters (the maximum found in the previous cell is ~20785 m)

In [9]:
# All cloud-free samples (clc exactly 0); their number is displayed as the cell output
df_noclc = df.loc[df['clc']==0]
len(df_noclc)

26814085

In [10]:
# We ensure that clc != 0 and clc = 0 have the same size by randomly keeping only
# as many cloud-free samples as there are cloudy ones.
# The cloudy samples are selected explicitly (instead of using len(df) - len(df_noclc)),
# so the count stays correct even if df has already been overwritten by a previous run of this cell.
df_cloudy = df.loc[df['clc']!=0]
num_cloudy = len(df_cloudy)
downsample_ratio = num_cloudy/len(df_noclc)
print(downsample_ratio)
shuffled_indices = np.random.permutation(len(df_noclc))
# Use the integer count directly: int(len(df_noclc)*downsample_ratio) goes through a float
# and can end up one sample short due to rounding
set_size = num_cloudy
downsample_indices = shuffled_indices[:set_size]
df = pd.concat([df_noclc.iloc[downsample_indices], df_cloudy])

0.68584831442132


**Splitting the data into a learning and a test set**

In [11]:
#Splitting the data into a learning and a test set

#Should we use StratifiedShuffleSplit instead to make sure that the test set is representative of the whole dataset?
#E.g. define categories of specific water vapor and make sure those categories are present in the test set as well
#-> Geron, p.69

def split_train_test(df, test_ratio):
    """Randomly partition the rows of df into a training and a test part.

    The rows are shuffled with numpy's global random generator. The first
    int(len(df)*test_ratio) shuffled rows form the test part, all remaining
    rows form the training part.

    Parameters:
        df: DataFrame whose rows are to be split.
        test_ratio: Fraction of the rows that goes into the test part.

    Returns:
        A tuple (train, test) of DataFrames.
    """
    num_rows = len(df)
    num_test = int(num_rows*test_ratio)
    row_order = np.random.permutation(num_rows)
    test_rows, train_rows = row_order[:num_test], row_order[num_test:]
    return df.iloc[train_rows], df.iloc[test_rows]
    
# Hold out 20% of the samples as the test set; the rest forms the learning set.
# Note: the number printed as 'training samples' is the size of the whole learning set,
# which is split further into a training and a validation set below.
learning_set, test_set = split_train_test(df, 0.2)
print(len(learning_set), 'training samples, ', len(test_set), 'test samples')

29424632 training samples,  7356158 test samples


In [12]:
# Scaler for standardizing the input features; it is fitted on the training set in the next cell
scaler = StandardScaler()

In [13]:
# Split the learning set into a training set (90%) and a validation set (10%),
# separate the target column clc from the inputs and rescale the inputs

train_set, valid_set = split_train_test(learning_set, 0.1)

# pop returns the clc column and removes it from the frame in one step
if 'clc' in valid_set.columns:
    output_valid = valid_set.pop('clc')
if 'clc' in train_set.columns:
    output_train = train_set.pop('clc')

# The scaler is fitted on the training inputs only and then applied to both sets
input_train = scaler.fit(train_set).transform(train_set)
input_valid = scaler.transform(valid_set)

In [14]:
# Separate the target column from the test set and scale the test inputs as well.
# test_set is modified in place, so the guard skips the extraction if clc has already been removed.
if 'clc' in test_set.columns:
    output_test = test_set.pop('clc')
input_test = scaler.transform(test_set)

In [15]:
# Save the inputs and outputs of all three data sets as .npy files, tagged with NUM
run_tag = '%d'%NUM
for split_name, array in [('input_train', input_train), ('input_valid', input_valid),
                          ('output_train', output_train), ('output_valid', output_valid),
                          ('input_test', input_test), ('output_test', output_test)]:
    np.save(output_path + '/cloud_cover_all_days_' + split_name + '_' + run_tag + '.npy', array)

# Write the mean values and standard deviations of the fitted scaler to a text file
scaler_report = ('Standard Scaler mean values:\n' + str(scaler.mean_)
                 + '\nStandard Scaler standard deviation:\n' + str(np.sqrt(scaler.var_)))
with open(model_path+'/scaler_%d.txt'%NUM, 'w') as scaler_file:
    scaler_file.write(scaler_report)

In [18]:
# Write the accompanying info-file
# The input variables are all columns of the learning set except the target 'clc'.
# The target is removed by name rather than by a fixed position, so the list stays
# correct if the order or number of variables changes.
input_variables = np.array(learning_set.columns.drop('clc'))
with open(model_path + '/model_grid_cell_based_v3_final_%d.txt'%NUM, 'w') as file:
    write_infofile(file, str(learning_set.columns), str(input_variables),
                   model_path, output_path, NUM)