## Random Forest

**For Table 3 of the paper**

Neighborhood-based NARVAL R2B4 model

Training with `n_estimators = 1` takes 41min 59s.

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys

from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

import matplotlib.pyplot as plt
import time

# NOTE(review): NUM is not referenced by any other cell visible in this notebook -- confirm whether it is still needed.
NUM = 1

In [2]:
# Locations of the input data and of the saved models.
# NOTE(review): root_path is an absolute, user-specific directory; adjust it when running elsewhere.
root_path = '/home/b/b309170'
data_path = os.path.join(root_path,
                         'my_work/icon-ml_data/cloud_cover_parameterization/region_based/based_on_var_interpolated_data')
model_path = os.path.join(root_path,
                          'workspace_icon-ml/cloud_cover_parameterization/region_based/saved_models')
# The info file lives inside model_path, so derive it from there instead of repeating the directory.
info_file = os.path.join(model_path, 'model_region_based_final_1.txt')

n_layers = 27 # Number of layers; the data is split into one chunk per layer (originally: also the number of NNs)

In [3]:
# Read every split from data_path (the data is already normalized w.r.t. the training data)
def _load_array(filename):
    """Return the array stored in the .npy file `filename` inside data_path."""
    return np.load(os.path.join(data_path, filename))

# Training split
input_train = _load_array('cloud_cover_input_train_1.npy')
output_train = _load_array('cloud_cover_output_train_1.npy')
# Validation split
input_valid = _load_array('cloud_cover_input_valid_1.npy')
output_valid = _load_array('cloud_cover_output_valid_1.npy')
# Test split
input_test = _load_array('cloud_cover_input_test_1.npy')
output_test = _load_array('cloud_cover_output_test_1.npy')

In [4]:
## Set up the per-layer containers
# Sample counts of each split and the number of input features
n_features = input_train.shape[1]
n_train_samples = output_train.shape[0]
n_valid_samples = output_valid.shape[0]
n_test_samples = output_test.shape[0]

# One dictionary entry per layer. Dictionaries are used instead of 3D tensors because
# the set of features that is kept can differ from layer to layer.
# Every entry starts out as a zero array holding an n_layers-th of the samples.
input_train_NN = {layer: np.zeros((n_train_samples//n_layers, n_features)) for layer in range(n_layers)}
output_train_NN = {layer: np.zeros((n_train_samples//n_layers)) for layer in range(n_layers)}

input_valid_NN = {layer: np.zeros((n_valid_samples//n_layers, n_features)) for layer in range(n_layers)}
output_valid_NN = {layer: np.zeros((n_valid_samples//n_layers)) for layer in range(n_layers)}

input_test_NN = {layer: np.zeros((n_test_samples//n_layers, n_features)) for layer in range(n_layers)}
output_test_NN = {layer: np.zeros((n_test_samples//n_layers)) for layer in range(n_layers)}

In [5]:
# Split every array into n_layers consecutive, equally sized chunks (chunk i -> entry i).
# The chunk size is derived from n_layers rather than a hard-coded 27, so that it always
# matches the number of dictionary entries. Samples beyond n_layers * chunk size
# (the remainder of the integer division) are not assigned to any chunk.
train_chunk = n_train_samples//n_layers
valid_chunk = n_valid_samples//n_layers
test_chunk = n_test_samples//n_layers

for i in range(n_layers):
    input_train_NN[i] = input_train[train_chunk*i:train_chunk*(i+1)]
    output_train_NN[i] = output_train[train_chunk*i:train_chunk*(i+1)]
    input_valid_NN[i] = input_valid[valid_chunk*i:valid_chunk*(i+1)]
    output_valid_NN[i] = output_valid[valid_chunk*i:valid_chunk*(i+1)]
    input_test_NN[i] = input_test[test_chunk*i:test_chunk*(i+1)]
    output_test_NN[i] = output_test[test_chunk*i:test_chunk*(i+1)]

In [6]:
# For every layer, remove the input variables whose variance on that layer's training
# chunk is zero or NaN. The same columns are removed from the validation and test chunks.
#
# input_dim_NN[i] holds the number of input features that remain for layer i.
# input_dim keeps its previous meaning for backward compatibility: n_features minus the
# total number of columns removed over ALL layers (so it is not the input dimension of
# any single model).
input_dim_NN = {}
for i in range(n_layers):
    vars_to_remove = []
    # Iterate over the columns that are actually present, so that re-running this cell
    # after columns have already been removed does not index out of bounds.
    for j in range(input_train_NN[i].shape[1]):
        col_var = np.var(input_train_NN[i][:, j])
        if col_var == 0 or np.isnan(col_var):
            vars_to_remove.append(j)
    input_train_NN[i] = np.delete(input_train_NN[i], vars_to_remove, axis=1)
    input_valid_NN[i] = np.delete(input_valid_NN[i], vars_to_remove, axis=1)
    input_test_NN[i] = np.delete(input_test_NN[i], vars_to_remove, axis=1)
    input_dim_NN[i] = input_train_NN[i].shape[1]

# Derived from the resulting shapes, so the value is the same on every run of this cell
input_dim = n_features - sum(n_features - dim for dim in input_dim_NN.values())

In [7]:
# Sanity check of the standardization on the training chunk of layer 0:
# for each remaining feature, is |mean| < thresh and |variance - 1| < thresh?
thresh = 1e-5
first_chunk = input_train_NN[0]
[
    np.abs(np.mean(first_chunk[:, col])) < thresh
    and np.abs(np.var(first_chunk[:, col]) - 1) < thresh
    for col in range(first_chunk.shape[1])
]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

### Random Forest

In [13]:
# Suppress warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Fit one random forest per layer chunk, evaluate it on the matching test chunk and
# append the per-layer and the overall test MSE to the results file.

# Layers 0 and 1 are skipped, exactly as before.
# NOTE(review): the reason for skipping them is not visible in this notebook -- confirm.
first_layer = 2
# Number of decision trees per forest
n_trees = 47
# Maximum number of test samples passed to rf.predict at once
batch_size = 2**20
results_file = os.path.join(root_path,
                            'workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/RF_results.txt')

# Running sum of the per-layer test MSEs; it is divided by the number of fitted layers
# only when the overall MSE is written at the end.
mean_rf_mse = 0

for i in range(first_layer, n_layers):
    # Instantiate the model (fixed random_state for reproducibility)
    rf = RandomForestRegressor(n_estimators = n_trees, random_state = 10)

    # Train the model on training data
    rf.fit(input_train_NN[i], output_train_NN[i])

    # Predict in batches. Stepping through the start indices never produces an empty
    # batch (rf.predict rejects empty input), which the previous 1 + n//batch_size
    # iteration count did whenever the test size was an exact multiple of batch_size.
    # The batches are concatenated once instead of re-copying the array per batch.
    n_test = input_test_NN[i].shape[0]
    batch_predictions = [rf.predict(input_test_NN[i][start:start + batch_size])
                         for start in range(0, n_test, batch_size)]
    clc_predictions = np.concatenate(batch_predictions, axis=0)
    del batch_predictions
    gc.collect()

    rf_mse = mean_squared_error(output_test_NN[i], clc_predictions)

    # Append right away so that finished layers are recorded even if a later one fails
    with open(results_file, 'a') as file:
        file.write('The MSE on the test set of the neighborhood-based R2B4 RF on layer %d is %.2f.\n'%(i, rf_mse))
    mean_rf_mse += rf_mse

with open(results_file, 'a') as file:
    file.write('The overall MSE on the test set of the neighborhood-based R2B4 RF is %.2f.\n'%(mean_rf_mse/(n_layers-first_layer)))

CPU times: user 41min 55s, sys: 3.99 s, total: 41min 59s
Wall time: 41min 59s
