## Random Forest

**For Table 3 of the paper**

Cell-based NARVAL R2B4 RF

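We can quickly estimate the time required to train an RF with the default n_estimators = 100: train an RF with n_estimators = 1, measure the training time in seconds, and divide it by 36 (training time grows linearly with the number of trees, so 100 trees take 100 times as long, and 100/3600 = 1/36 converts the result to hours).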

--> Would need at least **10 hours** to train it on Mistral with default hyperparameters

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys
import pickle
import joblib

from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import load_model

# Home-directory prefix from which the data directory below is built.
# NOTE(review): absolute, user-specific path — adjust before running on another account/machine.
base_path = '/home/b/b309170'
# Directory containing the .npy arrays that the data-loading cell reads
path_data = base_path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_v3/based_on_var_interpolated_data'

import matplotlib.pyplot as plt
import time

# Integer suffix substituted into the data file names ('..._%d.npy')
NUM = 1

In [2]:
# All arrays were normalized beforehand (w.r.t. training data)
def _data_file(split_name):
    """Return the path of the .npy file holding one input/output split."""
    return path_data + '/cloud_cover_all_days_%s_%d.npy'%(split_name, NUM)

# The training inputs are opened as a read-only memory map; all other arrays are read fully
input_train = np.load(_data_file('input_train'), mmap_mode='r')
output_train = np.load(_data_file('output_train'))

input_valid = np.load(_data_file('input_valid'))
output_valid = np.load(_data_file('output_valid'))

input_test = np.load(_data_file('input_test'))
output_test = np.load(_data_file('output_test'))

In [3]:
# Display the dimensions of the memory-mapped training inputs
input_train.shape

(26482169, 6)

### Random Forest

The hyperparameters that influence computational time heavily are *n_estimators, max_depth, max_samples*.

In [16]:
## Timing (on Mistral)

# n_estimators = 1, max_depth = 1, min_samples_split = 6, max_leaf_nodes = 2, max_samples = 1: 0.8s
# n_estimators = 1, max_depth = 1, min_samples_split = 6, max_leaf_nodes = 2:                  16s
# n_estimators = 1, max_depth = 1, min_samples_split = 6, max_samples = 1:                     0.8s
# n_estimators = 1, max_depth = 1, max_samples = 1:                                            0.8s

# n_estimators = 1, max_depth = 1:                                                             16s
# n_estimators = 1, max_depth = 2:                                                             33s
# n_estimators = 1, max_depth = 3:                                                             46s
# n_estimators = 1, max_depth = 10:                                                            140s

# n_estimators = 2, max_depth = 1:                                                             33s
# n_estimators = 3, max_depth = 1:                                                             48s
# n_estimators = 5, max_depth = 1:                                                             81s
# n_estimators = 10, max_depth = 1:                                                            160s

# n_estimators = 1:                                                                            356s
# n_estimators = 2:                                                                            721s
# n_estimators = 3:                                                                            1101s
# n_estimators = 4:                                                                            1469s
# n_estimators = 5:                                                                            1813s
# n_estimators = 10:                                                                           3636s
# --> Increases linearly!

# a = [1,2,3,4,5,10]
# t_a = [356, 721, 1101, 1469, 1813, 3636]
# print('The trend is very linear: %s'%str(np.array(t_a)/np.array(a)))
# print('I thus expect that to train the default setting of n_estimators = 100 we need %d hours'%(100*363.6/3600))

In [17]:
# Instantiate the random forest: n_estimators sets the number of decision trees,
# random_state fixes the seed.
# NOTE(review): n_estimators = 340 here, whereas the timing estimate in this notebook refers to
# the default of 100 trees and the recorded output of this cell shows n_estimators=2 —
# confirm which setting was actually used for the saved model.
rf = RandomForestRegressor(n_estimators = 340, random_state = 10)

# Train the model on training data
rf.fit(input_train, output_train)

RandomForestRegressor(n_estimators=2, random_state=10)

In [8]:
# Persist the fitted forest without compression (compress=0)
rf_model_file = "/home/b/b309170/scratch/cell_based_R2B4_uncompressed.joblib"
joblib.dump(rf, rf_model_file, compress=0)

['/home/b/b309170/scratch/RF_compressed.joblib']

In [19]:
# Evaluate the random forest on the test inputs, batch_size rows per predict call.
batch_size = 2**20
num_test_samples = input_test.shape[0]

# Step through the data by start offset so that every slice is non-empty. The earlier
# `range(1 + n//batch_size)` added an empty final slice whenever n was an exact multiple
# of batch_size, and predict() rejects 0-sample input.
batch_predictions = []
for start in range(0, num_test_samples, batch_size):
    batch_predictions.append(rf.predict(input_test[start:start + batch_size]))
    # No K.clear_session() here: it only resets Keras state and this model is scikit-learn.
    gc.collect()

# Concatenate once at the end instead of re-copying the growing array in every iteration
clc_predictions = np.concatenate(batch_predictions, axis=0)

In [20]:
# Mean squared error of the RF predictions against the test targets
lin_mse = mean_squared_error(output_test, clc_predictions)

# Append (mode 'a') the score to the results text file
results_path = '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/RF_results.txt'
result_line = 'The MSE on the test set of the cell-based R2B4 RF is %.2f.\n'%lin_mse
with open(results_path, 'a') as results_file:
    results_file.write(result_line)

### Prediction timing RF vs NN

In [2]:
# Load the test inputs used by the timing loops in this section
test_input_file = path_data + '/cloud_cover_all_days_input_test_%d.npy'%NUM
input_test = np.load(test_input_file)

In [3]:
# Restore the random forest from its joblib file.
# NOTE: joblib.load unpickles the file contents — only load files from a trusted source.
rf_model_file = "/home/b/b309170/scratch/cell_based_R2B4_uncompressed.joblib"
rf = joblib.load(rf_model_file)

In [3]:
# Load the Keras neural network (.h5 file) that is timed against the random forest
nn_model_file = "/home/b/b309170/workspace_icon-ml/iconml_clc/n1_cell_based_narval_r2b4/saved_models/model_grid_cell_based_v3_final_1.h5"
nn = load_model(nn_model_file)

In [None]:
%%time
batch_size = 2**20

for i in range(1 + input_test.shape[0]//batch_size):
    if i == 0:
        clc_predictions = rf.predict(input_test[i*batch_size:(i+1)*batch_size])
    else:
        clc_predictions = np.concatenate((clc_predictions, rf.predict(input_test[i*batch_size:(i+1)*batch_size])), axis=0)
    K.clear_session()
    gc.collect()

In [6]:
%%time
batch_size = 2**20

for i in range(1 + input_test.shape[0]//batch_size):
    if i == 0:
        clc_predictions = nn.predict(input_test[i*batch_size:(i+1)*batch_size])
    else:
        clc_predictions = np.concatenate((clc_predictions, nn.predict(input_test[i*batch_size:(i+1)*batch_size])), axis=0)
    K.clear_session()
    gc.collect()

CPU times: user 4min 42s, sys: 23 s, total: 5min 5s
Wall time: 3min 12s
