## Random Forest

**For Table 3 of the paper**

Column-based QUBICC R2B5 model

Requires more than 8 hours for n_estimators = 1

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow.nn as nn
import tensorflow as tf
import gc
import numpy as np
import os
import joblib

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

import matplotlib.pyplot as plt

In [2]:
# Restrict TensorFlow to the first GPU (the original note says this prevents crashes).
# The call is guarded so that the notebook still runs on a machine without a GPU:
# indexing gpus[0] on an empty list would raise an IndexError, and none of the
# random-forest cells visible in this notebook use the GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

In [3]:
# Enable memory growth on every detected GPU so that TensorFlow may grow its
# allocation as needed (limits memory usage overall).
for physical_gpu in gpus:
    tf.config.experimental.set_memory_growth(physical_gpu, enable=True)

In [4]:
# Standardizer for the input features. In the visible notebook it is only used by the
# commented-out preprocessing cell further down; the active cells load arrays from disk.
scaler = StandardScaler()

In [4]:
# Home directory on the cluster; every data location below is derived from it.
path = '/home/b/b309170'
# Location of the raw QUBICC R02B05 column-based arrays. Only the commented-out
# preprocessing cells in the visible notebook read from this directory.
path_data = f'{path}/my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data'

In [6]:
# input_data = np.transpose(np.load(path_data + '/cloud_cover_input_qubicc.npy', mmap_mode='r'))
# output_data = np.transpose(np.load(path_data + '/cloud_cover_output_qubicc.npy', mmap_mode='r'))

In [7]:
# (samples_total, no_of_features) = input_data.shape
# assert no_of_features == 163

Remove columns that are constant in at least one of the training folds

In [8]:
# # These features correspond to qc_4, qc_5, qc_6, qc_7, qc_8, qc_9, zg_4, zg_5, zg_6
# remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]
# input_data = np.delete(input_data, remove_fields, axis=1)
# no_of_features = no_of_features - len(remove_fields)

In [9]:
# training_folds = []
# validation_folds = []
# two_week_incr = samples_total//6

# for i in range(3):
#     # Note that this is a temporal split since time was the first dimension in the original tensor
#     first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
#     second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

#     validation_folds.append(np.append(first_incr, second_incr))
#     training_folds.append(np.arange(samples_total))
#     training_folds[i] = np.delete(training_folds[i], validation_folds[i])

In [10]:
# # Need the second split

# #Standardize according to the fold
# scaler.fit(input_data[training_folds[1],:])

# #Load the data for the respective fold and convert it to tf data
# input_train = scaler.transform(input_data[training_folds[1]])
# input_valid = scaler.transform(input_data[validation_folds[1]])
# output_train = output_data[training_folds[1]]
# output_valid = output_data[validation_folds[1]]

# np.save('RFs/column_based_R2B5_input_train.npy', input_train)
# np.save('RFs/column_based_R2B5_input_valid.npy', input_valid)
# np.save('RFs/column_based_R2B5_output_train.npy', output_train)
# np.save('RFs/column_based_R2B5_output_valid.npy', output_valid)

In [5]:
# Directory that holds the cached training/validation arrays.
# NOTE(review): presumably these are the files written by the commented-out
# preprocessing cell above (standardized inputs of the second fold) - confirm.
rf_data_dir = '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs'

# Load the four arrays in a fixed order: train/valid inputs, then train/valid outputs.
input_train, input_valid, output_train, output_valid = (
    np.load(f'{rf_data_dir}/column_based_R2B5_{array_name}.npy')
    for array_name in ('input_train', 'input_valid', 'output_train', 'output_valid')
)

### Random Forest

In [None]:
## Timing (on Mistral)

# n_estimators = 1, max_depth = 1: 4040 s
# n_estimators = 5, max_depth = 6: estimated 4040 s * 30 = 121200 s, which is close to 34 hrs

# Using n_estimators = 5, max_depth = 5

In [None]:
# Timing notes from earlier single-threaded runs:
# max_depth = 5 didn't finish
# max_depth = 2 should theoretically finish in 8 hours
# max_depth = 3 should theoretically finish in 10 hours, it finished in 12 hours
# max_depth = 5 should theoretically finish in 16 hours
# NOTE(review): the first note says max_depth = 5 did not finish, yet this cell and the
# model file saved/loaded below both use max_depth = 5 - confirm which note is current.
#
# n_jobs = -1 builds the trees in parallel on all available cores. With a fixed
# random_state the fitted forest does not depend on n_jobs, so this only reduces the
# wall-clock time (at most n_estimators trees can be built concurrently).
rf = RandomForestRegressor(n_estimators = 5, max_depth = 5, random_state = 42, n_jobs = -1)

# Train the model on training data
rf.fit(input_train, output_train)

In [None]:
# Persist the fitted forest, uncompressed (compress=0), so that it can be reloaded
# without repeating the training.
joblib.dump(
    rf,
    "/home/b/b309170/scratch/column_based_R2B5_uncompressed_md_5.joblib",
    compress=0,
)

In [6]:
# Reload the previously saved forest instead of retraining it.
# joblib.load unpickles the file, so only load model files from a trusted source.
rf = joblib.load(
    "/home/b/b309170/scratch/column_based_R2B5_uncompressed_md_5.joblib"
)

In [7]:
# model_fold_3 is implemented in ICON-A
# NOTE(review): the comment above looks like a leftover from another notebook; the model
# evaluated here is the random forest `rf` - confirm and remove if stale.
batch_size = 2**20

# Predict on the validation set in chunks of at most batch_size rows.
# Stepping through the start offsets (instead of iterating 1 + n // batch_size times)
# avoids an empty trailing batch when the number of rows is an exact multiple of
# batch_size; rf.predict rejects an empty array.
# The per-batch results are collected in a list and concatenated once at the end, so
# earlier predictions are not re-copied on every iteration.
# The Keras session reset that used to be in this loop was dropped: no Keras model is
# involved in the random-forest prediction.
batch_predictions = []
for batch_start in range(0, input_valid.shape[0], batch_size):
    batch_predictions.append(rf.predict(input_valid[batch_start:batch_start + batch_size]))
    gc.collect()

clc_predictions = np.concatenate(batch_predictions, axis=0)

In [8]:
# Mean squared error of the random-forest predictions on the validation set.
mse_rf = mean_squared_error(output_valid, clc_predictions)

# Append (mode 'a') one summary line to the shared results file; earlier entries are kept.
results_line = 'The MSE on the validation set of the column-based R2B5 RF is %.2f.\n' % mse_rf
with open('/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/RF_results.txt', 'a') as results_file:
    results_file.write(results_line)