## Random Forest

**For Table 3 of the paper**

Cell-based QUBICC R2B5 model

A run with `n_estimators = 1` takes 6h 36s.

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow as tf
import tensorflow.nn as nn
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys
import joblib

# sklearn has to be imported before tensorflow (static thread-local storage);
# the sklearn imports at the top of this cell already satisfy that ordering.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import load_model

# Base directory on the cluster
path = '/home/b/b309170'
# Directory with the raw cloud cover input/output arrays.
# In the visible part of this notebook it is only referenced by the commented-out preprocessing cells.
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/based_on_var_interpolated_data'

import matplotlib.pyplot as plt
import time

# NOTE(review): NUM is not used anywhere in the visible part of this notebook — confirm whether it is still needed
NUM = 1

In [2]:
# Prevents crashes of the code: restrict TensorFlow to the first GPU.
# The guard keeps the notebook runnable on nodes without a GPU, where indexing
# gpus[0] would raise an IndexError even though the random forest itself runs on the CPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

In [3]:
# Enable memory growth on every GPU listed in `gpus`, so that the memory TensorFlow
# allocates can grow as needed (intended to limit the overall memory usage).
# The loop body is skipped when `gpus` is empty.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [4]:
# Scaler for standardizing the input features.
# In the visible part of this notebook it is only used by the commented-out preprocessing cells.
scaler = StandardScaler()

In [5]:
# # Data is not yet normalized
# input_data = np.load(path_data + '/cloud_cover_input_qubicc.npy', mmap_mode='r')
# output_data = np.load(path_data + '/cloud_cover_output_qubicc.npy', mmap_mode='r')

In [6]:
# (samples_total, no_of_features) = input_data.shape
# assert no_of_features < samples_total # Making sure there's no mixup

In [7]:
# # Split into training and validation (need split 2)
# training_folds = []
# validation_folds = []
# two_week_incr = samples_total//6

# for i in range(3):
#     # Note that this is a temporal split since time was the first dimension in the original tensor
#     first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
#     second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

#     validation_folds.append(np.append(first_incr, second_incr))
#     training_folds.append(np.arange(samples_total))
#     training_folds[i] = np.delete(training_folds[i], validation_folds[i])

In [8]:
# # Need the second split

# #Standardize according to the fold
# scaler.fit(input_data[training_folds[1]])

# #Load the data for the respective fold and convert it to tf data
# input_train = scaler.transform(input_data[training_folds[1]])
# input_valid = scaler.transform(input_data[validation_folds[1]]) 
# output_train = output_data[training_folds[1]]
# output_valid = output_data[validation_folds[1]]

# np.save('RFs/cell_based_R2B5_input_train.npy', input_train)
# np.save('RFs/cell_based_R2B5_input_valid.npy', input_valid)
# np.save('RFs/cell_based_R2B5_output_train.npy', output_train)
# np.save('RFs/cell_based_R2B5_output_valid.npy', output_valid)

In [9]:
# Load the training/validation arrays of the cell-based R2B5 data set.
# The file names match those written by the commented-out preprocessing cell
# (presumably the already standardized second split — confirm).
# The directory is built from `path` instead of repeating the absolute path four times.
path_rfs = os.path.join(path, 'workspace_icon-ml/iconml_clc/additional_content/baselines/RFs')

input_train = np.load(os.path.join(path_rfs, 'cell_based_R2B5_input_train.npy'))
input_valid = np.load(os.path.join(path_rfs, 'cell_based_R2B5_input_valid.npy'))
output_train = np.load(os.path.join(path_rfs, 'cell_based_R2B5_output_train.npy'))
output_valid = np.load(os.path.join(path_rfs, 'cell_based_R2B5_output_valid.npy'))

# Fail early if the files on disk do not belong together
assert input_train.shape[0] == output_train.shape[0], 'Training inputs/outputs differ in sample count'
assert input_valid.shape[0] == output_valid.shape[0], 'Validation inputs/outputs differ in sample count'
assert input_train.shape[1] == input_valid.shape[1], 'Training/validation inputs differ in feature count'

### Random Forest

In [10]:
# Random forest hyperparameters
n_est = 5  # number of trees (passed to RandomForestRegressor as n_estimators)
md = 8  # maximum tree depth (passed to RandomForestRegressor as max_depth)

**Size on disk**

In [11]:
# Depth-dependent contribution of a single tree: 2**7 + 2**8 + ... + 2**(7 + md - 1)
depth_term = sum(2**(7 + level) for level in range(md))

# Expected/maximal size on disk
1186 + n_est*(322 + depth_term)

165996

In [12]:
# Instantiate the model with n_est decision trees of maximum depth md (fixed random_state)
rf = RandomForestRegressor(n_estimators = n_est, max_depth = md, random_state = 10)

# Train the model on training data
rf.fit(input_train, output_train)

RandomForestRegressor(max_depth=8, n_estimators=5, random_state=10)

In [13]:
# Store the fitted forest without compression (compress=0); the next cell reads the resulting file size
joblib.dump(rf, "/home/b/b309170/scratch/cell_based_R2B5_uncompressed_smaller_md_8.joblib", compress=0) 

['/home/b/b309170/scratch/cell_based_R2B5_uncompressed_smaller_md_8.joblib']

In [14]:
# Size of the stored model file in bytes.
# Should be close to the expected/maximal size estimated above (165996 for n_est = 5, md = 8).
os.path.getsize('/home/b/b309170/scratch/cell_based_R2B5_uncompressed_smaller_md_8.joblib')

166238

In [15]:
# Depth reached by each fitted tree of the forest
[estimator.tree_.max_depth for estimator in rf.estimators_]

[8, 8, 8, 8, 8]

In [16]:
# NOTE(review): the original comment here read "model_fold_3 is implemented in ICON-A";
# it does not appear to relate to this random forest — confirm or remove.

# Predict on the validation set in chunks of batch_size samples
batch_size = 2**20

# Iterating over the start indices never yields an empty trailing slice.
# (range(1 + n//batch_size) produces one whenever n is an exact multiple of
# batch_size, and rf.predict rejects arrays with zero samples.)
prediction_batches = []
for start in range(0, input_valid.shape[0], batch_size):
    prediction_batches.append(rf.predict(input_valid[start:start + batch_size]))
    # NOTE(review): K.clear_session() only resets Keras state and the model here is a
    # scikit-learn forest, so this call is likely unnecessary — confirm before removing.
    K.clear_session()
    gc.collect()

# Concatenate once at the end instead of copying the growing array in every iteration
clc_predictions = np.concatenate(prediction_batches, axis=0)

In [17]:
# Mean squared error of the random forest predictions on the validation set
mse_rf = mean_squared_error(output_valid, clc_predictions)

# Append the result to the results file (append mode keeps earlier entries).
# The maximum depth is taken from md instead of being hard-coded, so the log line
# stays correct when the hyperparameter is changed; for md = 8 the text is unchanged.
results_file = os.path.join(path, 'workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/RF_results.txt')
with open(results_file, 'a') as file:
    file.write('The MSE on the validation set of the smaller cell-based R2B5 RF with md of %d is %.2f.\n'%(md, mse_rf))