## Random Forest

**For Table 3 of the paper**

Neighborhood-based QUBICC R2B5 model

Requires more than 8 hours for n_estimators = 1

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow.nn as nn
import tensorflow as tf
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys
import joblib

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# from tensorflow.keras.optimizers import Nadam
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, BatchNormalization
# import tensorflow as tf

# Add path with my_classes to sys.path
base_path = '/home/b/b309170'

import matplotlib.pyplot as plt
import time

NUM = 1

In [2]:
# Prevents crashes of the code: restrict TensorFlow to the first GPU only.
# The random forest is trained by scikit-learn on the CPU, so a node without
# any GPU must not make this cell fail with an IndexError on gpus[0].
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

In [3]:
# Let TensorFlow grow its GPU memory allocation on demand rather than
# reserving everything up front (limits memory usage overall)
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [4]:
# Standardizer for the input features. In this notebook it is only referenced
# by the commented-out preprocessing cells that originally produced the
# pre-scaled .npy files.
scaler = StandardScaler()

In [5]:
# path_data = base_path + '/my_work/icon-ml_data/cloud_cover_parameterization/region_based_one_nn_R02B05/based_on_var_interpolated_data'

# # Data is not yet normalized
# input_data = np.load(path_data + '/cloud_cover_input_qubicc.npy', mmap_mode='r')
# output_data = np.load(path_data + '/cloud_cover_output_qubicc.npy', mmap_mode='r')

In [6]:
# (samples_total, no_of_features) = input_data.shape
# print((samples_total, no_of_features))

# assert no_of_features < samples_total # Making sure there's no mixup

In [7]:
# training_folds = []
# validation_folds = []
# two_week_incr = samples_total//6

# for i in range(3):
#     # Note that this is a temporal split since time was the first dimension in the original tensor
#     first_incr = np.arange(samples_total//6*i, samples_total//6*(i+1))
#     second_incr = np.arange(samples_total//6*(i+3), samples_total//6*(i+4))

#     validation_folds.append(np.append(first_incr, second_incr))
#     training_folds.append(np.arange(samples_total))
#     training_folds[i] = np.delete(training_folds[i], validation_folds[i])

In [8]:
# # Need the third split

# #Standardize according to the fold
# scaler.fit(input_data[training_folds[2]])

# #Load the data for the respective fold and convert it to tf data
# input_train = scaler.transform(input_data[training_folds[2]])
# input_valid = scaler.transform(input_data[validation_folds[2]]) 
# output_train = output_data[training_folds[2]]
# output_valid = output_data[validation_folds[2]]

# np.save('RFs/neighborhood_based_R2B5_input_train.npy', input_train)
# np.save('RFs/neighborhood_based_R2B5_input_valid.npy', input_valid)
# np.save('RFs/neighborhood_based_R2B5_output_train.npy', output_train)
# np.save('RFs/neighborhood_based_R2B5_output_valid.npy', output_valid)

In [None]:
# Load the train/validation arrays of the neighborhood-based R2B5 data
# that were written to disk beforehand.
input_train, input_valid, output_train, output_valid = (
    np.load(array_path)
    for array_path in (
        '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/neighborhood_based_R2B5_input_train.npy',
        '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/neighborhood_based_R2B5_input_valid.npy',
        '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/neighborhood_based_R2B5_output_train.npy',
        '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/neighborhood_based_R2B5_output_valid.npy',
    )
)

### Random Forest

In [None]:
## Timing (on Mistral)

# n_estimators = 1, max_depth = 1: 2556
# n_estimators = 5, max_depth = 9: 2556*45 is close to 34 hrs

# Originally planned: n_estimators = 5, max_depth = 9 (that run did not finish; max_depth = 7 is used instead)

In [10]:
# Runtime notes from earlier attempts:
#   max_depth = 4: estimated 8 hours, actually took 12 hours
#   max_depth = 5: estimated 12 hours
#   max_depth = 9: estimated 20 hours, did not finish
rf = RandomForestRegressor(
    max_depth=7,
    n_estimators=5,
    random_state=42,  # fixed seed for the forest's random number generator
)

# Fit the forest on the training split
rf.fit(input_train, output_train)

CPU times: user 42min 21s, sys: 16.8 s, total: 42min 38s
Wall time: 42min 38s


RandomForestRegressor(max_depth=1, n_estimators=1, random_state=42)

In [8]:
# Persist the fitted forest to scratch; compress=0 writes it uncompressed
joblib.dump(
    rf,
    "/home/b/b309170/scratch/neighborhood_based_R2B5_uncompressed_md_7.joblib",
    compress=0,
)

['/home/b/b309170/scratch/RF_compressed.joblib']

In [None]:
# rf = joblib.load("/home/b/b309170/scratch/neighborhood_based_R2B5_uncompressed_md_7.joblib")

In [None]:
# model_fold_3 is implemented in ICON-A
# NOTE(review): the line above does not seem to refer to anything defined in
# this notebook -- confirm or remove.
batch_size = 2**20

# Predict the validation set in batches of at most batch_size rows.
# Stepping through the row offsets never yields an empty trailing batch, even
# when the sample count is an exact multiple of batch_size (rf.predict raises
# on zero-row input).
batch_predictions = []
for start in range(0, input_valid.shape[0], batch_size):
    batch_predictions.append(rf.predict(input_valid[start:start + batch_size]))
    # Housekeeping between batches to keep memory usage down
    K.clear_session()
    gc.collect()

# Join the batches once at the end instead of re-copying a growing array
# on every iteration.
clc_predictions = np.concatenate(batch_predictions, axis=0)

In [None]:
# Mean squared error of the random forest predictions on the validation set
mse_rf = mean_squared_error(output_valid, clc_predictions)

# Append the score to the results file of the RF baselines
results_path = '/home/b/b309170/workspace_icon-ml/iconml_clc/additional_content/baselines/RFs/RF_results.txt'
summary_line = 'The MSE on the validation set of the neighborhood-based R2B5 RF is %.2f.\n'%mse_rf
with open(results_path, 'a') as results_file:
    results_file.write(summary_line)