## Multiple linear regression

**For Table 3 of the paper**

Column-based QUBICC R2B5 model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow.nn as nn
import tensorflow as tf
import gc
import numpy as np
import os

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

import matplotlib.pyplot as plt

In [2]:
# Restrict TensorFlow to the first physical GPU when one is available.
# On a machine with no visible GPU the list is empty, so indexing gpus[0]
# unconditionally would raise IndexError; in that case TensorFlow is left on
# its default devices.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')

In [3]:
# Let TensorFlow grow its GPU memory allocation on demand for every detected
# GPU instead of reserving it all up front.
for device in gpus:
    tf.config.experimental.set_memory_growth(device, enable=True)

In [4]:
import time

In [5]:
# Root directory of the user workspace and the sub-directory (relative to it)
# holding the column-based QUBICC R02B05 training data.
path = '/pf/b/b309170'
data_subdir = '/my_work/icon-ml_data/cloud_cover_parameterization/grid_column_based_QUBICC_R02B05/based_on_var_interpolated_data'
path_data = ''.join((path, data_subdir))

In [6]:
# order_of_vars_narval = ['qv', 'qc', 'qi', 'temp', 'pres', 'zg', 'fr_land', 'clc']

In [7]:
# Open both arrays as read-only memory maps (nothing is read into RAM yet)
# and swap their two axes, so the on-disk layout is transposed relative to
# what the rest of the notebook works with.
input_data = np.load(path_data + '/cloud_cover_input_qubicc.npy', mmap_mode='r').T
output_data = np.load(path_data + '/cloud_cover_output_qubicc.npy', mmap_mode='r').T

In [8]:
# Rows are samples, columns are input features; the raw data set is expected
# to provide exactly 163 features per sample.
samples_total, no_of_features = np.shape(input_data)
assert no_of_features == 163

Remove columns that are constant in at least one of the training folds

In [9]:
# These features correspond to qc_4, qc_5, qc_6, qc_7, qc_8, qc_9, zg_4, zg_5, zg_6
remove_fields = [27, 28, 29, 30, 31, 32, 135, 136, 137]

# Only drop the columns while the array still has its raw width. Without this
# guard, re-running the cell would delete nine further (wrong) columns and
# decrement the feature count a second time.
no_of_raw_features = 163
if input_data.shape[1] == no_of_raw_features:
    input_data = np.delete(input_data, remove_fields, axis=1)

# Derive the count from the array itself so it cannot drift from the data.
no_of_features = input_data.shape[1]
assert no_of_features == no_of_raw_features - len(remove_fields)

### Training the multiple linear model on the entire data set

In [10]:
# Scaler used to standardize the input features before the regression.
scaler = StandardScaler()

In [11]:
# Fit the scaler on the complete input set, then apply it to that same data.
# fit() returns the scaler itself, so the two steps can be chained.
input_data_scaled = scaler.fit(input_data).transform(input_data)

In [12]:
# Fit an ordinary least-squares model on the standardized inputs and report
# the wall-clock time of the fit in seconds.
t0 = time.time()

# The optimal multiple linear regression model
lin_reg = LinearRegression().fit(input_data_scaled, output_data)

elapsed_seconds = time.time() - t0
print(elapsed_seconds)

730.4069516658783


In [13]:
# Loss of this optimal multiple linear regression model
clc_predictions = lin_reg.predict(input_data_scaled)
lin_mse = mean_squared_error(output_data, clc_predictions)
print('The mean squared error of the linear model is %.2f.'%lin_mse) 

The mean squared error of the linear model is 97.81.


### Zero Output Model

In [14]:
# Mean squared error of a model that always predicts zero, i.e. the mean of
# the squared targets, accumulated in float64.
np.mean(np.square(output_data), dtype=np.float64)

537.2421871927486

### Constant Output Model

In [15]:
# Display the dimensions of the target array.
output_data.shape

(168550971, 27)

In [16]:
# Average of every output column taken over all samples (axis 0).
mean = output_data.mean(axis=0)

In [17]:
# Mean squared error of a model that always predicts `mean`, accumulated in
# float64.
np.mean(np.square(output_data - mean), dtype=np.float64)

431.2802232851945

### Randomly initialized neural network

In [18]:
# Fully connected network: two ReLU hidden layers of equal width followed by
# a linear layer with 27 outputs. It is compiled only; no training happens in
# this cell, so the weights keep their initial values.
hidden_units = 256
model = Sequential([
    Dense(hidden_units, activation='relu', input_dim=no_of_features),
    Dense(hidden_units, activation='relu'),
    Dense(27, activation='linear'),
])
model.compile(optimizer=Nadam(), loss='mse')

In [19]:
# Run `model` over the standardized inputs in fixed-size batches so that the
# whole data set never has to pass through the network at once.
batch_size = 2**20
n_samples = input_data_scaled.shape[0]

# Stepping through the start offsets yields exactly ceil(n_samples / batch_size)
# batches, so no empty trailing batch is produced when n_samples is an exact
# multiple of batch_size.
batch_predictions = []
for start in range(0, n_samples, batch_size):
    batch_predictions.append(
        model.predict_on_batch(input_data_scaled[start:start + batch_size])
    )
    # Kept from the original cell: release Keras/Python state between batches.
    # NOTE(review): presumably a workaround for memory growth — confirm it is still needed.
    K.clear_session()
    gc.collect()

# Join the batches once at the end; concatenating inside the loop would copy
# the accumulated predictions again on every iteration.
clc_predictions = np.concatenate(batch_predictions, axis=0)

In [20]:
# Mean squared error of the network's predictions. It is stored under its own
# name so that it does not overwrite `lin_mse`, the linear-regression result.
random_nn_mse = mean_squared_error(output_data, clc_predictions)
print('The mean squared error of the randomly initialized neural network is %.2f.' % random_nn_mse)

The mean squared error of the randomly initialized neural network is 471.17.
