## Multiple linear regression

**For Table 3 of the paper**

Neighborhood-based NARVAL R2B4 model

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

# Directory under which the workspace_icon-ml folder lives; used to build the import path below
base_path = '/pf/b/b309170'

# Add path with my_classes to sys.path
sys.path.insert(0, base_path + '/workspace_icon-ml/cloud_cover_parameterization/')

# my_classes is reloaded right after the import, presumably so that edits to the
# module are picked up without restarting the kernel.
import my_classes
importlib.reload(my_classes)
from my_classes import write_infofile
from my_classes import load_data

import matplotlib.pyplot as plt
import time

# NOTE(review): NUM is not referenced in any of the cells visible here. It may
# correspond to the `_1` suffix of the data file names -- confirm or remove.
NUM = 1

In [3]:
# All data and model locations are given relative to this root directory.
root_path = '/pf/b/b309170'


def _under_root(relative_path):
    """Return relative_path joined onto root_path."""
    return os.path.join(root_path, relative_path)


# Directory with the .npy input/output arrays
data_path = _under_root(
    'my_work/icon-ml_data/cloud_cover_parameterization/region_based/based_on_var_interpolated_data')
# Directory with the saved models
model_path = _under_root(
    'workspace_icon-ml/cloud_cover_parameterization/region_based/saved_models')
# Info file located in the saved-models directory
info_file = _under_root(
    'workspace_icon-ml/cloud_cover_parameterization/region_based/saved_models/model_region_based_final_1.txt')

n_layers = 27 # Is also the number of NNs

In [4]:
# Load all data (Data is already normalized w.r.t. training data)
# The files are read in the order listed; each input array is followed by the
# output array of the same split.
(input_train, output_train,
 input_valid, output_valid,
 input_test, output_test) = (
    np.load(os.path.join(data_path, file_name))
    for file_name in (
        'cloud_cover_input_train_1.npy',
        'cloud_cover_output_train_1.npy',
        'cloud_cover_input_valid_1.npy',
        'cloud_cover_output_valid_1.npy',
        'cloud_cover_input_test_1.npy',
        'cloud_cover_output_test_1.npy',
    )
)

In [5]:
## Load data pertaining to a specific NN
# Number of samples in each split and number of input features
n_train_samples, n_valid_samples, n_test_samples = (
    split.shape[0] for split in (output_train, output_valid, output_test)
)
n_features = input_train.shape[1]


def _zeros_per_layer(n_samples, *trailing_dims):
    """Return {layer index: zero array of shape (n_samples//n_layers, *trailing_dims)}."""
    shape = (n_samples//n_layers, *trailing_dims)
    return {layer: np.zeros(shape) for layer in range(n_layers)}


# Load the data into dictionaries. Can't use 3D tensors here as some features will be removed depending on the NN.
# Inputs are 2D (samples per layer, features); outputs are 1D (samples per layer).
input_train_NN = _zeros_per_layer(n_train_samples, n_features)
output_train_NN = _zeros_per_layer(n_train_samples)

input_valid_NN = _zeros_per_layer(n_valid_samples, n_features)
output_valid_NN = _zeros_per_layer(n_valid_samples)

input_test_NN = _zeros_per_layer(n_test_samples, n_features)
output_test_NN = _zeros_per_layer(n_test_samples)

In [6]:
# Split every data set into n_layers contiguous, equally sized chunks; chunk i is
# stored under key i of the per-layer dictionaries.
# The chunk size is derived from n_layers instead of a hard-coded 27, so that it
# cannot get out of sync with the number of layers used everywhere else.
# Because of the integer division, up to n_layers-1 trailing samples of a split
# are not assigned to any chunk.
train_chunk = n_train_samples//n_layers
valid_chunk = n_valid_samples//n_layers
test_chunk = n_test_samples//n_layers

for i in range(n_layers):
    start_ind_train = train_chunk*i
    end_ind_train = train_chunk*(i+1)
    start_ind_valid = valid_chunk*i
    end_ind_valid = valid_chunk*(i+1)
    start_ind_test = test_chunk*i
    end_ind_test = test_chunk*(i+1)

    input_train_NN[i] = input_train[start_ind_train:end_ind_train]
    output_train_NN[i] = output_train[start_ind_train:end_ind_train]
    input_valid_NN[i] = input_valid[start_ind_valid:end_ind_valid]
    output_valid_NN[i] = output_valid[start_ind_valid:end_ind_valid]
    input_test_NN[i] = input_test[start_ind_test:end_ind_test]
    output_test_NN[i] = output_test[start_ind_test:end_ind_test]

In [7]:
# We remove the input variables with zero (or NaN) variance, separately for each layer.
# The columns to drop are determined on the training data of a layer; the same
# columns are then dropped from that layer's validation and test data.
#
# input_dim_NN[i] is the number of input features that remain for the i-th NN.
# input_dim is kept with its previous meaning for backward compatibility:
# n_features minus the number of removed columns summed over ALL layers. It is
# therefore not the input dimension of any single NN; use input_dim_NN[i] instead.
input_dim = n_features
input_dim_NN = {}
for i in range(n_layers):
    # Evaluate each column variance only once. Iterating over the current number
    # of columns (rather than n_features) keeps this cell re-runnable after
    # columns have already been removed.
    variances = [np.var(input_train_NN[i][:, j]) for j in range(input_train_NN[i].shape[1])]
    vars_to_remove = [j for j, variance in enumerate(variances)
                      if variance == 0 or np.isnan(variance)]
    input_dim -= len(vars_to_remove)

    input_train_NN[i] = np.delete(input_train_NN[i], vars_to_remove, axis=1)
    input_valid_NN[i] = np.delete(input_valid_NN[i], vars_to_remove, axis=1)
    input_test_NN[i] = np.delete(input_test_NN[i], vars_to_remove, axis=1)

    input_dim_NN[i] = input_train_NN[i].shape[1]

In [22]:
# Checking standardization:
# For every remaining feature column of layer 0: is the mean within thresh of 0
# and the variance within thresh of 1?
thresh = 1e-5
[abs(np.mean(column)) < thresh and abs(np.var(column) - 1) < thresh for column in input_train_NN[0].T]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

### Training the multiple linear model on the entire data set

In [9]:
# For every layer, stack the training, validation and test samples (in that
# order) along the sample axis. Index i of each list belongs to layer i.
input_data = [
    np.concatenate((input_train_NN[layer], input_valid_NN[layer], input_test_NN[layer]), axis=0)
    for layer in range(n_layers)
]
output_data = [
    np.concatenate((output_train_NN[layer], output_valid_NN[layer], output_test_NN[layer]), axis=0)
    for layer in range(n_layers)
]

In [24]:
t0 = time.time()

# Running sum of the per-layer MSEs; divided by the number of evaluated layers below
mean_lin_mse = 0

# Leave out the two upper-most layers
for layer in range(2, n_layers):
    features, targets = input_data[layer], output_data[layer]

    # The optimal multiple linear regression model for this layer
    lin_reg = LinearRegression().fit(features, targets)

    # Loss of this model, evaluated on the same data it was fitted on
    clc_predictions = lin_reg.predict(features)
    lin_mse = mean_squared_error(targets, clc_predictions)
    print(f'The mean squared error of the linear model is {lin_mse:.2f}.')

    mean_lin_mse += lin_mse

# Elapsed wall-clock time in seconds
print(time.time() - t0)
print(f'The overall mean MSE is: {mean_lin_mse/(n_layers-2):.3f}')

The mean squared error of the linear model is 0.16.
The mean squared error of the linear model is 2.30.
The mean squared error of the linear model is 9.69.
The mean squared error of the linear model is 17.96.
The mean squared error of the linear model is 16.33.
The mean squared error of the linear model is 10.57.
The mean squared error of the linear model is 6.03.
The mean squared error of the linear model is 4.76.
The mean squared error of the linear model is 3.25.
The mean squared error of the linear model is 2.45.
The mean squared error of the linear model is 2.36.
The mean squared error of the linear model is 2.48.
The mean squared error of the linear model is 1.94.
The mean squared error of the linear model is 1.37.
The mean squared error of the linear model is 1.54.
The mean squared error of the linear model is 2.15.
The mean squared error of the linear model is 3.14.
The mean squared error of the linear model is 5.06.
The mean squared error of the linear model is 7.67.
The mean 

### Zero Output Model

In [29]:
# Running sum of the per-layer MSEs of a model that always outputs zero
mean_zero_output = 0

# The two upper-most layers are skipped, as for the linear model
for layer in range(2, n_layers):
    targets = output_data[layer]
    # MSE against a zero prediction is the mean of the squared targets;
    # the mean is accumulated in float64
    zero_output_mse = np.mean(np.square(targets), dtype=np.float64)
    mean_zero_output += zero_output_mse

print(mean_zero_output/(n_layers-2))

113.37236371975237


### Constant Output Model

In [30]:
# Running sum of the per-layer MSEs of a model that always outputs the layer mean
mean_constant_output = 0

# The two upper-most layers are skipped, as for the linear model
for layer in range(2, n_layers):
    targets = output_data[layer]
    layer_mean = np.mean(targets, dtype=np.float64)
    deviations = targets - layer_mean
    # Mean squared deviation from the layer mean, accumulated in float64
    constant_output_mse = np.mean(deviations**2, dtype=np.float64)
    mean_constant_output += constant_output_mse

print(mean_constant_output/(n_layers-2))

86.48432970951315


### Randomly initialized neural network

In [35]:
# Suppress warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Number of samples passed to the network per prediction call.
# NOTE(review): the original comment at this spot read "model_fold_3 is implemented
# in ICON-A"; no model_fold_3 appears in this cell, so it was presumably carried
# over from another notebook -- confirm.
batch_size = 2**20

# Running sum of the per-layer MSEs; divided by the number of evaluated layers below
mean_nn_mse = 0

# Evaluate an untrained (randomly initialized) network on every layer except the
# two upper-most ones. No random seed is set in this cell, so the printed values
# vary from run to run.
for i in range(2, n_layers):
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim = input_data[i].shape[1]))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mse', optimizer=Nadam())

    # Predict batch by batch. Stepping through the start offsets avoids an empty
    # final batch when the sample count is an exact multiple of batch_size, and a
    # single concatenation avoids re-copying the accumulated predictions per batch.
    batch_predictions = [model.predict_on_batch(input_data[i][start:start+batch_size])
                         for start in range(0, input_data[i].shape[0], batch_size)]
    clc_predictions = np.concatenate(batch_predictions, axis=0)

    # The network has a single output unit, hence the [:, 0]
    nn_mse = mean_squared_error(output_data[i], clc_predictions[:, 0])
    print('The mean squared error of the randomly initialized neural network is %.2f.'%nn_mse)

    mean_nn_mse += nn_mse

    # Release the Keras global state once this layer's model is no longer needed,
    # instead of clearing it between prediction batches while the model is in use.
    del batch_predictions
    K.clear_session()
    gc.collect()

print('The overall mean is %.3f'%(mean_nn_mse/(n_layers-2)))

The mean squared error of the randomly initialized neural network is 1.19.
The mean squared error of the randomly initialized neural network is 27.70.
The mean squared error of the randomly initialized neural network is 129.42.
The mean squared error of the randomly initialized neural network is 268.78.
The mean squared error of the randomly initialized neural network is 282.07.
The mean squared error of the randomly initialized neural network is 196.40.
The mean squared error of the randomly initialized neural network is 127.86.
The mean squared error of the randomly initialized neural network is 103.37.
The mean squared error of the randomly initialized neural network is 67.30.
The mean squared error of the randomly initialized neural network is 52.54.
The mean squared error of the randomly initialized neural network is 60.23.
The mean squared error of the randomly initialized neural network is 68.45.
The mean squared error of the randomly initialized neural network is 52.17.
The mea