## Multiple linear regression

**For Table 3 of the paper**

Neighborhood-based QUBICC R2B5 model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow.nn as nn
import tensorflow as tf
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
import tensorflow as tf

# Add path with my_classes to sys.path
# NOTE(review): base_path is a hard-coded absolute, user-specific directory;
# it is also used below to build the data path.
base_path = '/pf/b/b309170'
# Prepend (index 0) so this directory is searched before other sys.path entries.
sys.path.insert(0, base_path + '/workspace_icon-ml/cloud_cover_parameterization/')

# Order matters: the module can only be imported after the sys.path change above.
import my_classes
# Reload so that edits to my_classes are picked up without restarting the kernel.
importlib.reload(my_classes)
from my_classes import write_infofile
from my_classes import load_data

import matplotlib.pyplot as plt
import time

# NOTE(review): NUM is not referenced in any of the visible cells -- confirm whether it is still needed.
NUM = 1

In [2]:
# Restrict TensorFlow to the first physical GPU (prevents crashes of the code).
# `gpus` keeps the full list of physical GPUs because the next cell iterates over it.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
else:
    # Without this guard, gpus[0] raises an IndexError on a machine without a GPU.
    print('No GPU found; TensorFlow will run on the CPU.')

In [3]:
# Enable memory growth on every detected GPU: TensorFlow then allocates GPU
# memory as it is needed instead of reserving it up front.
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [4]:
# Standardizes each feature (zero mean, unit variance); it is fitted on the input data further below.
scaler = StandardScaler()

In [5]:
# Directory that holds the QUBICC input/output arrays
path_data = base_path + '/my_work/icon-ml_data/cloud_cover_parameterization/region_based_one_nn_R02B05/based_on_var_interpolated_data'

# The data is not yet normalized at this point.
# mmap_mode='r' memory-maps the files read-only instead of reading them into RAM up front.
input_data = np.load(
    path_data + '/cloud_cover_input_qubicc.npy',
    mmap_mode='r',
)
output_data = np.load(
    path_data + '/cloud_cover_output_qubicc.npy',
    mmap_mode='r',
)

In [6]:
# Unpack the 2-D shape of the input array and display it as the cell output
samples_total, no_of_features = input_data.shape
samples_total, no_of_features

(1176638142, 26)

In [7]:
# input_data = np.concatenate((input_narval, input_qubicc), axis=0)
# output_data = np.concatenate((output_narval, output_qubicc), axis=0)

In [8]:
samples_total, no_of_features = input_data.shape
# Guard against transposed data: there must be more samples than features
assert samples_total > no_of_features

In [9]:
# Fit the scaler on the input data, shape (samples_total, no_of_features), and standardize it
input_data_scaled = scaler.fit_transform(input_data)
# One mean (and std) per feature, so the scaling is applied feature-wise
assert scaler.mean_.shape[0] == no_of_features

### Training the multiple linear model on the entire data set

In [10]:
t0 = time.time()

# Ordinary least squares fit on the full, scaled data set
# (the optimal multiple linear regression model)
lin_reg = LinearRegression().fit(input_data_scaled, output_data)

# Wall-clock duration of the fit in seconds
elapsed = time.time() - t0
print(elapsed)

491.26888513565063


In [11]:
# Evaluate the fitted linear model on the same data it was trained on
clc_predictions = lin_reg.predict(input_data_scaled)
lin_mse = mean_squared_error(y_true=output_data, y_pred=clc_predictions)

message = 'The mean squared error of the linear model is %.2f.' % lin_mse
print(message)

The mean squared error of the linear model is 297.63.


### Zero Output Model

In [12]:
# MSE of a model that always outputs 0: the mean of the squared targets,
# accumulated in float64
np.mean(np.square(output_data), dtype=np.float64)

692.9528578143779

### Constant Output Model

In [13]:
# Best constant prediction under MSE is the mean of the targets
mean = np.mean(output_data, dtype=np.float64)
# MSE of that constant model, i.e. the mean squared deviation from the mean
np.mean(np.square(output_data - mean), dtype=np.float64)

558.2758760179522

### Randomly initialized neural network

In [14]:
def _kernel_regularizer():
    """Return a new L1/L2 kernel regularizer with the coefficients used by every Dense layer."""
    return l1_l2(l1=0.004749, l2=0.008732)


# Untrained network: three hidden layers with 64 units each and a single linear output unit.
model = Sequential([
    # First hidden layer (also fixes the input dimension)
    Dense(units=64, activation='tanh', input_dim=no_of_features,
          kernel_regularizer=_kernel_regularizer()),

    # Second hidden layer, followed by batch normalization
    Dense(units=64, activation=nn.leaky_relu, kernel_regularizer=_kernel_regularizer()),
    # Dropout(0.221),  # disabled; a rate of 0.221 would drop about 22% of the hidden nodes
    BatchNormalization(),

    # Third hidden layer
    Dense(units=64, activation='tanh', kernel_regularizer=_kernel_regularizer()),
    # Dropout(0.221),  # disabled; a rate of 0.221 would drop about 22% of the hidden nodes

    # Output layer
    Dense(1, activation='linear', kernel_regularizer=_kernel_regularizer()),
])

# Compiled only so the model is usable; no training happens in this cell.
model.compile(loss='mse', optimizer=Nadam())

In [15]:
# Predict with `model` on the scaled inputs, one batch at a time.
# NOTE(review): the previous comment here referred to `model_fold_3` (implemented in ICON-A),
# which is not defined in the visible cells -- confirm that it was a leftover.
batch_size = 2**20

# Ceiling division: exactly as many batches as needed. The former
# `1 + samples_total//batch_size` produced an empty trailing batch whenever
# samples_total was an exact multiple of batch_size.
num_batches = -(-samples_total // batch_size)

# Collect the per-batch outputs and concatenate once at the end. Concatenating
# inside the loop copied all previous predictions on every iteration.
batch_predictions = []
for i in range(num_batches):
    batch = input_data_scaled[i*batch_size:(i+1)*batch_size]
    batch_predictions.append(model.predict_on_batch(batch))
    # Kept from the original cell: per-batch cleanup to limit memory use
    K.clear_session()
    gc.collect()

# Stack along the sample axis
clc_predictions = np.concatenate(batch_predictions, axis=0)
# Release the per-batch copies now that they are merged
del batch_predictions

In [16]:
# NOTE(review): despite its name, `lin_mse` holds the neural network's MSE here;
# the name is kept in case other cells read it.
# Column 0 of the predictions is compared against the targets.
lin_mse = mean_squared_error(y_true=output_data, y_pred=clc_predictions[:, 0])

message = 'The mean squared error of the randomly initialized neural network is %.2f.' % lin_mse
print(message)

The mean squared error of the randomly initialized neural network is 699.21.
