## Multiple linear regression

**For Table 3 of the paper**

Cell-based QUBICC R2B5 model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
from tensorflow.keras.regularizers import l1_l2
import tensorflow as tf
import tensorflow.nn as nn
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys

#Import sklearn before tensorflow (static Thread-local storage)
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

# Base directory; the data folder and the project code are both addressed relative to it
path = '/pf/b/b309170'

# Put the folder that contains my_classes first on the module search path
_my_classes_dir = path + '/workspace_icon-ml/cloud_cover_parameterization/'
sys.path.insert(0, _my_classes_dir)

# Folder holding the .npy arrays that are loaded further below
path_data = path + '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_QUBICC_R02B05/based_on_var_interpolated_data'

import my_classes
importlib.reload(my_classes)
from my_classes import simple_sundqvist_scheme
from my_classes import write_infofile
from my_classes import load_data

import matplotlib.pyplot as plt
import time

# NOTE(review): NUM is not referenced in any of the visible cells - presumably a run/model
# identifier used by code outside this view; confirm before changing or removing it
NUM = 1

In [2]:
# Prevents crashes of the code: make only the first GPU visible to TensorFlow.
# On a machine without any GPU the list is empty and gpus[0] would raise an IndexError,
# so in that case nothing is restricted and TensorFlow falls back to the CPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
else:
    print('No GPU found - TensorFlow will run on the CPU.')

In [3]:
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving
# everything at start-up (keeps the overall memory usage down)
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [4]:
# Standardizer for the input features; it is fitted on input_data and applied to it further below
scaler = StandardScaler()

In [5]:
# The arrays on disk are not normalized yet. They are opened as read-only memory maps
# (mmap_mode='r'), so np.load does not read the full files into memory here.
_input_file = path_data + '/cloud_cover_input_qubicc.npy'
_output_file = path_data + '/cloud_cover_output_qubicc.npy'

input_data = np.load(_input_file, mmap_mode='r')
output_data = np.load(_output_file, mmap_mode='r')

In [6]:
# Rows are samples, columns are features
samples_total, no_of_features = input_data.shape

# Guard against a transposed array: there must be more samples than features
assert samples_total > no_of_features

In [7]:
# Standardize the inputs, which have shape (samples_total, no_of_features)
scaler.fit(input_data)

# The scaler keeps one mean (and one std) per feature column
assert scaler.mean_.shape[0] == no_of_features

input_data_scaled = scaler.transform(input_data)

### Training the multiple linear model on the entire data set

In [8]:
t0 = time.time()

# The optimal multiple linear regression model, fitted on the standardized inputs
# (fit() returns the estimator itself, so construction and fitting can be chained)
lin_reg = LinearRegression().fit(input_data_scaled, output_data)

# Wall-clock seconds spent on the fit
elapsed = time.time() - t0
print(elapsed)

114.00159311294556


In [9]:
# Loss of the fitted linear model, evaluated on the same standardized inputs
clc_predictions = lin_reg.predict(input_data_scaled)
lin_mse = mean_squared_error(y_true=output_data, y_pred=clc_predictions)

message = 'The mean squared error of the linear model is %.2f.' % lin_mse
print(message)

The mean squared error of the linear model is 401.47.


### Zero Output Model

In [10]:
# MSE of a model that always outputs 0: the mean of the squared targets (accumulated in float64)
np.mean(np.square(output_data), dtype=np.float64)

923.9371442759749

### Constant Output Model

In [11]:
# Best constant prediction is the mean of the targets ...
mean = np.mean(output_data, dtype=np.float64)

# ... and its MSE is the mean squared deviation of the targets from that mean
deviations = output_data - mean
np.mean(deviations**2, dtype=np.float64)

684.5114016334184

### Randomly initialized neural network

In [12]:
# L1/L2 penalty strengths applied to the kernel of every Dense layer
L1_STRENGTH = 0.004749
L2_STRENGTH = 0.008732


def _kernel_reg():
    """Return a fresh L1+L2 kernel regularizer, so every layer gets its own instance."""
    return l1_l2(l1=L1_STRENGTH, l2=L2_STRENGTH)


# Create the model: three hidden layers with 64 units each and a single linear output unit.
# A Dropout(0.221) after the second and after the third hidden layer is disabled (not part of the model).
model = Sequential([
    # First hidden layer
    Dense(units=64, activation='tanh', input_dim=no_of_features, kernel_regularizer=_kernel_reg()),
    # Second hidden layer, followed by batch normalization
    Dense(units=64, activation=nn.leaky_relu, kernel_regularizer=_kernel_reg()),
    BatchNormalization(),
    # Third hidden layer
    Dense(units=64, activation='tanh', kernel_regularizer=_kernel_reg()),
    # Output layer
    Dense(1, activation='linear', kernel_regularizer=_kernel_reg()),
])

# The model is only compiled in this cell, not fitted
model.compile(loss='mse', optimizer=Nadam())

In [13]:
# Predict with `model` on the standardized inputs, one batch at a time.
# NOTE(review): the previous comment here read "model_fold_3 is implemented in ICON-A";
# this cell only ever uses `model` - confirm whether that note still belongs here.
batch_size = 2**20
num_samples = input_data_scaled.shape[0]

# Stepping through the start offsets never yields an empty trailing batch, which
# range(1 + num_samples//batch_size) did whenever num_samples was a multiple of batch_size.
batch_predictions = []
for start in range(0, num_samples, batch_size):
    batch_predictions.append(model.predict_on_batch(input_data_scaled[start:start + batch_size]))
    # Kept from the original loop: reset Keras' global state and collect garbage after every batch
    K.clear_session()
    gc.collect()

# Concatenate once at the end instead of re-copying the growing result in every iteration
clc_predictions = np.concatenate(batch_predictions, axis=0)

In [14]:
# MSE of the randomly initialized network. It is stored under its own name so that the
# linear model's `lin_mse` is no longer overwritten by this cell.
# clc_predictions is 2-D here; column 0 holds the predictions.
nn_mse = mean_squared_error(output_data, clc_predictions[:, 0])
print('The mean squared error of the randomly initialized neural network is %.2f.'%nn_mse)

The mean squared error of the randomly initialized neural network is 913.91.


### Simplified Sundqvist function

input_data is unscaled

In [7]:
# Columns of the unscaled input array that are passed to the Sundqvist scheme
QV_COL, TEMP_COL, PRES_COL = 0, 3, 4

qv = input_data[:, QV_COL]
temp = input_data[:, TEMP_COL]
pres = input_data[:, PRES_COL]

In [27]:
t0 = time.time()

# Seeded generator: the random subsample, and therefore the MSE computed from it,
# is the same on every run of the notebook
rng = np.random.default_rng(seed=0)

# 0.001% of the data (indices are drawn with replacement from [0, samples_total))
ind = rng.integers(0, samples_total, size=samples_total//10**5)

# Entries will be in [0, 1]
sundqvist = [simple_sundqvist_scheme(qv[i], temp[i], pres[i], ps=101325) for i in ind]

# Wall-clock seconds spent on the scheme evaluation
time.time() - t0

367.98427391052246

In [36]:
# Scale the Sundqvist values by 100 before comparing them with output_data
# (presumably converting fractions in [0, 1] to percent - confirm), then take the MSE over the subsample
sundqvist_scaled = 100*np.array(sundqvist)
squared_errors = (output_data[ind] - sundqvist_scaled)**2
np.mean(squared_errors, dtype=np.float64)

773.5590553245462