## Multiple linear regression

**For Table 3 of the paper**

Cell-based NARVAL R2B4 model

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras import backend as K
import gc
import numpy as np
import pandas as pd
import importlib
import os
import sys

from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Hard-coded, machine-specific root directory; everything else is derived from it
base_path = '/pf/b/b309170'
# Directory holding the .npy arrays that are loaded further down
path_data = ''.join([base_path, '/my_work/icon-ml_data/cloud_cover_parameterization/grid_cell_based_v3/based_on_var_interpolated_data'])

# Put the directory that contains my_classes first on the module search path
sys.path.insert(0, ''.join([base_path, '/workspace_icon-ml/cloud_cover_parameterization/']))

import my_classes
importlib.reload(my_classes)
from my_classes import simple_sundqvist_scheme
from my_classes import write_infofile
from my_classes import load_data

import matplotlib.pyplot as plt
import time

# Integer suffix of the .npy data files loaded below ('..._%d.npy' % NUM)
NUM = 1

In [2]:
# The data is already normalized (w.r.t. training data)
def _load_array(file_template, **load_kwargs):
    """Load one array from path_data; `file_template` contains a %d slot for NUM."""
    return np.load(path_data + file_template%NUM, **load_kwargs)

# Only the training inputs are opened as a read-only memory map;
# all other arrays are read into memory completely.
input_train = _load_array('/cloud_cover_all_days_input_train_%d.npy', mmap_mode='r')
input_valid = _load_array('/cloud_cover_all_days_input_valid_%d.npy')
input_test = _load_array('/cloud_cover_all_days_input_test_%d.npy')
output_train = _load_array('/cloud_cover_all_days_output_train_%d.npy')
output_valid = _load_array('/cloud_cover_all_days_output_valid_%d.npy')
output_test = _load_array('/cloud_cover_all_days_output_test_%d.npy')

### Training the multiple linear model on the training set only
--> Yields an MSE of 81.60 on the training set and 82.00 on the combined validation + test set. <br>
--> The two MSEs are so similar that it does not make sense to split the data set

In [6]:
# Stack the validation and test splits (along the sample axis) into one held-out set
input_valid_test, output_valid_test = (
    np.concatenate(split_pair, axis=0)
    for split_pair in ((input_valid, input_test), (output_valid, output_test))
)

Fit on: input_train -> output_train <br>
Evaluate on: input_valid_test -> output_valid_test

In [8]:
# Time the fit with a monotonic high-resolution clock; time.time() follows the
# system wall clock and can jump if that clock is adjusted during the run.
t0 = time.perf_counter()

# The optimal multiple linear regression model (least-squares fit),
# trained on the training split only
lin_reg = LinearRegression()
lin_reg.fit(input_train, output_train)

# Elapsed fitting time in seconds
print(time.perf_counter() - t0)

4.183831214904785


In [9]:
# Loss of the fitted linear model on the training split
clc_predictions = lin_reg.predict(input_train)
lin_mse = mean_squared_error(y_true=output_train, y_pred=clc_predictions)
print('The mean squared error of the linear model is %.2f.' % (lin_mse,))

The mean squared error of the linear model is 81.60.


In [10]:
# Loss of the fitted linear model on the held-out data
# (validation and test splits combined)
clc_predictions = lin_reg.predict(input_valid_test)
lin_mse = mean_squared_error(y_true=output_valid_test, y_pred=clc_predictions)
print('The mean squared error of the linear model is %.2f.' % (lin_mse,))

The mean squared error of the linear model is 82.00.


### Training the multiple linear model on the entire data set

In [3]:
# Stack training, validation and test splits (along the sample axis)
# into one data set covering all samples
input_data, output_data = (
    np.concatenate(splits, axis=0)
    for splits in (
        (input_train, input_valid, input_test),
        (output_train, output_valid, output_test),
    )
)

In [23]:
# Time the fit with a monotonic high-resolution clock; time.time() follows the
# system wall clock and can jump if that clock is adjusted during the run.
t0 = time.perf_counter()

# The optimal multiple linear regression model (least-squares fit),
# trained on the entire data set
lin_reg = LinearRegression()
lin_reg.fit(input_data, output_data)

# Elapsed fitting time in seconds
print(time.perf_counter() - t0)

4.8166115283966064


In [25]:
# Loss of the linear model on the same (entire) data set it was fitted on
clc_predictions = lin_reg.predict(input_data)
lin_mse = mean_squared_error(y_true=output_data, y_pred=clc_predictions)
print('The mean squared error of the linear model is %.2f.' % (lin_mse,))

The mean squared error of the linear model is 81.71.


### Zero Output Model

In [16]:
# MSE of a model that always predicts 0: the mean of the squared targets,
# accumulated in double precision
np.mean(np.square(output_data), dtype=np.float64)

129.6262720267197

### Constant Output Model

In [17]:
# MSE of a model that always predicts the mean of all targets
mean = np.mean(output_data, dtype=np.float64)
np.mean(np.square(output_data - mean), dtype=np.float64)

109.63230156266818

### Randomly initialized neural network

In [20]:
# Untrained baseline network: 6 inputs, two hidden ReLU layers of 256 units,
# one linear output unit. It is compiled but never fitted, so its predictions
# come from the initial weights only.
# NOTE(review): no random seed is set in the visible cells, so the resulting
# MSE will presumably differ between runs.
model = Sequential([
    Dense(256, activation='relu', input_dim = 6),
    Dense(256, activation='relu'),
    Dense(1, activation='linear'),
])
model.compile(loss='mse', optimizer=Nadam())

In [36]:
# Run the network over the whole data set in chunks of batch_size samples.
# NOTE(review): the original comment here read "model_fold_3 is implemented in
# ICON-A"; as far as the visible cells show, the `model` evaluated in this cell
# is the randomly initialized network built above, not a trained fold.
batch_size = 2**20

n_samples = input_data.shape[0]
batch_predictions = []
# Stepping through the start offsets never produces an empty trailing batch,
# even when n_samples is an exact multiple of batch_size.
for start in range(0, n_samples, batch_size):
    batch_predictions.append(model.predict_on_batch(input_data[start:start + batch_size]))
    # Kept from the original loop, presumably to limit memory growth between
    # batches. NOTE(review): clear_session() runs while `model` is still in
    # use -- confirm that it is needed and safe in the TensorFlow version used.
    K.clear_session()
    gc.collect()

# Concatenate once at the end instead of re-copying the growing result array
# on every iteration.
clc_predictions = np.concatenate(batch_predictions, axis=0)

In [45]:
# Compare column 0 of the network predictions with the targets. The result is
# still stored under the name lin_mse although it belongs to the neural network.
lin_mse = mean_squared_error(y_true=output_data, y_pred=clc_predictions[:, 0])
print('The mean squared error of the randomly initialized neural network is %.2f.' % (lin_mse,))

The mean squared error of the randomly initialized neural network is 131.07.


### Simplified Sundqvist function

In [4]:
# Per-feature means and standard deviations used to undo the input
# normalization (value * std + mean). Only entries 0, 2 and 3 are used by the
# reverse-scaling cell that follows (for qv, temp and pres respectively).
# NOTE(review): hard-coded here; presumably these are the training-set
# statistics that the stored arrays were normalized with -- confirm.
means = np.array([5.37518440e-03, 4.65389731e-07, 2.59635412e+02, 5.52329389e+04,
 6.79260772e+03, 2.58097095e-01])
stds = np.array([6.01943993e-03, 3.95009930e-06, 3.55940285e+01, 3.26642242e+04,
 6.20726361e+03, 4.28313535e-01])

In [5]:
# Undo the normalization (value * std + mean) for the three input columns
# that are passed to the Sundqvist scheme: columns 0, 2 and 3
qv, temp, pres = (
    input_data[:, col]*stds[col] + means[col]
    for col in (0, 2, 3)
)

In [6]:
# Time the evaluation with a monotonic high-resolution clock; time.time()
# follows the system wall clock and can jump if that clock is adjusted.
t0 = time.perf_counter()

# Evaluate the scheme sample by sample. Entries will be in [0, 1]
# NOTE(review): ps=101325 is presumably a fixed surface pressure in Pa -- confirm.
# NOTE(review): this per-sample Python loop took several minutes in the recorded
# run; if simple_sundqvist_scheme accepts arrays, one vectorized call would be
# much faster -- confirm against my_classes.
sundqvist = [
    simple_sundqvist_scheme(qv_i, temp_i, pres_i, ps=101325)
    for qv_i, temp_i, pres_i in zip(qv, temp, pres)
]

# Elapsed time in seconds (displayed as the cell output)
time.perf_counter() - t0

253.455313205719

In [9]:
# MSE of the Sundqvist scheme: its output is multiplied by 100 before being
# compared with the targets; the mean is accumulated in double precision
np.mean(np.square(output_data - 100*np.array(sundqvist)), dtype=np.float64)

85.18675142002722