 # Wave Map Model Building

In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.pipeline import Pipeline

pd.options.plotting.backend = 'plotly'
np.random.seed(42)


In [2]:
# Load data
wave_data = pd.read_csv('data/wave_data.csv', header=[0,1], index_col=0, parse_dates=True)
weather_data = pd.read_csv('data/weather_data.csv', header=[0,1], index_col=0, parse_dates=True)


In [3]:
# Join data
data = pd.concat([wave_data, weather_data], axis=1, join='inner')


In [4]:
# Add lagging values
lagging_vars = ['10u', '10v', '2t', 'sp']
lagging_times = [datetime.timedelta(hours=h) for h in range(1, 10)]
dat = [data]
for var in lagging_vars:
    for i, t in enumerate(lagging_times):
        var_dat = data.loc[:,(slice(None),var)]
        var_dat.index = var_dat.index + t
        var_dat.columns = pd.MultiIndex.from_tuples([(b, p+'_lag'+str(i+1)) for b, p in var_dat.columns])
        dat.append(var_dat)
data = pd.concat(dat, axis=1)


In [5]:
# Helper function
def describe_regression(pipe, coef_desc, target_desc):
    model = pipe.named_steps['regression']
    return pd.DataFrame(
        np.append(model.coef_, model.intercept_[:,np.newaxis], axis=1).T,
        index=np.append(coef_desc, 'intercept'), columns=target_desc)


 ## Train Test Split
 Use a randomly chosen one of the buoys as the test dataset.

In [6]:
num_buoys = data.columns.levshape[0]
test_buoy = data.columns.get_level_values(0)[np.random.randint(0, num_buoys)]


In [7]:
test = data[test_buoy]
train = data.drop(test_buoy, axis=1)


 ## Simple Linear Regression
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 No Temporal Information.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp


 Outputs:
 * wave_u
 * wave_v

In [8]:
# Build input and output matrices
input_vars = ['10u', '10v', '2t', 'sp']
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [9]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.591


In [10]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.673


In [11]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.569577,0.172658
10v,-0.008136,0.592012
2t,-0.04692,-0.011057
sp,-0.034271,-0.107311
intercept,0.294546,0.253399


 The linear regression gives the expected result: The are strong correlations between 10u-wave_u and 10v-wave_v.
 However, the accuracy of the model is not good.
 For some reason the train accuracy is lower than the test accuracy.

 ## Linear Regression with older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used four times, once with the current value, once lagging one hour and so on.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag3'


 Outputs:
 * wave_u
 * wave_v

In [12]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 4):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [13]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.638


In [14]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.730


In [15]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.171629,0.080945
10u_lag1,0.05007,0.063023
10u_lag2,-0.205431,-0.323277
10u_lag3,0.582713,0.34486
10v,-0.028701,0.167182
10v_lag1,-0.084883,-0.008029
10v_lag2,0.190862,-0.144826
10v_lag3,-0.081156,0.632432
2t,0.121003,-0.00838
2t_lag1,-0.144914,-0.026489


 The regression model with lagging input variables shows increased accuracy.
 The largest influence on the model comes from the variables lagging the most, therefore longer lagging variables should be included.

 ## Linear Regression with even older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [16]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [17]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.654


In [18]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.738


In [19]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.173792,0.061769
10u_lag1,0.009187,-0.035854
10u_lag2,0.096202,0.052847
10u_lag3,0.055178,0.012649
10u_lag4,0.077528,-0.006156
10u_lag5,0.03124,0.027985
10u_lag6,0.042818,-0.025192
10u_lag7,0.04117,0.060008
10u_lag8,-0.045357,-0.108394
10u_lag9,0.143245,0.1361


 The model yields basically the same accuracy as when using less lagging variables.
 Of the lagging variables again, the oldest is the one with the strongest correlation.
 It is not clear from this result how much lag should be included.

 Next we try introducing regularization.

 ## Ridge Regression
 Ridge regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [20]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [21]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', RidgeCV())])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=10.000
Train accuracy of the model: 0.654


In [22]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.738


In [23]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.170686,0.060789
10u_lag1,0.017389,-0.031049
10u_lag2,0.08942,0.048478
10u_lag3,0.057503,0.012712
10u_lag4,0.075512,-0.005133
10u_lag5,0.033266,0.024524
10u_lag6,0.046137,-0.013096
10u_lag7,0.035948,0.04564
10u_lag8,-0.043257,-0.102086
10u_lag9,0.142259,0.134878


 The regularization does not seem to influence the performance much.