 # Wave Map Model Building

In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

pd.options.plotting.backend = 'plotly'
np.random.seed(42)


In [2]:
# Load data
wave_data = pd.read_csv('data/wave_data.csv', header=[0,1], index_col=0, parse_dates=True)
weather_data = pd.read_csv('data/weather_data.csv', header=[0,1], index_col=0, parse_dates=True)


In [3]:
# Join data
data = pd.concat([wave_data, weather_data], axis=1, join='inner')


In [4]:
# Add lagging values
lagging_vars = ['10u', '10v', '2t', 'sp']
lagging_times = [datetime.timedelta(hours=h) for h in range(1, 4)]
dat = [data]
for var in lagging_vars:
    for i, t in enumerate(lagging_times):
        var_dat = data.loc[:,(slice(None),var)]
        var_dat.index = var_dat.index + t
        var_dat.columns = pd.MultiIndex.from_tuples([(b, p+'_lag'+str(i+1)) for b, p in var_dat.columns])
        dat.append(var_dat)
data = pd.concat(dat, axis=1)


In [5]:
# Helper function
def describe_regression(pipe, coef_desc, target_desc):
    model = pipe.named_steps['regression']
    return pd.DataFrame(
        np.append(model.coef_, model.intercept_[:,np.newaxis], axis=1).T,
        index=np.append(coef_desc, 'intercept'), columns=target_desc)


 ## Train Test Split
 Use a randomly chosen one of the buoys as the test dataset.

In [6]:
num_buoys = data.columns.levshape[0]
test_buoy = data.columns.get_level_values(0)[np.random.randint(0, num_buoys)]


In [7]:
test = data[test_buoy]
train = data.drop(test_buoy, axis=1)


 ## Simple Linear Regression
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 No Temporal Information.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp


 Outputs:
 * wave_u
 * wave_v

In [8]:
# Build input and output matrices
input_vars = ['10u', '10v', '2t', 'sp']
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [9]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.591


In [10]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.673


In [11]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.5699,0.172874
10v,-0.007967,0.592913
2t,-0.046752,-0.010967
sp,-0.034793,-0.107187
intercept,0.294814,0.253362


 The linear regression gives the expected result: The are strong correlations between 10u-wave_u and 10v-wave_v.
 However, the accuracy of the model is not good.
 For some reason the train accuracy is lower than the test accuracy.

 ## Linear Regression with older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used four times, once with the current value, once lagging one hour and so on.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp


 Outputs:
 * wave_u
 * wave_v

In [12]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(3):
    input_vars.extend([v + '_lag' + str(l+1) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [13]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.638


In [14]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.730


In [15]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.169834,0.083032
10u_lag1,0.0534,0.060775
10u_lag2,-0.207125,-0.323566
10u_lag3,0.583304,0.345466
10v,-0.02928,0.167502
10v_lag1,-0.08423,-0.009404
10v_lag2,0.188919,-0.143874
10v_lag3,-0.079525,0.633358
2t,0.113203,-0.00409
2t_lag1,-0.139367,-0.023063


 The regression model with lagging input variables shows increased accuracy.
 The largest influence on the model comes from the variables lagging the most, therefore longer lagging variables should be included.