 # Wave Map Model Building

In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLassoCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV

pd.options.plotting.backend = 'plotly'
np.random.seed(42)


In [2]:
# Load data
wave_data = pd.read_csv('data/wave_data.csv', header=[0,1], index_col=0, parse_dates=True)
weather_data = pd.read_csv('data/weather_data.csv', header=[0,1], index_col=0, parse_dates=True)


In [3]:
# Join data
data = pd.concat([wave_data, weather_data], axis=1, join='inner')


In [4]:
# Add squared wind speed
sq_dat = data.loc[:,(slice(None),'wind_speed')]**2
sq_dat.columns = pd.MultiIndex.from_tuples([(b, p+'_sq') for b, p in sq_dat.columns])
data = pd.concat([data, sq_dat], axis=1)


In [5]:
# Add lagging values
lagging_vars = ['10u', '10v', '2t', 'sp', 'wind_speed', 'wind_speed_sq']
lagging_times = [datetime.timedelta(hours=h) for h in range(1, 20)]
dat = [data]
for var in lagging_vars:
    for i, t in enumerate(lagging_times):
        var_dat = data.loc[:,(slice(None),var)]
        var_dat.index = var_dat.index + t
        var_dat.columns = pd.MultiIndex.from_tuples([(b, p+'_lag'+str(i+1)) for b, p in var_dat.columns])
        dat.append(var_dat)
data = pd.concat(dat, axis=1)


In [6]:
# Helper function
def describe_regression(pipe, coef_desc, target_desc):
    model = pipe.named_steps['regression']
    return pd.DataFrame(
        np.append(model.coef_, model.intercept_[:,np.newaxis], axis=1).T,
        index=np.append(coef_desc, 'intercept'), columns=target_desc)


 ## Train Test Split
 Use a randomly chosen one of the buoys as the test dataset.

In [7]:
num_buoys = data.columns.levshape[0]
test_buoy = 'pohjois-itaemeri'


In [8]:
test = data[test_buoy]
train = data.drop(test_buoy, axis=1)


 ## Simple Linear Regression
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 No Temporal Information.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp


 Outputs:
 * wave_u
 * wave_v

In [9]:
# Build input and output matrices
input_vars = ['10u', '10v', '2t', 'sp']
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [10]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.595


In [11]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.556


In [12]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.475594,0.115769
10v,0.007781,0.516111
2t,-0.038167,-0.007129
sp,-0.037664,-0.057895
intercept,0.233693,0.163399


 The linear regression gives the expected result: The are strong correlations between 10u-wave_u and 10v-wave_v.
 However, the accuracy of the model is not good.
 For some reason the train accuracy is lower than the test accuracy.

 ## Linear Regression with older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used four times, once with the current value, once lagging one hour and so on.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag3'


 Outputs:
 * wave_u
 * wave_v

In [13]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 4):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [14]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.641


In [15]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.604


In [16]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.152264,0.063697
10u_lag1,0.048364,0.071366
10u_lag2,-0.149572,-0.27666
10u_lag3,0.448331,0.249198
10v,0.004779,0.143884
10v_lag1,-0.071641,-0.004695
10v_lag2,0.144235,-0.115674
10v_lag3,-0.062859,0.531107
2t,-0.017008,0.044236
2t_lag1,-0.041182,-0.073338


 The regression model with lagging input variables shows increased accuracy.
 The largest influence on the model comes from the variables lagging the most, therefore longer lagging variables should be included.

 ## Linear Regression with even older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [17]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [18]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.655


In [19]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.622


In [20]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.154774,0.04623
10u_lag1,0.013989,-0.01212
10u_lag2,0.087653,0.012538
10u_lag3,0.039182,0.004978
10u_lag4,0.058142,0.014094
10u_lag5,0.039508,0.016381
10u_lag6,0.026929,-0.025992
10u_lag7,0.035218,0.049622
10u_lag8,-0.02643,-0.061856
10u_lag9,0.089721,0.068774


 The model yields basically the same accuracy as when using less lagging variables.
 Of the lagging variables again, the oldest is the one with the strongest correlation.
 It is not clear from this result how much lag should be included.

 Next we try introducing regularization.

 ## Ridge Regression
 Ridge regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [21]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [22]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', RidgeCV())])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=10.000
Train accuracy of the model: 0.655


In [23]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.622


In [24]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.150236,0.046001
10u_lag1,0.023088,-0.010495
10u_lag2,0.082001,0.011928
10u_lag3,0.040817,0.004881
10u_lag4,0.057162,0.01344
10u_lag5,0.040322,0.013524
10u_lag6,0.029715,-0.014167
10u_lag7,0.031809,0.036326
10u_lag8,-0.026349,-0.057475
10u_lag9,0.089709,0.068579


 The regularization does not seem to influence the performance much.

 ## Lasso Regression
 Lasso Regression for selecting important variables.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [25]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [26]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.653


In [27]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.614


In [28]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.162133,0.043003
10u_lag1,0.004937,0.000906
10u_lag2,0.090687,0.012512
10u_lag3,0.041544,0.004853
10u_lag4,0.056993,0.006488
10u_lag5,0.034403,0.004144
10u_lag6,0.034929,0.004829
10u_lag7,0.003076,0.000563
10u_lag8,0.0,0.0
10u_lag9,0.077058,0.026465


In [29]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag8', '10v_lag8', '2t', '2t_lag1', '2t_lag2', '2t_lag4',
       '2t_lag5', '2t_lag6', '2t_lag7', 'sp_lag1', 'sp_lag2', 'sp_lag3',
       'sp_lag4', 'sp_lag5', 'sp_lag6', 'sp_lag7', 'sp_lag8', 'sp_lag9'],
      dtype='object')

 The lasso regression shows that Surface Pressure and Temperature can be dropped all together.
 Next we try lasso with only the wind components.

 ## Lasso Regression - Wind Only
 Lasso Regression for selecting which wind lag is important.

 Inputs:
 * 10u
 * 10v
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [30]:
# Build input and output matrices
base_vars = ['10u', '10v']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [31]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.648


In [32]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.610


In [33]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.175595,0.048166
10u_lag1,0.0,0.0
10u_lag2,0.087636,0.011842
10u_lag3,0.042225,0.004767
10u_lag4,0.056002,0.006162
10u_lag5,0.033991,0.00398
10u_lag6,0.035848,0.004863
10u_lag7,0.000365,6.6e-05
10u_lag8,0.0,0.0
10u_lag9,0.08162,0.028389


In [34]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag1', '10u_lag8', '10v_lag8'], dtype='object')

 Not that much useful information.
 Why is the coefficient of the longest lag always that high?

 ## Lasso Regression - Wind Only - More Lag
 Lasso Regression for selecting which wind lag is important.

 Inputs:
 * 10u
 * 10v
 * repeating from '_lag2' '_lag4' to '_lag18'


 Outputs:
 * wave_u
 * wave_v

In [35]:
# Build input and output matrices
base_vars = ['10u', '10v']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l*2) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [36]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.649


In [37]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.611


In [38]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.170329,0.046799
10u_lag10,0.018964,0.006332
10u_lag12,0.018491,0.007826
10u_lag14,0.00033,0.000154
10u_lag16,0.0,0.0
10u_lag18,0.011031,0.005829
10u_lag2,0.109882,0.014795
10u_lag4,0.100721,0.011654
10u_lag6,0.054962,0.008425
10u_lag8,0.032531,0.007921


In [39]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag16', '10v_lag14', '10v_lag16', '10v_lag18'], dtype='object')

 Here the strongest influence seems to be from the parameters lagging by 2 and 4 hours.
 Will keep 2 and 4 hour lag.

 ## Ridge Regression - Wind only
 Ridge regression from Wind Vector, Wind Speed and Wind Speed Squared
 Lag for 2 hours and 4 hours.

 Inputs:
 * 10u
 * 10v
 * 10u_lag2
 * 10v_lag2
 * 10u_lag4
 * 10v_lag4


 Outputs:
 * wave_u
 * wave_v

In [40]:
# Build input and output matrices
base_vars = ['10u', '10v']
input_vars = base_vars.copy()
for l in [2, 4]:
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [41]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', RidgeCV())])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=10.000
Train accuracy of the model: 0.640


In [42]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.603


In [43]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.181888,0.100527
10u_lag2,0.035139,-0.116214
10u_lag4,0.294224,0.12765
10v,-0.027643,0.132095
10v_lag2,0.039646,0.067571
10v_lag4,0.005435,0.365886
intercept,0.233693,0.163399


 ## Ridge Regression - Wind only - More Features
 Ridge regression from Wind Vector, Wind Speed and Wind Speed Squared
 Lag for 2 hours and 4 hours.

 Inputs:
 * 10u
 * 10v
 * wind_speed
 * wind_speed_sq
 * 10u_lag2
 * 10v_lag2
 * wind_speed_lag2
 * wind_speed_sq_lag2
 * 10u_lag4
 * 10v_lag4
 * wind_speed_lag4
 * wind_speed_sq_lag4


 Outputs:
 * wave_u
 * wave_v

In [44]:
# Build input and output matrices
base_vars = ['10u', '10v', 'wind_speed', 'wind_speed_sq']
input_vars = base_vars.copy()
for l in [2, 4]:
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [45]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', RidgeCV())])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=10.000
Train accuracy of the model: 0.659


In [46]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.627


In [47]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.132618,0.102788
10u_lag2,0.051085,-0.12078
10u_lag4,0.295503,0.12662
10v,-0.016676,0.132647
10v_lag2,0.042303,0.071433
10v_lag4,-0.015191,0.361553
wind_speed,-0.096793,-0.028199
wind_speed_lag2,-0.067065,0.026061
wind_speed_lag4,-0.06066,0.056297
wind_speed_sq,0.157682,-0.003454


 Including the wind speed and the squared wind speed does improve the performance.

 ## Support Vector Machine
 Support vector regression from Wind Vector, Wind Speed and Wind Speed Squared
 Lag for 2 hours and 4 hours.

 Inputs:
 * 10u
 * 10v
 * wind_speed
 * wind_speed_sq
 * 10u_lag2
 * 10v_lag2
 * wind_speed_lag2
 * wind_speed_sq_lag2
 * 10u_lag4
 * 10v_lag4
 * wind_speed_lag4
 * wind_speed_sq_lag4


 Outputs:
 * wave_u
 * wave_v

In [48]:
# Build input and output matrices
base_vars = ['10u', '10v', 'wind_speed', 'wind_speed_sq']
input_vars = base_vars.copy()
for l in [2, 4]:
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [49]:
# Train model
pipe_u = Pipeline([('scaler', StandardScaler()), ('regression', LinearSVR())])
pipe_v = Pipeline([('scaler', StandardScaler()), ('regression', LinearSVR())])
pipe_u.fit(X_train, Y_train[:,0])
pipe_v.fit(X_train, Y_train[:,1])
print('Chosen regularization parameter (u): C={:.3f}'.format(pipe_u.named_steps['regression'].C))
print('Chosen regularization parameter (v): C={:.3f}'.format(pipe_v.named_steps['regression'].C))
print('Train accuracy of the model (u): {:.3f}'.format(pipe_u.score(X_train, Y_train[:,0])))
print('Train accuracy of the model (v): {:.3f}'.format(pipe_v.score(X_train, Y_train[:,1])))


Chosen regularization parameter (u): C=1.000
Chosen regularization parameter (v): C=1.000
Train accuracy of the model (u): 0.654
Train accuracy of the model (v): 0.610


In [50]:
# Test model
print('Test accuracy of the model (u): {:.3f}'.format(pipe_u.score(X_test, Y_test[:,0])))
print('Test accuracy of the model (v): {:.3f}'.format(pipe_v.score(X_test, Y_test[:,1])))

Test accuracy of the model (u): 0.684
Test accuracy of the model (v): 0.537
