 # Wave Map Model Building

In [1]:
import datetime
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pd.options.plotting.backend = 'plotly'
np.random.seed(42)


In [2]:
# Load data
wave_data = pd.read_csv('data/wave_data.csv', header=[0,1], index_col=0, parse_dates=True)
weather_data = pd.read_csv('data/weather_data.csv', header=[0,1], index_col=0, parse_dates=True)


In [3]:
# Join data
data = pd.concat([wave_data, weather_data], axis=1, join='inner')


In [4]:
# Add lagging values
lagging_vars = ['10u', '10v', '2t', 'sp']
lagging_times = [datetime.timedelta(hours=h) for h in range(1, 20)]
dat = [data]
for var in lagging_vars:
    for i, t in enumerate(lagging_times):
        var_dat = data.loc[:,(slice(None),var)]
        var_dat.index = var_dat.index + t
        var_dat.columns = pd.MultiIndex.from_tuples([(b, p+'_lag'+str(i+1)) for b, p in var_dat.columns])
        dat.append(var_dat)
data = pd.concat(dat, axis=1)


In [5]:
# Helper function
def describe_regression(pipe, coef_desc, target_desc):
    model = pipe.named_steps['regression']
    return pd.DataFrame(
        np.append(model.coef_, model.intercept_[:,np.newaxis], axis=1).T,
        index=np.append(coef_desc, 'intercept'), columns=target_desc)


 ## Train Test Split
 Use a randomly chosen one of the buoys as the test dataset.

In [6]:
num_buoys = data.columns.levshape[0]
test_buoy = data.columns.get_level_values(0)[np.random.randint(0, num_buoys)]


In [7]:
test = data[test_buoy]
train = data.drop(test_buoy, axis=1)


 ## Simple Linear Regression
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 No Temporal Information.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp


 Outputs:
 * wave_u
 * wave_v

In [8]:
# Build input and output matrices
input_vars = ['10u', '10v', '2t', 'sp']
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [9]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.591


In [10]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.673


In [11]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.568532,0.172455
10v,-0.007847,0.589692
2t,-0.047344,-0.011097
sp,-0.033673,-0.108037
intercept,0.294507,0.254345


 The linear regression gives the expected result: The are strong correlations between 10u-wave_u and 10v-wave_v.
 However, the accuracy of the model is not good.
 For some reason the train accuracy is lower than the test accuracy.

 ## Linear Regression with older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used four times, once with the current value, once lagging one hour and so on.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag3'


 Outputs:
 * wave_u
 * wave_v

In [12]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 4):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [13]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.637


In [14]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.730


In [15]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.172373,0.080777
10u_lag1,0.04818,0.062795
10u_lag2,-0.204726,-0.325007
10u_lag3,0.581991,0.346611
10v,-0.028201,0.162692
10v_lag1,-0.086538,-0.005641
10v_lag2,0.19376,-0.146321
10v_lag3,-0.082544,0.633673
2t,0.121985,-0.008177
2t_lag1,-0.146604,-0.022737


 The regression model with lagging input variables shows increased accuracy.
 The largest influence on the model comes from the variables lagging the most, therefore longer lagging variables should be included.

 ## Linear Regression with even older inputs
 Linear regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [16]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [17]:
# Train model
pipe = Pipeline([('scaler', StandardScaler()), ('regression', LinearRegression())])
pipe.fit(X_train, Y_train)
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Train accuracy of the model: 0.654


In [18]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.738


In [19]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.174789,0.060686
10u_lag1,0.006992,-0.036485
10u_lag2,0.097512,0.053398
10u_lag3,0.052571,0.013026
10u_lag4,0.080103,-0.00989
10u_lag5,0.029568,0.032018
10u_lag6,0.044174,-0.027786
10u_lag7,0.039916,0.06172
10u_lag8,-0.045728,-0.108788
10u_lag9,0.143979,0.137901


 The model yields basically the same accuracy as when using less lagging variables.
 Of the lagging variables again, the oldest is the one with the strongest correlation.
 It is not clear from this result how much lag should be included.

 Next we try introducing regularization.

 ## Ridge Regression
 Ridge regression from Wind Vector, Temperature and Pressure to Wave Vector.
 Each input variable is used nine times, once with the current value, once lagging one hour and so on.
 The goal here is to see how old the variables with the strongest correlation are.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [20]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [21]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', RidgeCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=48.329
Train accuracy of the model: 0.654


In [22]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.738


In [23]:
# Model coefficiens
describe_regression(pipe, input_vars, output_vars)


Unnamed: 0,wave_u,wave_v
10u,0.163544,0.05392
10u_lag1,0.033729,-0.015842
10u_lag2,0.076866,0.038112
10u_lag3,0.061939,0.012271
10u_lag4,0.07127,-0.002256
10u_lag5,0.03907,0.020414
10u_lag6,0.045145,0.000492
10u_lag7,0.025047,0.015996
10u_lag8,-0.026632,-0.073775
10u_lag9,0.133395,0.126083


 The regularization does not seem to influence the performance much.

 ## Lasso Regression
 Lasso Regression for selecting important variables.

 Inputs:
 * 10u
 * 10v
 * 2t
 * sp
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [24]:
# Build input and output matrices
base_vars = ['10u', '10v', '2t', 'sp']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [25]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.652


In [26]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.745


In [27]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.186117,0.056367
10u_lag1,0.0,0.0
10u_lag2,0.100976,0.019986
10u_lag3,0.052667,0.00873
10u_lag4,0.068059,0.009939
10u_lag5,0.035104,0.005485
10u_lag6,0.044011,0.007735
10u_lag7,0.0,0.0
10u_lag8,0.0,0.0
10u_lag9,0.122653,0.05999


In [28]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag1', '10u_lag7', '10u_lag8', '10v_lag8', '2t', '2t_lag1',
       '2t_lag2', '2t_lag5', '2t_lag9', 'sp_lag1', 'sp_lag2', 'sp_lag3',
       'sp_lag4', 'sp_lag5', 'sp_lag6', 'sp_lag7', 'sp_lag8', 'sp_lag9'],
      dtype='object')

 The lasso regression shows that Surface Pressure and Temperature can be dropped all together.
 Next we try lasso with only the wind components.

 ## Lasso Regression - Wind Only
 Lasso Regression for selecting which wind lag is important.

 Inputs:
 * 10u
 * 10v
 * repeating from '_lag1' to '_lag9'


 Outputs:
 * wave_u
 * wave_v

In [29]:
# Build input and output matrices
base_vars = ['10u', '10v']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [30]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.646


In [31]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.744


In [32]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.198835,0.066719
10u_lag1,0.0,0.0
10u_lag2,0.092605,0.018071
10u_lag3,0.055665,0.00885
10u_lag4,0.065933,0.009085
10u_lag5,0.037232,0.005498
10u_lag6,0.039858,0.006675
10u_lag7,0.0,0.0
10u_lag8,0.0,0.0
10u_lag9,0.127936,0.064991


In [33]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag1', '10u_lag7', '10u_lag8', '10v_lag8'], dtype='object')

 Not that much useful information.
 Why is the coefficient of the longest lag always that high?

 ## Lasso Regression - Wind Only - Shorter Lag
 Lasso Regression for selecting which wind lag is important.

 Inputs:
 * 10u
 * 10v
 * repeating from '_lag2' '_lag4' to '_lag18'


 Outputs:
 * wave_u
 * wave_v

In [34]:
# Build input and output matrices
base_vars = ['10u', '10v']
input_vars = base_vars.copy()
for l in range(1, 10):
    input_vars.extend([v + '_lag' + str(l*2) for v in base_vars])
input_vars.sort()
output_vars = ['wave_u', 'wave_v']

test_dropped = test.dropna()
X_test = test_dropped[input_vars].values
Y_test = test_dropped[output_vars].values

X_train = None
Y_train = None
for i, buoy in enumerate(train.columns.levels[0].drop(test_buoy)):
    buoy_dat = train[buoy].dropna()
    if X_train is None:
        X_train = buoy_dat[input_vars].values
        Y_train = buoy_dat[output_vars].values
    else:
        X_train = np.concatenate([X_train, buoy_dat[input_vars].values])
        Y_train = np.concatenate([Y_train, buoy_dat[output_vars].values])


In [35]:
# Train model
alphas = np.logspace(-2, 5, num=20)
pipe = Pipeline([('scaler', StandardScaler()), ('regression', MultiTaskLassoCV(alphas=alphas))])
pipe.fit(X_train, Y_train)
print('Chosen regularization parameter: alpha={:.3f}'.format(pipe.named_steps['regression'].alpha_))
print('Train accuracy of the model: {:.3f}'.format(pipe.score(X_train, Y_train)))


Chosen regularization parameter: alpha=0.010
Train accuracy of the model: 0.648


In [36]:
# Test model
print('Test accuracy of the model: {:.3f}'.format(pipe.score(X_test, Y_test)))


Test accuracy of the model: 0.743


In [37]:
# Model coefficiens
coef = describe_regression(pipe, input_vars, output_vars)
coef


Unnamed: 0,wave_u,wave_v
10u,0.190944,0.063493
10u_lag10,0.024989,0.010854
10u_lag12,0.020203,0.011042
10u_lag14,0.006489,0.004587
10u_lag16,0.0,0.0
10u_lag18,0.026579,0.022068
10u_lag2,0.122979,0.023852
10u_lag4,0.118203,0.01846
10u_lag6,0.067478,0.014322
10u_lag8,0.048293,0.015843


In [38]:
# Unneeded variables
coef[(coef['wave_u'] == 0) & (coef['wave_v'] == 0)].index


Index(['10u_lag16', '10v_lag16', '10v_lag18'], dtype='object')

 Here the strongest influence seems to be from the parameters lagging by 2 and 4 hours.
 Will keep 2 and 4 hour lag.