In [None]:
import os

import h5py
import numpy as np

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Location of the HDF5 training data. Set the TRAIN_DATA_H5 environment
# variable to run the notebook against a copy stored elsewhere; the original
# shared path remains the default, so existing runs are unaffected.
sourcefile = os.environ.get(
    'TRAIN_DATA_H5',
    '/work/ka1176/shared_data/2020-03/dev_data/dev_data_random_all_map/train_data.h5',
)

In [None]:
# Read the complete 'ddm_timestamp_unix' dataset into memory. The context
# manager closes the HDF5 file as soon as the copy has been made; the previous
# one-liner left the file handle open.
with h5py.File(sourcefile, 'r') as h5_file:
    ddm_timestamp_unix = h5_file['ddm_timestamp_unix'][:]

In [None]:
# Whole days elapsed since the first record in the file. The divisors turn
# seconds into days (assumes the timestamps are in seconds - TODO confirm);
# astype(int) drops the fractional part of each day.
elapsed_seconds = ddm_timestamp_unix - ddm_timestamp_unix[0]
ddm_timestamp_day = (elapsed_seconds / 24 / 3600).astype(int)

In [None]:
# Thresholds that separate the train / validation / test periods.
# Despite the `ix_` prefix these values are compared against
# `ddm_timestamp_day`, i.e. they are day numbers and not sample indices.
# Quantile-based alternatives that were tried, kept for reference:
#   int(np.quantile(ddm_timestamp_day, 0.585))   # validation start
#   int(np.quantile(ddm_timestamp_day, 0.65))    # test start
ix_start_valid, ix_start_test = 130, 210

In [None]:
# Number of samples per day, with the first validation day and the first test
# day marked. One bin per day, with edges derived from the data rather than a
# fixed 0..249 range. axvline spans the full axis height, so no hard-coded
# y-extent is needed.
day_bins = np.arange(ddm_timestamp_day.min(), ddm_timestamp_day.max() + 2)
plt.hist(ddm_timestamp_day, bins=day_bins)
plt.axvline(ix_start_valid, color='k', linestyle='--', label='validation start')
plt.axvline(ix_start_test, color='k', linestyle=':', label='test start')
plt.xlabel('days since first sample')
plt.ylabel('number of samples')
plt.legend();

In [None]:
# Number of samples drawn for the training, validation and test subsets.
N_train, N_valid, N_test = 60_000, 10_000, 15_000

In [None]:
def _sample_indices(in_split, n_samples, rng):
    """Draw distinct sample positions from the entries selected by a mask.

    Parameters
    ----------
    in_split : boolean array with one entry per sample in the dataset.
    n_samples : number of positions to draw (without replacement).
    rng : np.random.RandomState used for the draw.

    Returns
    -------
    Sorted 1-D integer array of positions into the full dataset, all of them
    satisfying `in_split`.

    Raises
    ------
    ValueError
        If fewer than `n_samples` entries of `in_split` are True.
    """
    candidates = np.flatnonzero(in_split)
    if n_samples > candidates.size:
        raise ValueError(
            f'requested {n_samples} samples but only {candidates.size} are available'
        )
    return np.sort(rng.choice(candidates, n_samples, replace=False))


# Fixed seed so that the split is identical on every Restart & Run All.
split_rng = np.random.RandomState(42)

# Membership of every sample in the three consecutive time periods.
in_train = ddm_timestamp_day < ix_start_valid
in_valid = (ddm_timestamp_day >= ix_start_valid) & (ddm_timestamp_day < ix_start_test)
in_test = ddm_timestamp_day >= ix_start_test

# Sample actual positions from each period. The previous version drew from
# range(count) and then added the *day* threshold (130 / 210) as if it were a
# sample offset, so the validation and test indices landed among the first
# rows of the file and overlapped the training indices.
ix_train = _sample_indices(in_train, N_train, split_rng)
ix_valid = _sample_indices(in_valid, N_valid, split_rng)
ix_test = _sample_indices(in_test, N_test, split_rng)

In [None]:
%%time
windspeed = h5py.File(sourcefile, 'r')['windspeed'][:]
windspeed_train = windspeed[ix_train]
windspeed_valid = windspeed[ix_valid]
windspeed_test  = windspeed[ix_test]

In [None]:
# Possible source of error: the validation-set wind speed distribution is
# skewed with respect to the training-set distribution.
# Every curve carries a label so the three overlaid distributions can be told
# apart in the legend.
# NOTE: sns.distplot is deprecated since seaborn 0.11; switch to
# sns.histplot / sns.kdeplot once the environment provides them.
for split_name, split_windspeed in (('train', windspeed_train),
                                    ('valid', windspeed_valid),
                                    ('test', windspeed_test)):
    sns.distplot(split_windspeed, label=split_name)
plt.xlabel('wind speed')
plt.legend();

In [None]:
%%time
brcs = h5py.File(sourcefile, 'r')['brcs'][:]
brcs_train = brcs[ix_train]
brcs_valid = brcs[ix_valid]
brcs_test  = brcs[ix_test]
del windspeed, brcs

In [None]:
## Linear regression

From `brcs` we extract the quantity `sigma0` (computed below as the maximum of each BRCS array), which is related to the wind speed. The relation is exponential, so to fit a linear model we take the logarithm of `sigma0` and fit

$y = a \cdot \log(\sigma_0) + b$, where $y$ is the wind speed.

In [None]:
# Reduce every BRCS sample to one value: its maximum over axes 1 and 2.
sigma0_train, sigma0_valid, sigma0_test = (
    split_brcs.max(axis=(1, 2))
    for split_brcs in (brcs_train, brcs_valid, brcs_test)
)

In [None]:
# 2-D density of wind speed against log(sigma0) for the training subset.
plt.hexbin(x=np.log(sigma0_train), y=windspeed_train)
plt.ylabel('wind speed')
plt.xlabel('log(sigma0)');

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Least-squares fit of wind speed as a linear function of log(sigma0).
# The feature is reshaped to a single column because scikit-learn estimators
# take a 2-D (n_samples, n_features) input.
X_train = np.log(sigma0_train).reshape(-1, 1)
lreg = LinearRegression()
lreg.fit(X_train, windspeed_train)

In [None]:
# Model score (R^2 for LinearRegression) on the train, validation and test
# subsets, printed in that order.
for split_sigma0, split_windspeed in ((sigma0_train, windspeed_train),
                                      (sigma0_valid, windspeed_valid),
                                      (sigma0_test, windspeed_test)):
    print(lreg.score(np.log(split_sigma0).reshape(-1, 1), split_windspeed))

In [None]:
# Wind speed predicted by the fitted model for the test subset.
X_test = np.log(sigma0_test).reshape(-1, 1)
y_pred = lreg.predict(X_test)

In [None]:
# Predicted against true wind speed on the test subset, with the identity
# line (perfect prediction) drawn as a red dotted reference.
plt.hexbin(windspeed_test, y_pred)
plt.colorbar(label='Bin count')
ax = plt.gca()
ax.set_xlabel('true wind speed')
ax.set_ylabel('predicted wind speed')
ax.set_aspect('equal')
identity = range(0, 15)
ax.plot(identity, identity, 'r:');

In [None]:
# Show the fitted slope a and intercept b of y = a * log(sigma0) + b.
(lreg.coef_, lreg.intercept_)

In [None]:
# Root-mean-square error of the linear model on the test subset.
test_residuals = y_pred - windspeed_test
rmse_model = np.sqrt(np.mean(test_residuals**2))
print(f'RMSE: {rmse_model:.2f} m/s')

In [None]:
# Baseline for comparison: the RMSE obtained by always predicting the mean
# wind speed of the test subset.
baseline_residuals = np.mean(windspeed_test) - windspeed_test
rmse_baseline = np.sqrt(np.mean(baseline_residuals**2))
print(f'RMSE: {rmse_baseline:.2f} m/s')

In [None]:
# TODO
# save as hdf5
# separate notebook for the linear regression
# ANN with the flattened BRCS
# CNN with the BRCS
# Bonus: CNN with the BRCS and the EFF_SCATTER