In [1]:
# Benchmark neural network models

In [2]:
import datetime as dt
import os
import time
from typing import Callable, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error

from model_definitions import define_model_cnn, define_model_lstm
from predict import load_models, predict_batch
from train import train_nn_models


In [3]:
# download and save data
# !wget https://ngdc.noaa.gov/geomag/data/geomag/magnet/public.zip
# !unzip public.zip
# !wget https://ngdc.noaa.gov/geomag/data/geomag/magnet/private.zip
# !unzip private.zip
# !mkdir data
# !mv public data
# !mv private data

In [4]:
# Read the three CSV inputs for each split.
# The "public" folder feeds the *_train frames and "private" feeds the *_test frames.
data_folder = "data"
solar_train, dst_train, sunspots_train = (
    pd.read_csv(os.path.join(data_folder, "public", csv_name))
    for csv_name in ("solar_wind.csv", "dst_labels.csv", "sunspots.csv")
)
solar_test, dst_test, sunspots_test = (
    pd.read_csv(os.path.join(data_folder, "private", csv_name))
    for csv_name in ("solar_wind.csv", "dst_labels.csv", "sunspots.csv")
)

In [5]:
# train and save model: CNN
# output_folder = os.path.join("trained_models", "cnn", "benchmark")
# os.makedirs(output_folder, exist_ok=True)
# # comment out training if model is already saved on disk
# train_nn_models(solar_train, sunspots_train, dst_train, define_model_cnn, 1, output_folder)

In [6]:
# train and save model: LSTM
# NOTE: this assignment must stay active even when the training call below is
# commented out, because the evaluation cell passes `output_folder` to load_models().
output_folder = os.path.join("trained_models", "lstm", "benchmark")
# os.makedirs(output_folder, exist_ok=True)
# # comment out training if model is already saved on disk
# train_nn_models(solar_train, sunspots_train, dst_train, define_model_lstm, 1, output_folder)

In [7]:
# Measure performance on the "private" (test) data.
# Produces:
#   loss_t, loss_t_plus_1 - RMSE of prediction_t vs dst and of
#                           prediction_t_plus_1 vs the following dst value
#   prediction_time       - elapsed seconds for model loading, prediction and scoring
#   dst_test, valid_ind   - scored frame and row mask (reused by the plotting cell)

# perf_counter() is monotonic, so the elapsed time is not affected by
# system clock adjustments the way time.time() is.
t = time.perf_counter()
model_t_arr, model_t_plus_1_arr, norm_df = load_models(output_folder, 1)

# Make the cell safe to re-run: on a second execution dst_test already holds
# the columns added below, and merging again would suffix them (_x/_y) so the
# "prediction_t" lookups would raise KeyError. errors="ignore" makes this a
# no-op on the first run.
dst_test = dst_test.drop(
    columns=["prediction_t", "prediction_t_plus_1", "dst_t_plus_1"],
    errors="ignore",
)
dst_test["timedelta"] = pd.to_timedelta(dst_test["timedelta"])
# exclude times in the first week of dst_test
dst_test = dst_test.loc[dst_test["timedelta"] >= dt.timedelta(days=7)]
predictions = predict_batch(
    solar_test, sunspots_test, dst_test, model_t_arr, model_t_plus_1_arr, norm_df
)
dst_test = pd.merge(dst_test, predictions, how="left", on=["timedelta", "period"])

# Target for the t+1 prediction: the dst value of the following row within
# the same period (shift(-1) leaves NaN where no following row exists).
dst_test["dst_t_plus_1"] = dst_test.groupby("period")["dst"].shift(-1)
loss_t = np.sqrt(
    mean_squared_error(dst_test["dst"].values, dst_test["prediction_t"].values)
)
# Score t+1 only on rows that actually have a t+1 target.
valid_ind = dst_test["dst_t_plus_1"].notnull()
loss_t_plus_1 = np.sqrt(
    mean_squared_error(
        dst_test.loc[valid_ind, "dst_t_plus_1"].values,
        dst_test.loc[valid_ind, "prediction_t_plus_1"].values,
    )
)
prediction_time = time.perf_counter() - t

print(f"RMSE for time t: {loss_t:0.2f}")
print(f"RMSE for time t+1: {loss_t_plus_1:0.2f}")
print(f"Prediction time: {prediction_time:0.1f}s")


RMSE for time t: 13.78
RMSE for time t+1: 14.26


NameError: name 'training_time' is not defined

In [None]:
# Overlay the observed dst values and the time-t predictions for the rows
# selected by `valid_ind`. The x axis is the row position within that
# selection (all periods back to back), not a timestamp.
fig, ax = plt.subplots()
ax.plot(dst_test.loc[valid_ind, "dst"].values, label="actual")
ax.plot(dst_test.loc[valid_ind, "prediction_t"].values, label="predicted")
ax.set_xlabel("row position")
ax.set_ylabel("dst")
ax.set_title("dst_test: actual vs. predicted (time t)")
ax.legend()
plt.show()