# In [None]:
from pipetorch import flight_passengers
from pipetorch.train import *
from pipetorch.data import PTDataFrame
import time
from datetime import datetime, timedelta
from sklearn.metrics import *
from trainer.DataFrameLoader import *
import sys

# Hyper-parameters shared by every model trained in this script.
config = {
    # Seed handed to both torch and numpy before each training run.
    "random_state": 0,
    # Mini-batch size used when the data is turned into a databunch.
    "batch_size": 5,
    # Window length passed to the sequencing step of the data pipeline.
    "window_size": 12,
    # Width and depth of the recurrent layer.
    "hidden_size": 95,
    "num_layers": 1,
    # Class names looked up in the loss / recurrent-layer tables
    # (the tables offer "MSELoss"/"HuberLoss" and "LSTM"/"GRU").
    "loss": "MSELoss",
    "rnn": "GRU",
}


def factoryzero_date_parser(df: pd.DataFrame) -> pd.DataFrame:
    """Index a Factory Zero sheet by its timestamps, rounded to whole minutes.

    Args:
        df: Frame with a "Timestamp" column holding Unix epoch seconds.

    Returns:
        A new frame indexed by the rounded timestamps; "Timestamp" is moved
        from the columns into the index. The frame passed in is not modified.
    """
    # Frequency rounding of datetimes goes through the ``.dt`` accessor.
    # ``Series.round`` is the numeric (decimals) API and is not a reliable
    # way to round a datetime column to the minute.
    timestamps = pd.to_datetime(df["Timestamp"], unit="s").dt.round("min")
    # ``assign`` works on a copy, so the caller's frame keeps its raw column.
    return df.assign(Timestamp=timestamps).set_index("Timestamp")


def knmi_date_parser(df: pd.DataFrame) -> pd.DataFrame:
    """Index a KNMI frame by a timestamp built from its date and hour columns.

    Args:
        df: Frame with a "Date(YYYYMMDD)" column (dates written as YYYYMMDD)
            and an integer "Hour" column.

    Returns:
        A new frame without the two source columns, indexed by an unnamed
        DatetimeIndex. The frame passed in is not modified.
    """
    dates = pd.to_datetime(df["Date(YYYYMMDD)"].astype(str), format="%Y%m%d")
    # Hour h is mapped to clock hour h - 1, so hour 24 lands on 23:00 of the
    # same date. NOTE(review): presumably KNMI numbers hours 1..24 - confirm.
    offsets = pd.to_timedelta(df["Hour"] - 1, unit="h")
    # Computed column-wise instead of parsing one string per row; going
    # through numpy drops the Series name so the index stays unnamed.
    timestamps = pd.DatetimeIndex((dates + offsets).to_numpy())
    return df.drop(columns=["Date(YYYYMMDD)", "Hour"]).set_index(timestamps)


def _knmi_job(target, *other_features):
    """Describe one KNMI imputation job.

    Every KNMI field is addressed on sheet "0" of the Rotterdam file; the
    target is also used as the first input feature.
    """
    fields = [("0", name) for name in (target, *other_features)]
    return {
        "conf": "knmi",
        "file": "344 Rotterdam.csv",
        "date_parser": knmi_date_parser,
        "target": fields[0],
        "features": fields,
    }


def _factory_zero_job(file, target, *other_features):
    """Describe one Factory Zero imputation job.

    Fields are (sheet, column) pairs; the target is also used as the first
    input feature.
    """
    return {
        "conf": "factory zero",
        "file": file,
        "date_parser": factoryzero_date_parser,
        "target": target,
        "features": [target, *other_features],
    }


# One entry per model to train: where the data lives, how to parse its
# timestamps, which field to predict and which fields to feed the model.
to_impute = [
    _knmi_job(
        "Temperature",
        "Global Radiation",
        "Dew Temperature",
        "Relative atmospheric humidity",
    ),
    _knmi_job(
        "Relative atmospheric humidity",
        "Sunshine duration",
        "Global Radiation",
        "Horizontal visibility",
    ),
    _knmi_job(
        "Global Radiation",
        "Relative atmospheric humidity",
        "Temperature",
        "Sunshine duration",
    ),
    _factory_zero_job(
        "099.xlsx",
        ("alklimaHeatPump", "flow_temp"),
        ("alklimaHeatPump", "return_temp"),
        ("energyHeatpump", "power"),
    ),
    _factory_zero_job(
        "099.xlsx",
        ("alklimaHeatPump", "op_mode"),
        ("ventilation", "outdoor_temp"),
    ),
    _factory_zero_job(
        "099.xlsx",
        ("smartMeter", "power"),
        ("ventilation", "outdoor_temp"),
        ("ventilation", "room_temp"),
        ("solar", "power"),
    ),
    _factory_zero_job(
        "054.xlsx",
        ("co2sensor", "co2"),
        ("co2sensor", "voc"),
    ),
]


import os


class RNN(nn.Module):
    """Recurrent regressor whose output is added to the last input value.

    The recurrent layer is built with ``batch_first=True``, so ``X`` is
    indexed as (batch, time step, feature).

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent layer.
        num_layers: Number of stacked recurrent layers.
        output_size: Width of the linear output layer.
        rnn: Recurrent layer class to instantiate (e.g. nn.LSTM, nn.GRU).
    """

    def __init__(self, input_size, hidden_size=100, num_layers=1, output_size=1, rnn=nn.LSTM):
        super().__init__()
        # Attribute names l1/l2 are the keys of the saved state_dict; keep them.
        self.l1 = rnn(input_size, hidden_size, num_layers, batch_first=True)
        self.l2 = nn.Linear(hidden_size, output_size)

    def forward(self, X):
        h, _ = self.l1(X)
        # Only the hidden state of the final time step feeds the output layer.
        h = h[:, -1, :]
        y = self.l2(h)
        # Residual connection: add the last feature of the last time step.
        # NOTE(review): assumes the target is the last feature column -
        # confirm against the column order DataFrameLoader produces.
        y = y + X[:, -1, -1:]
        return y


# Name -> class lookup tables; config["rnn"] / config["loss"] select an entry.
rnns = {m.__name__: m for m in [nn.LSTM, nn.GRU]}
losses = {m.__name__: m for m in [nn.MSELoss, nn.HuberLoss]}

# Only the first MAX_ROWS rows of each dataset are used for training.
MAX_ROWS = 10000

# Create the output directory up front so a missing folder cannot fail the
# save step after a complete training run.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

for field in to_impute:
    target_sheet, target_name = field["target"]
    print(f"Training {target_sheet} - {target_name}")

    # Re-seed for every model so each run starts from the same RNG state.
    torch.manual_seed(config["random_state"])
    np.random.seed(config["random_state"])

    # Load the file and select the target and feature columns.
    filename = "../pipeline/data/" + field["file"]
    dfloader = DataFrameLoader.from_file(filename, date_parser=field["date_parser"])
    dfloader.add_targets(target_name, sheet_name=target_sheet)
    for feature in field["features"]:
        dfloader.add_features(feature[1], sheet_name=feature[0])

    # Optional time-derived features, currently disabled:
    #     dfloader.add_index_as_feature("Timestamp")
    #     dfloader.add_index_delta_as_feature("Timedelta")

    df = dfloader.to_ptdataframe()
    # The timestamp index is discarded; rows are addressed by position.
    df = df.reset_index(drop=True)

    df = df.head(MAX_ROWS)
    df = df.astype(np.float32).sequence(config["window_size"]).split(0.2).scale()
    data = df.to_databunch(batch_size=config["batch_size"])

    model = RNN(
        input_size=len(dfloader.features),
        hidden_size=config["hidden_size"],
        num_layers=config["num_layers"],
        output_size=1,
        rnn=rnns[config["rnn"]],
    )

    t = trainer(model, losses[config["loss"]](), data, metrics=r2_score, gpu=True)

    # Two training phases: a longer one with larger learning rates, then a
    # short one with much smaller ones. NOTE(review): t.lowest() presumably
    # restores the checkpoint kept by save_lowest='loss' - confirm in pipetorch.
    t.train(15, lr=(3e-4, 3e-2), report_frequency=1, save_lowest='loss')
    t.lowest()
    t.train(5, lr=(1e-6, 1e-5), report_frequency=1, save_lowest='loss')
    t.lowest()

    # Save the weights; spaces in the target name become underscores.
    torch.save(model.state_dict(), f'{MODEL_DIR}/rnn-{target_name.replace(" ", "_")}.pt')