In [None]:
# UNCOMMENT IF ON REMOTE JUPYTER
# !pip install plotly pandas protobuf==3.20.0 tensorflow==2.6.2 scikit-learn numpy

In [ ]:
from sklearn.model_selection import TimeSeriesSplit
import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
from tensorflow.keras import Input, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Bidirectional
from tensorflow.keras.optimizers import  Nadam
from sklearn.compose import ColumnTransformer
from keras.layers import LeakyReLU
from data_util_common import canopy_dataset, train_val_test_split
from lstm_util import prepare_data, evaluate

In [ ]:
# Fix TensorFlow's global random seed so that TF random operations are
# repeatable from run to run.
tf.random.set_seed(42)

In [ ]:
def create_model(input_shape, weights=None):
    """Build and compile the stacked bidirectional-LSTM model.

    The network consists of four ``Bidirectional(LSTM)`` layers (168, 72, 48
    and 24 units, each returning the full sequence), followed by a
    ``TimeDistributed(Dense(1))`` layer and a ``LeakyReLU`` activation. It is
    compiled with the Nadam optimizer (learning rate 1e-4) and MAE loss.

    Args:
        input_shape: Shape of a single input sample, forwarded to
            ``Input(shape=...)``.
        weights: Optional collection of weights loaded with
            ``model.set_weights`` before compiling. ``None`` or an empty
            collection keeps the freshly initialised weights.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Input(shape=input_shape))
    for units in (168, 72, 48, 24):
        model.add(Bidirectional(LSTM(units, return_sequences=True)))
    model.add(TimeDistributed(Dense(1)))
    model.add(LeakyReLU())

    # Explicit None/length check instead of truthiness: ``if weights:`` raises
    # ValueError when the weights are handed over as a NumPy array.
    if weights is not None and len(weights) > 0:
        model.set_weights(weights)

    model.compile(optimizer=Nadam(learning_rate=0.0001), loss="mae")
    return model

In [ ]:
if __name__ == '__main__':
    # Time-series cross-validation of the LSTM model: for every fold the data
    # is scaled, split into train/val/test, a fresh model is trained, and the
    # test MAE is recorded.
    data, label = canopy_dataset()

    # Keep only the rows before the first split index of an [80, 20] split.
    # NOTE(review): presumably this holds out the final 20% of the data from
    # cross-validation -- confirm against train_val_test_split.
    split_idx = train_val_test_split(pd.to_datetime(data.index), [80, 20])
    train_val_size = split_idx[0]
    data = data.iloc[:train_val_size, :]

    # Restrict the frame to the label column followed by the numeric inputs.
    numeric_features = ['Irradiance_1_Wm2']
    features = [label] + numeric_features

    data = data[features]

    print(data.columns)
    label_idx = list(data.columns).index(label)
    print(label_idx)
    T = 24              # rows per chunk; the log messages treat a chunk as one day
    past = T * 7        # number of past rows used as model input
    future = T
    step = 1
    batch_size = 512
    sequence_length = int(past / step)

    # Cut the data into consecutive, non-overlapping chunks of T rows so that
    # TimeSeriesSplit assigns whole chunks to a fold.
    # NOTE(review): the range stops at len - T, so when the row count is an
    # exact multiple of T the last full chunk is skipped -- confirm intent.
    X = [data.iloc[i:i+T, :] for i in range(0, len(data.index)-T, T)]
    timestamp = [data.index[i:i+T] for i in range(0, len(data.index)-T, T)]
    mae = []

    # Positional column index -> feature name, used to restore column names
    # after the chunks have been flattened into plain arrays.
    names = dict(enumerate(features))

    tscv = TimeSeriesSplit(n_splits=5)
    for i, (train_index, test_index) in enumerate(tscv.split(X)):
        print(f"Fold {i} - TRAIN DAYS {len(train_index)} TEST DAYS {len(test_index)}")

        # Flatten the selected chunks back into one (rows, features) table.
        train = np.asarray([X[t] for t in train_index])
        train = train.reshape((T * len(train_index), len(features)))
        train = pd.DataFrame(train).rename(columns=names)

        train_timestamp = np.asarray([timestamp[t] for t in train_index])
        train_timestamp = train_timestamp.reshape((T * len(train_timestamp), 1))
        train_timestamp = pd.to_datetime([x[0] for x in train_timestamp], format='%Y-%m-%d %H:%M:%S')

        # Separate scalers for label and inputs; the 'label' scaler is handed
        # to evaluate() further down. The pipeline is fitted on the fold's
        # training chunks (before the train/val split) and reused for test.
        pipeline = ColumnTransformer([
            ('label', MinMaxScaler(), [label]),
            ('num', MinMaxScaler(), numeric_features),
        ], remainder='passthrough')

        train = pipeline.fit_transform(train)

        # temp = pd.DataFrame(train)
        # temp.index = train_timestamp
        # temp.to_csv(f"lstm/cval/{i}_train.csv")

        # Split the fold's training rows into train and validation at the
        # first index returned for a [70, 30] split.
        split = train_val_test_split(train_timestamp, percentage=[70, 30])
        val = train[split[0]:, :]
        val_size = len(val)
        train = train[:split[0], :]
        train_size = len(train)

        test = np.asarray([X[t] for t in test_index])
        test = test.reshape((T * len(test_index), len(features)))
        test = pd.DataFrame(test).rename(columns=names)
        test = pipeline.transform(test)
        test_size = len(test)

        test_timestamp = np.asarray([timestamp[t] for t in test_index])
        test_timestamp = test_timestamp.reshape((T * len(test_index), 1))
        test_timestamp = pd.to_datetime([x[0] for x in test_timestamp], format='%Y-%m-%d %H:%M:%S')

        print(f"TRAIN {len(train)} - VAL {len(val)} - TEST {len(test)}")

        # PREPARE DATA ---------------------------------------------------------------------------------------------------
        # TRAIN
        dataset_train = prepare_data(train, 0, train_size-past, past, train_size, sequence_length, 1, batch_size, label_idx)

        # VAL
        dataset_val = prepare_data(val, 0, val_size-past, past, val_size, sequence_length, 1, batch_size, label_idx)

        # TEST
        dataset_test = prepare_data(test, 0, test_size, past, test_size, sequence_length, 1, batch_size, label_idx)

        # Peek at one batch to obtain the input shape for the model.
        for batch in dataset_train.take(1):
            inputs, targets = batch

        print(f"Input shape:  {inputs.numpy().shape}")
        print(f"Target shape: {targets.numpy().shape}")

        # One checkpoint file per fold. The original used a stale index `k`
        # left over from an unrelated loop, so every fold wrote to the same
        # file and overwrote the previous fold's best weights.
        checkpoint_filepath = f'lstm/cval/checkpoint_{i}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                            monitor="val_loss",
                            save_weights_only=True,
                            save_best_only=True, verbose=0)

        model = create_model((inputs.shape[1], inputs.shape[2]), None)

        history = model.fit(
            dataset_train,
            epochs=1000,
            validation_data=dataset_val,
            shuffle=False,
            callbacks=[EarlyStopping(monitor="val_loss", patience=30, min_delta=0.01),
                       checkpoint
                       ],
            verbose=1
        )

        # Evaluate with the best weights saved by the checkpoint callback,
        # not the weights from the last epoch.
        model.load_weights(checkpoint_filepath)

        days = int((test_size-past)/T)
        print(f"DAYS {days}")
        results = evaluate(dataset_test, model, days, pipeline.named_transformers_['label'])
        # The results are indexed by the test timestamps after the first
        # `past` rows.
        results.index = test_timestamp[past:]
        results.to_csv(f"lstm/cval/cval_res_{i}.csv")
        mae.append(mean_absolute_error(results['y_true'], results['y_pred']))

    print(f"MAE: {np.mean(mae)}")

    # Persist the per-fold MAE values and show their distribution.
    with open('lstm/lstm_cavl_mae.npy', 'wb') as f:
        np.save(f, mae)

    fig = px.box(np.asarray(mae))
    fig.show()