In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import itertools
import joblib
import datetime
from timeit import default_timer as timer
from pathlib import Path
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from lstm_v3 import StandardScaler3D, read_np

# Run configuration shared by the scaler cache and the data generator.
# In the code visible here INPUTS / OUTPUTS are only embedded in cache file names;
# presumably they are the column indices read_np uses as features / targets —
# TODO confirm against lstm_v3.
INPUTS = [0, 1, 2, 3, 5]
OUTPUTS = [3]

# Number of preceding timesteps in each sample window (passed as `lookback`).
LOOKBACK = 10
# Number of (timestep, row) samples per batch (passed as `batch_size`).
BATCH_SIZE = 5024
# Selects StandardScaler3D vs. StandardScaler when the scaler is recreated, and the
# whole-frame vs. per-row slicing in SledDataGenerator.__getitem__.
CONVERT_TO_2D = False

#%% Scaler. Recreating takes around 3 minutes
# The fitted scaler is cached on disk; the cache file name embeds INPUTS and
# CONVERT_TO_2D so a different configuration gets its own file.
# NOTE(review): the name does NOT depend on SCALER_CREATION_DIRS, so changing the
# directories below silently reuses the old scaler unless the .pkl file is deleted.
SCALER_PATH = f'scaler_{INPUTS}_2D={CONVERT_TO_2D}.pkl'
SCALER_CREATION_DIRS = ['/home/jperez/data/sled250']

if os.path.exists(SCALER_PATH):
    print('Loading previous scaler')
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
    # Only load scaler caches that were produced by this notebook.
    SCALER = joblib.load(SCALER_PATH)
else:
    print('Recreating scaler')
    SCALER = StandardScaler3D() if CONVERT_TO_2D else StandardScaler()

    n_fitted_files = 0
    for creation_dir in SCALER_CREATION_DIRS:
        # Sorted so the incremental fit visits the files in a reproducible order
        # (glob order depends on the filesystem).
        for filepath in tqdm(sorted(Path(creation_dir).glob('*.npy'))):
            # Only X is needed to fit the scaler; Y is ignored.
            # None is passed where the generator passes SCALER, presumably to get
            # unscaled data — TODO confirm in lstm_v3.read_np.
            X, Y = read_np(filepath, None)
            SCALER.partial_fit(X)
            n_fitted_files += 1

    # Never cache an unfitted scaler: it would be loaded silently by every later run.
    if n_fitted_files == 0:
        raise FileNotFoundError(
            f'No .npy files found in {SCALER_CREATION_DIRS}; cannot fit the scaler'
        )
    joblib.dump(SCALER, SCALER_PATH)

class SledDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving lookback windows cut from per-timestep ``.npy`` files.

    On construction every timestep in ``range(start, end, step)`` is read from
    ``<data_dir>/<timestep>.npy`` via ``read_np`` (using the module-level ``SCALER``)
    and stacked along a new leading time axis. The stacked arrays are cached as an
    ``.npz`` file in the current working directory so later runs skip the reading.

    A sample is identified by a ``(timestep, row)`` pair. Its input is the
    ``lookback`` timesteps *preceding* ``timestep`` and its target is the value at
    ``timestep`` itself.

    Args:
        data_dir: Directory containing the ``<timestep>.npy`` files.
        batch_size: Number of ``(timestep, row)`` samples per batch.
        lookback: Number of preceding timesteps in each input window.
        shuffle: If true, sample order is reshuffled (global NumPy RNG) at
            construction and at the end of every epoch.
        start: First timestep to load (inclusive).
        end: Last timestep to load (exclusive).
        step: Stride between loaded timesteps.

    Raises:
        ValueError: If ``range(start, end, step)`` is empty.
    """

    def __init__(self, data_dir, batch_size, lookback, shuffle, start, end, step=1):
        # Let the Keras base class initialise its own state (newer Keras versions
        # expect subclasses to call this).
        super().__init__()
        print(f'Loading dataset {data_dir} from t={start} to t={end} with 2D={CONVERT_TO_2D}')

        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        self.lookback = lookback
        self.shuffle = shuffle
        self.start = start
        self.end = end
        self.step = step

        # Reuse a serialized copy of the data when there is one, otherwise build it.
        # The step is part of the cache key so that differently strided datasets do
        # not overwrite / reuse each other's cache. For the default step the file
        # name is unchanged, so existing caches stay valid.
        step_tag = '' if step == 1 else f'_step_{step}'
        data_path = f'{self.data_dir.stem}_{start}_{end}_in_{INPUTS}_out_{OUTPUTS}_2d_{CONVERT_TO_2D}{step_tag}.npz'
        if os.path.exists(data_path):
            # The context manager closes the archive's file handle; the arrays are
            # fully read into memory when they are indexed.
            with np.load(data_path) as cached:
                self.x_data = cached['X']
                self.y_data = cached['Y']
        else:
            timesteps = range(start, end, step)
            if len(timesteps) == 0:
                # Fail before an empty (useless) cache file gets written.
                raise ValueError(
                    f'No timesteps to load: range({start}, {end}, {step}) is empty'
                )

            x_frames = []
            y_frames = []
            for timestep in tqdm(timesteps):
                X, Y = self.__read_np__(timestep)
                x_frames.append(X)
                y_frames.append(Y)
            self.x_data = np.array(x_frames)
            self.y_data = np.array(y_frames)
            np.savez(data_path, X=self.x_data, Y=self.y_data)
        print('Debug: X=', self.x_data.shape, 'Y=', self.y_data.shape)

        # Some sanity checks
        if not CONVERT_TO_2D:
            assert self.x_data.shape[1] == self.y_data.shape[1], 'x_data and y_data have a shape mismatch in the number of CSV rows'

        # Axis 0 is always time. In the non-2D case the recorded runs print
        # X=(timesteps, rows, inputs) and Y=(timesteps, rows, 1); the 2D layout is
        # presumably (timesteps, height, width, ...) — TODO confirm.
        self.n_timesteps = self.x_data.shape[0]

        # Only timesteps with a full lookback window behind them are valid targets.
        self.list_timesteps = np.arange(lookback, self.n_timesteps)
        self.list_rows = np.arange(self.x_data.shape[1])
        # One sample ID per (timestep, row) combination.
        self.list_IDs = [(t, r) for t in self.list_timesteps for r in self.list_rows]

        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            Tuple ``(batch_x, batch_y)`` of arrays stacked along a new leading batch
            axis. In the non-2D case each input is ``x_data[t-lookback:t, row]`` and
            each target is ``y_data[t, row]``.
        """
        batch_idxs = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        batch_x = []
        batch_y = []
        for sample_idx in batch_idxs:
            timestep, row = self.list_IDs[sample_idx]
            window = slice(timestep - self.lookback, timestep)
            if CONVERT_TO_2D:
                # NOTE(review): `row` is not used here, so every timestep is served
                # once per entry of list_rows — confirm this is intended.
                X = self.x_data[window, :, :]
                Y = self.y_data[timestep, :, :]
            else:
                X = self.x_data[window, row]
                Y = self.y_data[timestep, row]

            batch_x.append(X)
            batch_y.append(Y)

        return np.array(batch_x), np.array(batch_y)

    def on_epoch_end(self):
        """Reset the sample order and, if ``shuffle`` is set, reshuffle it in place."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __read_np__(self, timestep):
        """Read ``<data_dir>/<timestep>.npy`` through ``read_np`` with the global ``SCALER``."""
        filename = os.path.join(self.data_dir, f'{timestep}.npy')
        return read_np(filename, SCALER)


Loading previous scaler


In [2]:
# NOTE(review): not referenced by any other code visible in this notebook excerpt;
# presumably an experiment/run identifier — confirm it is still needed or remove it.
R_EXPERIMENT_N = 1

In [10]:
# Load the saved Keras model from an HDF5 file in the current working directory;
# the evaluation cells below call model.evaluate on it.
model = keras.models.load_model('LSTM_v3_exp17.hdf5')

In [6]:
# Evaluate the loaded model on sled250, timesteps 510..638 (`end` is exclusive,
# hence the +1).
# shuffle=False: SledDataGenerator.__len__ drops the trailing partial batch, so with
# shuffling enabled (and no RNG seed) a different random subset of samples would be
# left out on every run and the reported loss would not be reproducible.
val_generator = SledDataGenerator('/home/jperez/data/sled250', batch_size=BATCH_SIZE, lookback=LOOKBACK, shuffle=False, start=510, end=638+1)
model.evaluate(val_generator)

Loading dataset /home/jperez/data/sled250 from t=510 to t=639 with 2D=False
Debug: X= (129, 14184, 5) Y= (129, 14184, 1)


1.47993004322052

In [11]:
# Evaluate the loaded model on sled255, timesteps 19..760 (`end` is exclusive,
# hence the +1).
# shuffle=False: SledDataGenerator.__len__ drops the trailing partial batch, so with
# shuffling enabled (and no RNG seed) a different random subset of samples would be
# left out on every run and the reported loss would not be reproducible.
# NOTE(review): the recorded run reported a loss of ~27169 here versus ~1.48 on
# sled250. SCALER is fitted only on the directories in SCALER_CREATION_DIRS
# (sled250), so sled255 is scaled with sled250 statistics — check whether that or a
# data problem explains the gap before trusting this number.
val_generator = SledDataGenerator('/home/jperez/data/sled255', batch_size=BATCH_SIZE, lookback=LOOKBACK, shuffle=False, start=19, end=760+1)
model.evaluate(val_generator)

Loading dataset /home/jperez/data/sled255 from t=19 to t=761 with 2D=False
Debug: X= (742, 14184, 5) Y= (742, 14184, 1)


27168.796875