In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import itertools
import joblib
import datetime
from timeit import default_timer as timer
from pathlib import Path
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import sklearn.metrics as metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import scaling
import data

np.set_printoptions(suppress=True, linewidth=np.inf)

In [2]:
R_EXPERIMENT_N = 1

LOOKBACK = 10
BATCH_SIZE = 5024
PREDICT_AHEAD = 1
NEIGHBOR_SIZE = 1

USE_2D = False

INPUTS = ['X', 'Y', 'P', 'Vu', 'Vv', 'W.VF']
OUTPUTS = ['Vu', 'Vv']

SCALER_PATH = os.path.join('output', f'scaler_{INPUTS}_2D={USE_2D}.pkl')
SCALER_CREATION_DIRS = ['/home/jperez/data/sled250', '/home/jperez/data/sled255']

sc = scaling.load_or_create(SCALER_PATH, SCALER_CREATION_DIRS, INPUTS, OUTPUTS, USE_2D)

Loading previous scaler


Baseline Work

In [29]:
print(batch_y.shape, batch_x[:, :, :2].shape)

(5024, 1, 2) (5024, 10, 2)


In [20]:
from sklearn.metrics import mean_squared_error
val_generator = data.SledDataGenerator('/home/jperez/data/sled250', batch_size=BATCH_SIZE, lookback=1, predict_ahead=1, neighbor_size=1, shuffle=True, use_2D=USE_2D, 
                                        inputs=['Vu', 'Vv', 'P'], outputs=OUTPUTS, scaler=sc, 
                                        start=510, end=638+1)

# Baseline 1: No change
all_true = []
all_pred = []
for batch_x, batch_y in tqdm(val_generator):
    all_true.extend(batch_y.reshape(-1, 2))
    all_pred.extend(batch_x[:, :, :2].reshape(-1, 2))
print(mean_squared_error(all_true, all_pred))

Loading dataset /home/jperez/data/sled250 from t=510 to t=639 with 2D=False
Debug: X= (129, 14184, 3) Y= (129, 14184, 2)


100%|██████████| 361/361 [00:05<00:00, 70.58it/s]


6.371095993326289


In [33]:
# Baseline 2: Mean of timesteps
# Lookback 10 - MSE 142
# Lookback 2 - MSE 13.7 | Vu=21.86, Vv=5.54
# Lookback 1 - MSE 6.37 | Vu=10.05, Vv=2.7
val_generator = data.SledDataGenerator('/home/jperez/data/sled250', batch_size=BATCH_SIZE, lookback=2, predict_ahead=1, neighbor_size=1, shuffle=True, use_2D=USE_2D, 
                                        inputs=['Vu', 'Vv', 'P'], outputs=OUTPUTS, scaler=sc, 
                                        start=510, end=638+1)

all_true = []
all_pred = []
for batch_x, batch_y in tqdm(val_generator):
    all_true.extend(batch_y.reshape(-1, 2))
    all_pred.extend(np.mean(batch_x[:, :, :2], axis=1))
print(mean_squared_error(all_true, all_pred, multioutput='raw_values'))

Loading dataset /home/jperez/data/sled250 from t=510 to t=639 with 2D=False
Debug: X= (129, 14184, 3) Y= (129, 14184, 2)


100%|██████████| 358/358 [00:05<00:00, 66.41it/s]


[21.8616049   5.53655403]


In [3]:
train_generator = data.SledDataGenerator('/home/jperez/data/sled250', batch_size=BATCH_SIZE, lookback=LOOKBACK, predict_ahead=PREDICT_AHEAD, neighbor_size=NEIGHBOR_SIZE, shuffle=True, use_2D=USE_2D, inputs=INPUTS, outputs=OUTPUTS, scaler=sc, 
                                        start=1, end=510+1)


Loading dataset /home/jperez/data/sled250 from t=1 to t=511 with 2D=False
Debug: X= (510, 14184, 6) Y= (510, 14184, 2)


In [7]:
y_data = train_generator[0][1]
print(y_data.reshape(-1, 2).shape)

(5024, 2)


In [5]:
print(train_generator[0][0].shape, train_generator[0][1].shape)

(5024, 10, 6) (5024, 1, 2)


In [6]:
from numpy import array
from numpy.random import uniform
from numpy import hstack
def create_data(n):
 x1 = array([i/100+uniform(-1,3) for i in range(n)]).reshape(n,1)
 x2 = array([i/100+uniform(-3,5)+2 for i in range(n)]).reshape(n,1)
 x3 = array([i/100+uniform(-6,5)-3 for i in range(n)]).reshape(n,1)

 y1= [x1[i]-x2[i]+x3[i]+uniform(-2,2) for i in range(n)]
 y2= [x1[i]+x2[i]-x3[i]+5+uniform(-1,3) for i in range(n)]
 X = hstack((x1, x2, x3))
 Y = hstack((y1, y2))
 return X, Y
x, y = create_data(n=400)
print(y.shape)

(400, 2)


In [176]:
train_generator = data.SledDataGenerator('/home/jperez/data/sled255', batch_size=BATCH_SIZE, lookback=LOOKBACK, predict_ahead=PREDICT_AHEAD, neighbor_size=NEIGHBOR_SIZE, shuffle=True, use_2D=USE_2D, inputs=INPUTS, outputs=OUTPUTS, scaler=sc, 
                                        start=19, end=760+1)

Loading dataset /home/jperez/data/sled255 from t=19 to t=761 with 2D=False


100%|██████████| 742/742 [00:00<00:00, 1388.95it/s]


Debug: X= (742, 14184, 5) Y= (742, 14184, 1)


In [180]:
timestep_idx = 0
row_idx = 50

x_data = train_generator.x_data

x1, y1, p1, vu1, wvf1 = x_data[timestep_idx, row_idx]
print(f'{x1:.3f}, {y1:.3f}, max_x={np.max(x_data[:,:,0])}, min_x={np.min(x_data[:,:,0])} , max_y={np.max(x_data[:,:,1])}, min_y={np.min(x_data[:,:,1])}')

around = [[x1+0.025, y1],
        [x1-0.025, y1],
        [x1, y1+0.025],
        [x1, y1-0.025],
        [x1-0.025, y1-0.025],
        [x1+0.025, y1-0.025],
        [x1-0.025, y1+0.025],
        [x1+0.025, y1+0.025]
]
around_t = ['Right', 'Left', 'Top', 'Bottom', 'BottomLeft', 'BottomRight', 'TopLeft', 'TopRight']

for idx, (x3, y3) in enumerate(around):
    s = np.sqrt((x_data[timestep_idx, :, 0]-x3)**2 + (x_data[timestep_idx, :, 1]-y3)**2)
    print(around_t[idx], 'is', x3, y3, 'which is closest to row', np.argmin(s))

distances = []
for row in range(x_data.shape[1]):
    if row == row_idx:
        continue
    x2, y2, p2, vu2, wvf2 = x_data[timestep_idx, row]
    distances.append([np.around(np.sqrt((x2-x1)**2+(y2-y1)**2), decimals=4), row, x_data[timestep_idx, row, :2]])

sorted(distances, key=lambda x: x[0])[:8]

0.025, 0.025, max_x=3.9749999, min_x=-0.0250000004 , max_y=3.9749999, min_y=-0.0250000004
Right is 0.0500000004 0.0250000004 which is closest to row 8
Left is 3.999999984016789e-10 0.0250000004 which is closest to row 8
Top is 0.0250000004 0.0500000004 which is closest to row 8
Bottom is 0.0250000004 3.999999984016789e-10 which is closest to row 8
BottomLeft is 3.999999984016789e-10 3.999999984016789e-10 which is closest to row 14046
BottomRight is 0.0500000004 3.999999984016789e-10 which is closest to row 8
TopLeft is 3.999999984016789e-10 0.0500000004 which is closest to row 8
TopRight is 0.0500000004 0.0500000004 which is closest to row 12


[[0.0, 8, array([0.025, 0.025])],
 [0.0, 11, array([0.025, 0.025])],
 [0.0, 48, array([0.025, 0.025])],
 [0.0, 52, array([0.025, 0.025])],
 [0.0, 54, array([0.025, 0.025])],
 [0.0, 56, array([0.025, 0.025])],
 [0.0, 58, array([0.025, 0.025])],
 [0.0, 60, array([0.025, 0.025])]]

In [157]:
x_data[timestep_idx, :, :2]

array([[ 0.   ,  0.25 ],
       [ 0.   ,  0.275],
       [-0.025,  0.275],
       ...,
       [ 0.25 ,  1.975],
       [ 0.225,  1.975],
       [ 0.2  ,  1.975]])

In [160]:
x_data[0, 7, :2]

array([-0.025,  0.2  ])

In [159]:
for x in np.arange(-0.025, 3.975, 0.025):
    for y in np.arange(0, 2.2, 0.025):
        idx = np.argmin((np.sqrt(x_data[0, :, 0]-x)**2 + (x_data[0, :, 1]-y)**2))
        print(f'Row {idx} for x={x}, y={y}')

Row 10 for x=-0.025, y=0.0
Row 9 for x=-0.025, y=0.025
Row 7 for x=-0.025, y=0.05
Row 7 for x=-0.025, y=0.07500000000000001
Row 7 for x=-0.025, y=0.1
Row 7 for x=-0.025, y=0.125
Row 7 for x=-0.025, y=0.15000000000000002
Row 7 for x=-0.025, y=0.17500000000000002
Row 7 for x=-0.025, y=0.2
Row 5 for x=-0.025, y=0.225
Row 3 for x=-0.025, y=0.25
Row 2 for x=-0.025, y=0.275
Row 808 for x=-0.025, y=0.30000000000000004
Row 809 for x=-0.025, y=0.325
Row 810 for x=-0.025, y=0.35000000000000003
Row 811 for x=-0.025, y=0.375
Row 812 for x=-0.025, y=0.4
Row 813 for x=-0.025, y=0.42500000000000004
Row 970 for x=-0.025, y=0.45
Row 971 for x=-0.025, y=0.47500000000000003
Row 972 for x=-0.025, y=0.5
Row 973 for x=-0.025, y=0.525
Row 974 for x=-0.025, y=0.55
Row 975 for x=-0.025, y=0.5750000000000001
Row 2778 for x=-0.025, y=0.6000000000000001
Row 2779 for x=-0.025, y=0.625
Row 2780 for x=-0.025, y=0.65
Row 2781 for x=-0.025, y=0.675
Row 2782 for x=-0.025, y=0.7000000000000001
Row 2953 for x=-0.025, y=0

  idx = np.argmin((np.sqrt(x_data[0, :, 0]-x)**2 + (x_data[0, :, 1]-y)**2))



Row 0 for x=0.07500000000000001, y=1.175
Row 0 for x=0.07500000000000001, y=1.2000000000000002
Row 0 for x=0.07500000000000001, y=1.225
Row 0 for x=0.07500000000000001, y=1.25
Row 0 for x=0.07500000000000001, y=1.2750000000000001
Row 0 for x=0.07500000000000001, y=1.3
Row 0 for x=0.07500000000000001, y=1.3250000000000002
Row 0 for x=0.07500000000000001, y=1.35
Row 0 for x=0.07500000000000001, y=1.375
Row 0 for x=0.07500000000000001, y=1.4000000000000001
Row 0 for x=0.07500000000000001, y=1.425
Row 0 for x=0.07500000000000001, y=1.4500000000000002
Row 0 for x=0.07500000000000001, y=1.475
Row 0 for x=0.07500000000000001, y=1.5
Row 0 for x=0.07500000000000001, y=1.5250000000000001
Row 0 for x=0.07500000000000001, y=1.55
Row 0 for x=0.07500000000000001, y=1.5750000000000002
Row 0 for x=0.07500000000000001, y=1.6
Row 0 for x=0.07500000000000001, y=1.625
Row 0 for x=0.07500000000000001, y=1.6500000000000001
Row 0 for x=0.07500000000000001, y=1.675
Row 0 for x=0.07500000000000001, y=1.700000

In [97]:
unique_x = np.unique(np.around(train_generator.x_data[timestep_idx, :, 0], decimals=5))
print(len(unique_x), unique_x)
print(np.unique([np.around(unique_x[i]-unique_x[i+1], decimals=3) for i in range(unique_x.shape[0]-1)]))

3085 [-0.025  0.     0.025 ...  3.925  3.95   3.975]
[-0.025 -0.024 -0.023 -0.022 -0.021 -0.02  -0.019 -0.018 -0.017 -0.015 -0.014 -0.009 -0.008 -0.006 -0.005 -0.004 -0.003 -0.002 -0.001 -0.   ]


In [78]:
import pandas as pd
df = pd.DataFrame(train_generator.x_data[timestep_idx, :])
df.to_csv('example.csv')

In [10]:
model = keras.models.load_model('LSTM_v3_exp17.hdf5')

In [6]:
val_generator = data.SledDataGenerator('/home/jperez/data/sled255', batch_size=BATCH_SIZE, lookback=LOOKBACK, shuffle=True, use_2D=USE_2D, inputs=INPUTS, outputs=OUTPUTS, scaler=sc, start=19, end=760+1)
model.evaluate(val_generator)

Loading dataset /home/jperez/data/sled250 from t=510 to t=639 with 2D=False
Debug: X= (129, 14184, 5) Y= (129, 14184, 1)


1.47993004322052