# Pull the data from Athena and turn it into training data

Create a series of temperature strings that look like this:

[t0, t1 ... t23]

...with 24 temperatures for a given site.  Create these by sliding a window along unique temperatures by site.

TODO:

It's not actually learning...

* More than one site
* Deal with nulls
* Better windowing: https://www.tensorflow.org/tutorials/structured_data/time_series#4_create_tfdatadatasets
* Other data as inputs (pressure, wind speed etc)
* Encode time as input (e.g. hour of day)
* Longer input history
* More sites in input
* Normalise the data (temp => -1 - 1 etc)
* Wind speed and direction as vector (snazzy!)


https://www.tensorflow.org/tutorials/structured_data/time_series

In [1]:
import pandas as pd
import numpy as np
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
from sklearn.model_selection import train_test_split

cursor = connect(s3_staging_dir="s3://dantelore.queryresults/pyathena/",
                 region_name="eu-west-1", cursor_class=PandasCursor).cursor()


length = 24
sub_sequences = []

sites_df = pd.DataFrame(['WITTERING'], columns=['site_name']) #cursor.execute("select distinct(site_name) from lake.weather").as_pandas()
for site_name in sites_df['site_name']:
    df = cursor.execute(f"select observation_ts, temperature from lake.weather where site_name = '{site_name}' order by observation_ts asc").as_pandas()

    df['temperature'] = df['temperature'].astype('float32')

    train_mean = df['temperature'].mean()
    train_std = df['temperature'].std()

    df['temperature'] = df['temperature'].apply(lambda x: (x - train_mean) / train_std)     

    all_temps = df['temperature'].values

    for i in range(0, len(all_temps) - length):
        sub_sequences.append([[x] for x in all_temps[i:i+length]])

input_sequences = np.array(sub_sequences)

# Some funky numpy slicing here.  After the comma we are slicing by column in a 2D array - so xs takes all but the last column, ys takes only the last column
xs = input_sequences[:,:-1]
ys = input_sequences[:,-1]

x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_size=0.1)
#x_train = xs[0:8000]
#x_test = xs[8000:]
#y_train = ys[0:8000]
#y_test = ys[8000:]

# ((8391, 23, 1), (8391, 1), (933, 23, 1), (933, 1))
x_train.shape, y_train.shape, x_test.shape, y_test.shape


((8391, 23, 1), (8391, 1), (933, 23, 1), (933, 1))

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, LSTM, InputLayer
from tensorflow.keras.optimizers import Adam

model = Sequential([
    InputLayer((length - 1, 1)),
    LSTM(64),
    Dense(units=8, activation='relu'),
    Dense(units=1, activation='linear')
])

adam = Adam(lr=0.0001)

model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=adam, metrics=[tf.keras.metrics.MeanAbsoluteError()])

# Create the training history dataframe at the same time as the model...
history_df = pd.DataFrame(columns=['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error'])

model.summary()

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Model: "sequential"


2023-01-23 16:12:27.860849: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-23 16:12:27.861004: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super(Adam, self).__init__(name, **kwargs)


_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                16896     
                                                                 
 dense (Dense)               (None, 8)                 520       
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 17,425
Trainable params: 17,425
Non-trainable params: 0
_________________________________________________________________


In [3]:
from datetime import datetime
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tensorflow.keras.callbacks import ModelCheckpoint


tf.get_logger().setLevel('ERROR')

run_count = 1
epochs_per_run = 10
model_filename = 'data/weather_models/model1/'

cp = ModelCheckpoint(model_filename, save_best_only=True, verbose=0)

for run in range(0, run_count):
    print(f"Starting Run {run}/{run_count}")
    # Use verbose=2 here to prevent progreess bars locking up jupyter after a few hours
    history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=epochs_per_run, verbose=2, callbacks=[cp])

    history_df = pd.concat([history_df, pd.DataFrame(history.history)], ignore_index=True)

    plt.plot(history_df['mean_absolute_error'], label="Training Error")
    plt.plot(history_df['val_mean_absolute_error'], label="Validation Error")
    plt.title(f'Model Training Progress - Run {run} of {run_count}')
    plt.ylabel('Mean Absolute Error')
    plt.xlabel('Epoch')
    plt.show()

Starting Run 0/1
Epoch 1/10


2023-01-23 16:12:28.282961: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-23 16:12:29.044410: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-23 16:12:29.200714: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-23 16:12:29.358820: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [None]:
from tensorflow.keras.models import load_model

model = load_model(model_filename)

train_predictions = model.predict(x_train).flatten()
train_results = pd.DataFrame(data={'Predicted': train_predictions, "Actual": y_train.flatten()})

train_results['Predicted'] = train_results['Predicted'].apply(lambda x: x * train_std + train_mean)
train_results['Actual'] = train_results['Actual'].apply(lambda x: x * train_std + train_mean)

plt.plot(train_results['Predicted'][0:100])
plt.plot(train_results['Actual'][0:100])

In [None]:
val_predictions = model.predict(x_test).flatten()

val_results = pd.DataFrame(data={'Predicted': val_predictions, "Actual": y_test.flatten()})

val_results['Predicted'] = val_results['Predicted'].apply(lambda x: x * train_std + train_mean)
val_results['Actual'] = val_results['Actual'].apply(lambda x: x * train_std + train_mean)

plt.plot(val_results['Predicted'][0:100])
plt.plot(val_results['Actual'][0:100])