In [1]:
%matplotlib inline

import numpy as np
import matplotlib as mpl
import pandas as pd

# TensorFlow
import os
HOME = os.getenv('HOME')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # to get rid of the TF warnings
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras.layers import Dense, LSTM, Reshape, Flatten
from tensorflow.keras.utils import get_file

from tqdm import tqdm

In [2]:
# Get the data
url = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip'
csv_path = f'{HOME}/tensorflow_datasets/climate/'
csv_path += 'jena_climate_2009_2016.csv.zip'

zip_path = get_file(origin = url, fname = csv_path,
                    archive_format='zip',extract=True)

In [3]:
df = pd.read_csv(csv_path)

In [4]:
df.head()

Unnamed: 0,Date Time,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
0,01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
1,01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1
2,01.01.2009 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6
3,01.01.2009 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0
4,01.01.2009 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3


In [5]:
TRAIN_SPLIT = 300000

In [6]:
tf.random.set_seed(13)

In [7]:
def create_time_steps(length):
  return list(range(-length, 0))

In [8]:
def show_plot(plot_data, delta, title):
  labels = ['History', 'True Future', 'Model Prediction']
  marker = ['.-', 'rx', 'go']
  time_steps = create_time_steps(plot_data[0].shape[0])
  if delta:
    future = delta
  else:
    future = 0

  plt.title(title)
  for i, x in enumerate(plot_data):
    if i:
      plt.plot(future, plot_data[i], marker[i], markersize=10,
               label=labels[i])
    else:
      plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i])
  plt.legend()
  plt.xlim([time_steps[0], (future+5)*2])
  plt.xlabel('Time-Step')
  return plt

In [36]:
def multivariate_data(dataset, target, start_index, end_index, history_size,
                                       target_size, step, single_step=False):
    print('start_index:',start_index)
    print('end_index:',end_index)
    data,labels = [],[]

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i, step)
        data.append(dataset[indices])
        if single_step:
            labels.append(target[i+target_size])
        else:
            labels.append(target[i:i+target_size])
    return np.array(data), np.array(labels)

In [37]:
past_history = 720
future_target = 72
STEP = 6

features_considered = ['p (mbar)', 'T (degC)', 'rho (g/m**3)']

features = df[features_considered]
features.index = df['Date Time']
features.head()

dataset = features.values
data_mean = dataset[:TRAIN_SPLIT].mean(axis=0)
data_std = dataset[:TRAIN_SPLIT].std(axis=0)

dataset = (dataset-data_mean)/data_std

x_train_multi, y_train_multi = multivariate_data(dataset, dataset[:, 1], 0,
                                                 TRAIN_SPLIT, past_history,
                                                 future_target, STEP)
x_val_multi, y_val_multi = multivariate_data(dataset, dataset[:, 1],
                                             TRAIN_SPLIT, None, past_history,
                                             future_target, STEP)

start_index: 0
end_index: 300000
start_index: 300000
end_index: None


Let's check out a sample data-point.

In [38]:
print(x_train_multi.shape)
print(y_train_multi.shape)

(299280, 120, 3)
(299280, 72)


## Ahora mi intento

In [39]:
def get_sample(DF,ind,Nhistory,Nfuture,columns_in,columns_out):
    Nsamples = len(df.index)
    if ind-Nhistory > 0 and ind+Nfuture < Nsamples:
        inp = DF.iloc[ind-Nhistory:ind][columns_in]
        out = DF.iloc[ind:ind+Nfuture][columns_out]
        return inp.values, out.values
    else: return None,None

In [40]:
def prepare_dataset(DF,inds,Nhistory,Nfuture,columns_in,columns_out):
    inps, outs = [],[]
    for ind in tqdm(inds):
        inp,out = get_sample(DF,ind,Nhistory,Nfuture,columns_in,columns_out)
        if inp is None or out is None: continue
        inps.append( inp )
        outs.append( out )
    inps = np.array(inps)
    outs = np.array(outs)
    if len(inps.shape) == 1: inps = np.expand_dims(inps, axis=0)
    if len(outs.shape) == 1: outs = np.expand_dims(outs, axis=0)
    return inps,outs

In [48]:
# Normalize!!
cols = df.columns.values[1:]
df_num = df[cols]
mean = df_num.mean()
std = df_num.std()
dfn = df.copy()    # dfn = normalized
dfn['Date Time'] = pd.to_datetime(dfn['Date Time'],format='%d.%m.%Y %H:%M:%S')
dfn = dfn.loc[dfn['Date Time'].apply(lambda x: x.minute) == 0]
dfn[cols] = (dfn[cols]-mean)/std

Nhistory = int(3*24*60/10)
Nfuture  = int(0.5*24*60/10)
columns_in = ['p (mbar)', 'T (degC)', 'rho (g/m**3)']

columns_out = ['T (degC)']

# Random split for train and test
inds = np.array(range(Nhistory,len(dfn.index)-Nfuture))
#np.random.shuffle(inds)

Ntrain = int(len(inds)*0.8)
Nvalid = int(len(inds)*0.199)
Ntest  = int(len(inds)*0.001)

train_ind = inds[:Ntrain]
valid_ind = inds[Ntrain:Ntrain+Nvalid]
test_ind  = inds[Ntrain+Nvalid:]   # Ntrain+Nvalid+Ntest]

train_inp,train_out = prepare_dataset(dfn, train_ind,
                                      Nhistory, Nfuture,
                                      columns_in, columns_out)

100%|██████████| 55670/55670 [01:08<00:00, 807.41it/s]


In [49]:
print(train_inp.shape)
print(train_out.shape)
print(train_inp[Nhistory])

(55669, 432, 3)
(55669, 72, 1)
[[-1.14288427 -0.98299755  0.63732631]
 [-1.10340341 -1.01623842  0.68035298]
 [-1.17638318 -0.97112581  0.61756406]
 ...
 [-2.11794185 -1.08628169  0.47297445]
 [-2.09879962 -1.02929733  0.42219297]
 [-2.16819022 -1.05897668  0.43219918]]


## Hasta aquí

In [None]:
print ('Single window of past history : {}'.format(x_train_multi[0].shape))
print ('\n Target temperature to predict : {}'.format(y_train_multi[0].shape))

In [None]:
train_data_multi = tf.data.Dataset.from_tensor_slices((x_train_multi, y_train_multi))
train_data_multi = train_data_multi.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

val_data_multi = tf.data.Dataset.from_tensor_slices((x_val_multi, y_val_multi))
val_data_multi = val_data_multi.batch(BATCH_SIZE).repeat()

Plotting a sample data-point.

In [None]:
def multi_step_plot(history, true_future, prediction):
  plt.figure(figsize=(12, 6))
  num_in = create_time_steps(len(history))
  num_out = len(true_future)

  plt.plot(num_in, np.array(history[:, 1]), label='History')
  plt.plot(np.arange(num_out)/STEP, np.array(true_future), 'bo',
           label='True Future')
  if prediction.any():
    plt.plot(np.arange(num_out)/STEP, np.array(prediction), 'ro',
             label='Predicted Future')
  plt.legend(loc='upper left')
  plt.show()

In this plot and subsequent similar plots, the history and the future data are sampled every hour.

In [None]:
for x, y in train_data_multi.take(1):
  multi_step_plot(x[0], y[0], np.array([0]))

Since the task here is a bit more complicated than the previous task, the model now consists of two LSTM layers. Finally, since 72 predictions are made, the dense layer outputs 72 predictions.

In [None]:
multi_step_model = tf.keras.models.Sequential()
multi_step_model.add(tf.keras.layers.LSTM(32,
                                          return_sequences=True,
                                          input_shape=x_train_multi.shape[-2:]))
multi_step_model.add(tf.keras.layers.LSTM(16, activation='relu'))
multi_step_model.add(tf.keras.layers.Dense(72))

multi_step_model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0), loss='mae')

Let's see how the model predicts before it trains.

In [None]:
for x, y in val_data_multi.take(1):
  print (multi_step_model.predict(x).shape)

In [None]:
multi_step_history = multi_step_model.fit(train_data_multi, epochs=EPOCHS,
                                          steps_per_epoch=EVALUATION_INTERVAL,
                                          validation_data=val_data_multi,
                                          validation_steps=50)

In [None]:
plot_train_history(multi_step_history, 'Multi-Step Training and validation loss')

#### Predict a multi-step future
Let's now have a look at how well your network has learnt to predict the future.

In [None]:
for x, y in val_data_multi.take(3):
  multi_step_plot(x[0], y[0], multi_step_model.predict(x)[0])

## Next steps
This tutorial was a quick introduction to time series forecasting using an RNN. You may now try to predict the stock market and become a billionaire.

In addition, you may also write a generator to yield data (instead of the uni/multivariate_data function), which would be more memory efficient. You may also check out this [time series windowing](https://www.tensorflow.org/guide/data#time_series_windowing) guide and use it in this tutorial.

For further understanding, you may read Chapter 15 of [Hands-on Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/), 2nd Edition and Chapter 6 of [Deep Learning with Python](https://www.manning.com/books/deep-learning-with-python).