Imports

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import numpy as np
import pandas as pd
import time

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

## Baseline LSTM + room to change network architecture

In [3]:
def handle_wind_dir(data):
    '''
    Different ways to handle the string format:
     1. Drop it
     2. LabelEncode it
     3. One-hot encode it
     
    The reason I need to handle it is because Keras' pad_sequences function takes int() of
    all the columns.
    '''
    # I'll drop it first
    data = data.drop('wnd_dir', axis=1, inplace=False)
    return data

In [4]:
import warnings; warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('../data/pollution.csv', header=0, index_col=0)

In [6]:
# Optionally drop wind dir
df = handle_wind_dir(df)

In [7]:
train, test = train_test_split(df, test_size=.998, random_state=789)

In [8]:
len(train)

87

In [9]:
len(test)

43713

In [339]:
def prepare_sequences(df):
    ''' 
    Let df = train and prepare sequences.
    '''
    input_cols = list(df.columns)
    df['single_input_vector'] = df[input_cols].apply(tuple, axis=1).apply(list)\
        .apply(lambda x: [list(x)])
    return df.reset_index(drop=True)

In [340]:
train = prepare_sequences(train)

In [341]:
train['cumulative_input_vectors'] = train.single_input_vector.cumsum()

In [342]:
train.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain,single_input_vector,cumulative_input_vectors
0,16.0,10,14.0,1020.0,18.32,0,0,"[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0]]"
1,12.0,-11,2.0,1025.0,109.07,0,0,"[[12.0, -11.0, 2.0, 1025.0, 109.07, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ..."
2,0.0,17,21.0,1012.0,48.27,0,0,"[[0.0, 17.0, 21.0, 1012.0, 48.27, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ..."
3,68.0,17,19.0,1006.0,3.58,0,0,"[[68.0, 17.0, 19.0, 1006.0, 3.58, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ..."
4,0.0,-19,5.0,1017.0,32.62,0,0,"[[0.0, -19.0, 5.0, 1017.0, 32.62, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ..."


In [343]:
# predicting G for now, just a test example
# If your output is multi-dimensional, you need to capture those 
# dimensions in one object
# If your output is a single dimension, this step may be unnecessary
def set_output_featureset(df, original_df):
    '''
    Let df=train and prepare output vector
    '''
    output_cols = list(original_df.columns)
    df['output_vector'] = df[output_cols].apply(tuple, axis=1).apply(list)
    return df

In [344]:
train = set_output_featureset(train, df)

In [345]:
train.head()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain,single_input_vector,cumulative_input_vectors,output_vector
0,16.0,10,14.0,1020.0,18.32,0,0,"[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0]]","[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0]"
1,12.0,-11,2.0,1025.0,109.07,0,0,"[[12.0, -11.0, 2.0, 1025.0, 109.07, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ...","[12.0, -11.0, 2.0, 1025.0, 109.07, 0.0, 0.0]"
2,0.0,17,21.0,1012.0,48.27,0,0,"[[0.0, 17.0, 21.0, 1012.0, 48.27, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ...","[0.0, 17.0, 21.0, 1012.0, 48.27, 0.0, 0.0]"
3,68.0,17,19.0,1006.0,3.58,0,0,"[[68.0, 17.0, 19.0, 1006.0, 3.58, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ...","[68.0, 17.0, 19.0, 1006.0, 3.58, 0.0, 0.0]"
4,0.0,-19,5.0,1017.0,32.62,0,0,"[[0.0, -19.0, 5.0, 1017.0, 32.62, 0.0, 0.0]]","[[16.0, 10.0, 14.0, 1020.0, 18.32, 0.0, 0.0], ...","[0.0, -19.0, 5.0, 1017.0, 32.62, 0.0, 0.0]"


In [346]:
# Pad my sequences with 0's so each element in the series has the same 
# length.
def pad_my_sequences(df):
    '''
    Let df=train and pad sequences
    '''
    max_sequence_length = df.cumulative_input_vectors.apply(len).max()
    padded_sequences = pad_sequences(df.cumulative_input_vectors,
                               maxlen=max_sequence_length,
                                    value=np.zeros(7)).tolist()

    input_vecs = pd.Series(padded_sequences).apply(np.asarray).copy()
    df['padded_input_vectors'] = input_vecs
    return df

In [347]:
max_sequence_length = train.cumulative_input_vectors.apply(len).max()

In [348]:
def func(x, maxlen=None):
    '''
    Pad sequences with lists of 0s. Functional lambda programming.
    '''
    zeros_to_add = maxlen - len(x)
    prepended = [np.zeros(7).tolist()]
    y = prepended*zeros_to_add + x
    return y

In [349]:
train.padded_input_vectors = train.cumulative_input_vectors.apply(lambda x: func(x, max_sequence_length))

In [350]:
# Extract your training data
X_train_init = np.asarray(train.padded_input_vectors)
# Training data for LSTM should be in the form of a 3D tuple:
#   (# of samples, timesteps, input_dim)
# Note that the input data that comes out of the dataframe 
# will not make a 3D array. It makes an array of arrays, 
# which is not the same thing.
# So far X_train_init is an array of arrays.

# Convert to 3D vector usng hstack and reshape
# horizontal stack = hstack, essentially removes the outer array 
#  encapsulation.

# reshape into (# of records, total_timesteps, input_dim)
# The reshape essentially reshaped the inner list into an 11 by 6 matrix,
#  or a max_sequence_length (rows) by input_dim (col) matrix
X_train = np.hstack(X_train_init).reshape(len(train), 
                                          max_sequence_length,
                                         len(df.columns))
y_train_init = np.asarray(train.output_vector)
y_train = np.hstack(y_train_init).reshape(len(train), len(df.columns))

In [314]:
print(X_train.shape)
print(y_train.shape)

(87, 87, 7)
(87, 7)


In [315]:
# Get your input dimensions
# Input length is the length for one input sequence 
#  (i.e. the number of rows for your sample, which is
#     the max_sequence_length by construction)
input_length = X_train.shape[1]
# Input dim is the number of dimensions in one input vector 
#  (i.e. number of input columns)
input_dim = X_train.shape[2]
output_dim = len(y_train[0])

In [355]:
output_dim

7

In [326]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Dense

# Initialize the model
model = Sequential()

# arbitrarily picked the output dim to be 100
model.add(LSTM(100, input_shape=(input_length, input_dim)))
# The max output value is > 1 so used relu as final activation
model.add(Dense(output_dim, activation='relu'))

model.compile(loss='mean_squared_error',
             optimizer='adam',
             metrics=['accuracy'])


In [327]:
# Set batch_size to 30 to show that it doesn't have to be a factor 
# or multiple of your sample size
history = model.fit(X_train, y_train,
                   batch_size = 30, epochs=50,
                   verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [328]:
model.predict(X_train)

array([[  6.35896730e+00,   7.97015727e-02,   0.00000000e+00,
          8.13524055e+00,   5.88157415e+00,   0.00000000e+00,
          0.00000000e+00],
       [  6.33025789e+00,   2.58132249e-01,   0.00000000e+00,
          8.14544964e+00,   5.60397673e+00,   0.00000000e+00,
          0.00000000e+00],
       [  6.34985209e+00,   2.62183487e-01,   0.00000000e+00,
          8.16323662e+00,   5.60958290e+00,   0.00000000e+00,
          0.00000000e+00],
       [  6.35255146e+00,   2.62741566e-01,   0.00000000e+00,
          8.16568661e+00,   5.61035538e+00,   0.00000000e+00,
          0.00000000e+00],
       [  6.35291815e+00,   2.62817293e-01,   0.00000000e+00,
          8.16601944e+00,   5.61046028e+00,   0.00000000e+00,
          0.00000000e+00],
       [  6.22186041e+00,   3.04498151e-03,   0.00000000e+00,
          8.03037357e+00,   5.32202387e+00,   0.00000000e+00,
          8.38674456e-02],
       [  6.22186708e+00,   3.04635242e-03,   0.00000000e+00,
          8.03038025e+00,   5.32

In [329]:
y_train

array([[  1.60000000e+01,   1.00000000e+01,   1.40000000e+01,
          1.02000000e+03,   1.83200000e+01,   0.00000000e+00,
          0.00000000e+00],
       [  1.20000000e+01,  -1.10000000e+01,   2.00000000e+00,
          1.02500000e+03,   1.09070000e+02,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,   1.70000000e+01,   2.10000000e+01,
          1.01200000e+03,   4.82700000e+01,   0.00000000e+00,
          0.00000000e+00],
       [  6.80000000e+01,   1.70000000e+01,   1.90000000e+01,
          1.00600000e+03,   3.58000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,  -1.90000000e+01,   5.00000000e+00,
          1.01700000e+03,   3.26200000e+01,   0.00000000e+00,
          0.00000000e+00],
       [  3.90000000e+02,  -8.00000000e+00,   0.00000000e+00,
          1.03100000e+03,   8.90000000e-01,   0.00000000e+00,
          0.00000000e+00],
       [  6.00000000e+00,   1.00000000e+00,   2.00000000e+00,
          1.02800000e+03,   8.90