In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.contrib import rnn

In [2]:
# --- Logging ---------------------------------------------------------------
# Directory that receives the TensorBoard summaries
LOG_DIR = './logs'

# --- Data shape ------------------------------------------------------------
# Number of time steps the network is unrolled through
TIME_STEPS = 48
# Number of inputs per time step
FEATURE_COUNT = 11

# --- Model -----------------------------------------------------------------
# Hidden units in each LSTM layer
NUM_UNITS = 60

# --- Training --------------------------------------------------------------
TRAINING_STEPS = 10000
BATCH_SIZE = 60
# Learning rate passed to the Adam optimizer
LEARNING_RATE = 0.001

In [3]:
def rnn_data(data, time_steps, labels=False):
    """
    Build sliding-window samples (or their labels) from a data frame.

    Every window covers `time_steps` consecutive rows; the label of a window
    is the row that immediately follows it.

      * example:
        l = [1, 2, 3, 4, 5]
        time_steps = 2
        -> labels == False [[1, 2], [2, 3], [3, 4]]
        -> labels == True [3, 4, 5]

    :param data: pandas DataFrame, one row per observation
    :param time_steps: number of consecutive rows in each window
    :param labels: when True return the row following each window,
                   otherwise return the windows themselves
    :return: numpy array with `len(data) - time_steps` entries (empty when
             the frame has no more than `time_steps` rows)
    """
    # `.values` replaces `DataFrame.as_matrix()`, which no longer exists in
    # current pandas; converting once also avoids a per-window `.iloc` lookup.
    values = data.values
    # The last usable window ends one row before the end of the data, so
    # there are exactly len(data) - time_steps (window, label) pairs.
    n_windows = len(values) - time_steps
    if labels:
        rnn_df = [values[i + time_steps] for i in range(n_windows)]
    else:
        rnn_df = [values[i: i + time_steps] for i in range(n_windows)]
    return np.array(rnn_df)

In [4]:
def split_data(data, val_size=0.1, test_size=0.1):
    """
    Cut `data` into consecutive training, validation and test slices.

    The last `test_size` fraction of the rows becomes the test slice; of the
    rows before it, the last `val_size` fraction becomes the validation slice
    and everything earlier is the training slice.
    """
    test_start = int(round(len(data) * (1 - test_size)))
    rows_before_test = data.iloc[:test_start]
    val_start = int(round(len(rows_before_test) * (1 - val_size)))

    return (
        data.iloc[:val_start],
        data.iloc[val_start:test_start],
        data.iloc[test_start:],
    )

In [5]:
def prepare_data(data, time_steps, labels=False, val_size=0.15, test_size=0.15):
    """
    Split `data` into training, validation and test parts and turn each part
    into `time_steps`-long windows (or their labels) for an lstm cell.

    Returns a 3-tuple ordered (train, validation, test).
    """
    partitions = split_data(data, val_size, test_size)
    windowed = [rnn_data(part, time_steps, labels=labels) for part in partitions]
    return tuple(windowed)

In [6]:
def generic_normalize(x, define_max=0, define_min=999):
    """
    Linearly rescale `x` so that [minimum, maximum] maps onto [-0.5, 0.5].

    :param x: values to scale; must support `.min()`, `.max()` and
              element-wise arithmetic
    :param define_max: upper bound to scale against; the default 0 is a
                       sentinel meaning "use x.max()"
    :param define_min: lower bound to scale against; the default 999 is a
                       sentinel meaning "use x.min()"
    :return: tuple (scaled values, minimum used, maximum - minimum), i.e. the
             values `generic_denormalize` needs to invert the scaling

    A zero span (maximum equal to minimum) divides by zero.
    """
    # Honour an explicitly supplied minimum; it used to be overwritten by
    # x.min() unconditionally, which made `define_min` a no-op.
    if define_min == 999:
        norm_min = x.min()
    else:
        norm_min = define_min
    if define_max == 0:
        norm_max = x.max()
    else:
        norm_max = define_max
    diff = norm_max - norm_min

    return ((x - norm_min) / diff) - 0.5, norm_min, diff

In [7]:
def generic_denormalize(x, denorm_min, diff):
    """
    Undo the scaling of `generic_normalize`.

    Shifts `x` up by 0.5, multiplies by `diff` and adds `denorm_min`.
    """
    shifted = x + 0.5
    stretched = shifted * diff
    return stretched + denorm_min

In [8]:
# Create log directory
# Any existing LOG_DIR is deleted together with its contents first, so the
# directory is empty after this cell has run.
if tf.gfile.Exists(LOG_DIR):
    tf.gfile.DeleteRecursively(LOG_DIR)
tf.gfile.MakeDirs(LOG_DIR)

In [9]:
# Load the raw data.
# The CSV file is expected in a folder named 'data' that sits next to this
# notebook; `file_name` carries the leading path separator.
file_path = 'data'
file_name = '/1114147.csv'
df_raw_data = pd.read_csv(
    ''.join([file_path, file_name]),
    encoding="ISO-8859-1",
    low_memory=False,
)

In [10]:
# Remove the first two columns. Copy the slice so that the column
# assignments below act on an independent frame rather than on a slice of
# df_raw_data (which triggers pandas' SettingWithCopyWarning).
df_inputs = df_raw_data.iloc[:, 2:].copy()
# Change date to a datetime object so that it is easier to manipulate
df_inputs['DATE'] = pd.to_datetime(df_inputs['DATE'])

# Create individual columns for separate date values
# ('year' and 'month' are not part of the column subset selected below)
df_inputs['year'] = df_inputs['DATE'].dt.year
df_inputs['month'] = df_inputs['DATE'].dt.month
# 'day' holds the day of the year, not the day of the month
df_inputs['day'] = df_inputs['DATE'].dt.dayofyear
df_inputs['hour'] = df_inputs['DATE'].dt.hour

# Fill blank cells of columns we will use with zeros
df_inputs['HOURLYWindGustSpeed'] = df_inputs['HOURLYWindGustSpeed'].fillna(0)
df_inputs['HOURLYPrecip'] = df_inputs['HOURLYPrecip'].fillna(0)

# Get a subset of the weather data that corresponds to what is available
# daily without a request
df_inputs = df_inputs[[
    'day',
    'hour',
    'HOURLYWindDirection',
    'HOURLYWindGustSpeed',
    'HOURLYVISIBILITY',
    'HOURLYDRYBULBTEMPF',
    'HOURLYDewPointTempF',
    'HOURLYRelativeHumidity',
    'HOURLYAltimeterSetting',
    'HOURLYSeaLevelPressure',
    'HOURLYPrecip'
]]

# Drop the rows without a complete set of data
df_inputs = df_inputs.dropna().copy()

# Remove extraneous non-digit data.
# The results are assigned back to the frame: calling
# `column.replace(..., inplace=True)` on a column pulled out of the frame is
# not guaranteed to modify the frame itself.
df_inputs['HOURLYWindDirection'] = df_inputs['HOURLYWindDirection'].replace('VRB', -1)
df_inputs['HOURLYVISIBILITY'] = \
    df_inputs['HOURLYVISIBILITY'].replace(['V', 's'], '', regex=True)
df_inputs['HOURLYDRYBULBTEMPF'] = \
    df_inputs['HOURLYDRYBULBTEMPF'].replace(['V', 's'], '', regex=True)
df_inputs['HOURLYDewPointTempF'] = \
    df_inputs['HOURLYDewPointTempF'].replace(['V', 's'], '', regex=True)
df_inputs['HOURLYAltimeterSetting'] = \
    df_inputs['HOURLYAltimeterSetting'].replace(['V', 's'], '', regex=True)
df_inputs['HOURLYSeaLevelPressure'] = \
    df_inputs['HOURLYSeaLevelPressure'].replace(['V', 's'], '', regex=True)
# 'T' entries in the precipitation column are replaced by 0.001
df_inputs['HOURLYPrecip'] = \
    df_inputs['HOURLYPrecip'].replace(['T', 's'], [0.001, ''], regex=True)

# change all values from strings to numbers
df_inputs = df_inputs.apply(pd.to_numeric)
df_inputs = df_inputs.astype('float32')

In [12]:
# Scale every input column with generic_normalize so that Tensorflow gets
# values of comparable magnitude. The minimum and span of each column are
# kept in `<column>_min` / `<column>_diff`.
df_inputs['hour'], hour_min, hour_diff = generic_normalize(df_inputs['hour'])
df_inputs['day'], day_min, day_diff = generic_normalize(df_inputs['day'])
(df_inputs['HOURLYWindDirection'],
 HOURLYWindDirection_min,
 HOURLYWindDirection_diff) = generic_normalize(df_inputs['HOURLYWindDirection'])
(df_inputs['HOURLYWindGustSpeed'],
 HOURLYWindGustSpeed_min,
 HOURLYWindGustSpeed_diff) = generic_normalize(df_inputs['HOURLYWindGustSpeed'])
(df_inputs['HOURLYVISIBILITY'],
 HOURLYVISIBILITY_min,
 HOURLYVISIBILITY_diff) = generic_normalize(df_inputs['HOURLYVISIBILITY'])
(df_inputs['HOURLYDRYBULBTEMPF'],
 HOURLYDRYBULBTEMPF_min,
 HOURLYDRYBULBTEMPF_diff) = generic_normalize(df_inputs['HOURLYDRYBULBTEMPF'])
(df_inputs['HOURLYDewPointTempF'],
 HOURLYDewPointTempF_min,
 HOURLYDewPointTempF_diff) = generic_normalize(df_inputs['HOURLYDewPointTempF'])
(df_inputs['HOURLYRelativeHumidity'],
 HOURLYRelativeHumidity_min,
 HOURLYRelativeHumidity_diff) = generic_normalize(df_inputs['HOURLYRelativeHumidity'])
(df_inputs['HOURLYAltimeterSetting'],
 HOURLYAltimeterSetting_min,
 HOURLYAltimeterSetting_diff) = generic_normalize(df_inputs['HOURLYAltimeterSetting'])
(df_inputs['HOURLYSeaLevelPressure'],
 HOURLYSeaLevelPressure_min,
 HOURLYSeaLevelPressure_diff) = generic_normalize(df_inputs['HOURLYSeaLevelPressure'])
(df_inputs['HOURLYPrecip'],
 HOURLYPrecip_min,
 HOURLYPrecip_diff) = generic_normalize(df_inputs['HOURLYPrecip'])

In [13]:
# Window the normalized frame for Tensorflow: the *_x arrays hold the input
# windows, the *_y arrays the matching labels.
train_x, val_x, test_x = prepare_data(df_inputs, time_steps=TIME_STEPS, labels=False)
train_y, val_y, test_y = prepare_data(df_inputs, time_steps=TIME_STEPS, labels=True)

In [14]:
# Graph inputs.
# x: a batch of input windows, [batch, TIME_STEPS, FEATURE_COUNT]
x = tf.placeholder(tf.float32, shape=[None, TIME_STEPS, FEATURE_COUNT])
# y: the label for each window, [batch, FEATURE_COUNT]
y = tf.placeholder(tf.float32, shape=[None, FEATURE_COUNT])

In [15]:
# Split the [batch_size, n_steps, n_input] input tensor along the time axis
# into a list of TIME_STEPS tensors of shape [batch_size, n_input].
input = tf.unstack(x, num=TIME_STEPS, axis=1)

In [16]:
# define the model
# two LSTM layers with layer normalization
with tf.variable_scope("rnn1"):
    lstm_layer = rnn.LayerNormBasicLSTMCell(NUM_UNITS, forget_bias=1)
    outputs, _ = rnn.static_rnn(lstm_layer, input, dtype="float32")
with tf.variable_scope("rnn2"):
    # the second layer consumes the per-step outputs of the first one
    lstm_layer2 = rnn.LayerNormBasicLSTMCell(NUM_UNITS, forget_bias=1)
    outputs, _ = rnn.static_rnn(lstm_layer2, outputs, dtype="float32")
# fully connected layer that maps the last time step's output to one value
# per feature
with tf.variable_scope("fc1"):
    # Xavier-initialized weights of shape [NUM_UNITS, FEATURE_COUNT]
    out_weights = tf.get_variable("out_weights", shape=[NUM_UNITS, FEATURE_COUNT],
                                  initializer=tf.contrib.layers.xavier_initializer())
    # One bias per output feature. A [NUM_UNITS, FEATURE_COUNT] bias only
    # broadcasts against the [batch, FEATURE_COUNT] product when the batch
    # size equals NUM_UNITS, and it adds a different bias to every batch row.
    out_bias = tf.get_variable("out_bias", shape=[FEATURE_COUNT],
                               initializer=tf.zeros_initializer())
    prediction = tf.matmul(outputs[-1], out_weights) + out_bias

In [17]:
# loss function
with tf.name_scope("loss_function") as scope:
    # Mean squared error over every element of the batch.
    # tf.nn.l2_loss already reduces to a single scalar (sum(t ** 2) / 2), so
    # wrapping it in reduce_mean averaged nothing and the reported loss grew
    # with the batch size.
    loss = tf.reduce_mean(tf.square(prediction - y))
    tf.summary.scalar("loss_function", loss)

In [18]:
# Training op: one Adam update that minimizes `loss`
with tf.name_scope("train") as scope:
    adam_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    opt = adam_optimizer.minimize(loss)

In [19]:
# initialize variables
# These only build the initializer ops for the graph's global and local
# variables; the training cell runs them inside its session.
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()

In [20]:
# Merge all the summaries for Tensorboard
# A single op that evaluates every summary registered so far; its result is
# what gets passed to the summary writer during training.
merged_summary_op = tf.summary.merge_all()

In [21]:
def _paired_views(features, labels):
    """
    Pack `features` and `labels` into one 2-D array and return views into it.

    Each row of the combined array holds one flattened sample followed by its
    flattened label. The two returned views have the shapes of `features` and
    `labels`; shuffling the rows of the combined array in place therefore
    reorders samples and labels together.

    :return: tuple (combined, features_view, labels_view)
    """
    n_feature_cols = features.size // len(features)
    combined = np.c_[features.reshape(len(features), -1),
                     labels.reshape(len(labels), -1)]
    features_view = combined[:, :n_feature_cols].reshape(features.shape)
    # The label columns START where the feature columns end. Slicing from
    # column 0 would return the beginning of each flattened sample instead of
    # its label.
    labels_view = combined[:, n_feature_cols:].reshape(labels.shape)
    return combined, features_view, labels_view


# Create shuffleable dataset that we can batch sample from
c_t, x_t, y_t = _paired_views(train_x, train_y)

# Validation data
c_v, x_v, y_v = _paired_views(val_x, val_y)

# Testing data
c_tt, x_tt, y_tt = _paired_views(test_x, test_y)

In [22]:

with tf.Session() as sess:
    sess.run(init_g)
    sess.run(init_l)
    summary_writer = tf.summary.FileWriter(LOG_DIR, graph=sess.graph)
    try:
        # Run exactly TRAINING_STEPS optimizer updates (steps are numbered
        # from 1 so the summary step matches the number of updates done).
        for step in range(1, TRAINING_STEPS + 1):
            # Shuffle the combined training array in place; x_t and y_t are
            # built from it, so their first BATCH_SIZE rows form a random batch.
            np.random.shuffle(c_t)
            train_feed = {x: x_t[:BATCH_SIZE], y: y_t[:BATCH_SIZE]}

            # Optimize the model
            sess.run(opt, feed_dict=train_feed)

            # Get summary data points for analysis in Tensorboard
            if step % 10 == 0:
                summary_str = sess.run(merged_summary_op, feed_dict=train_feed)
                summary_writer.add_summary(summary_str, step)
    finally:
        # Flush pending events and release the event file even when training
        # is interrupted.
        summary_writer.close()

In [None]:
# Column labels of the raw frame
df_raw_data.keys()

In [None]:
# First rows of the raw frame
df_raw_data.head()

In [None]:
# Same column listing as two cells above
df_raw_data.keys()

In [None]:
# All columns except the first two. Copied because later cells assign
# columns on this frame; assigning on a bare slice of df_raw_data triggers
# pandas' SettingWithCopyWarning.
df_num_subset = df_raw_data.iloc[:, 2:].copy()

In [None]:
# Parse the DATE column into datetime values
df_num_subset.DATE = pd.to_datetime(df_num_subset.DATE)

In [None]:
df_num_subset.DATE

In [None]:
# Trying to put the years more middle of the road here
# Scaled date parts: year offset from 2000 divided by 20, month divided by 12,
# day of month divided by 31, and minutes since midnight divided by 1440
# (the number of minutes in a day).
df_num_subset['year_norm'] = (df_num_subset.DATE.dt.year - 2000) / 20
df_num_subset['month_norm'] = df_num_subset.DATE.dt.month / 12
df_num_subset['day_norm'] = df_num_subset.DATE.dt.day / 31
df_num_subset['time_norm'] = ((df_num_subset.DATE.dt.hour * 60) + df_num_subset.DATE.dt.minute) / 1440
df_num_subset['time_norm']

In [None]:
# Divide latitude by 90. The column is overwritten, so running this cell
# again divides the values a second time.
df_num_subset.LATITUDE = df_num_subset.LATITUDE / 90
df_num_subset.LATITUDE

In [None]:
# Divide longitude by 180. The column is overwritten, so running this cell
# again divides the values a second time.
df_num_subset.LONGITUDE = df_num_subset.LONGITUDE / 180
df_num_subset.LONGITUDE

In [None]:
# Distinct values of the REPORTTPYE column (the name is used with this
# spelling elsewhere in the notebook as well)
df_num_subset.REPORTTPYE.unique()

In [None]:
tensorboard --logdir=/tmp/tensorflow/mnist/logs

In [None]:
df_num_subset.head()

In [None]:
df_num_subset.keys()

In [None]:
df_raw_data.keys()

In [None]:
# Drop the station/report identifiers, the sky-condition, weather-type and
# pressure-change columns and the listed DAILY* columns, then keep only the
# first 21 of the remaining columns.
df2 = df_raw_data.drop(['STATION', 'STATION_NAME', 'REPORTTPYE', 'HOURLYSKYCONDITIONS', 'HOURLYPRSENTWEATHERTYPE',
                        'HOURLYPressureTendency', 'HOURLYPressureChange',
                        'DAILYMaximumDryBulbTemp', 'DAILYMinimumDryBulbTemp', 'DAILYAverageDryBulbTemp',
                        'DAILYDeptFromNormalAverageTemp', 'DAILYAverageRelativeHumidity', 'DAILYAverageDewPointTemp',
                        'DAILYAverageWetBulbTemp', 'DAILYHeatingDegreeDays', 'DAILYCoolingDegreeDays'], axis=1)
df2 = df2.iloc[:, :21]

In [None]:
df2.keys()

In [None]:
df2.head()

In [None]:
# Missing wind gust speeds become 0
df2.HOURLYWindGustSpeed = df2.HOURLYWindGustSpeed.fillna(0)

In [None]:
# Parse the DATE column into datetime values
df2.DATE = pd.to_datetime(df2.DATE)

In [None]:
df2.DATE

In [None]:
# Keep only the rows that have no missing value in any column
df3 = df2.dropna()

In [None]:
# Element counts (rows x columns) after and before dropping incomplete rows
df3.size

In [None]:
df2.size

In [None]:
df3.keys()

In [None]:
df3.DATE

In [None]:
df3.head()

In [None]:
x = pd.to_datetime(df3.DATE)
x = x.dt.hour

In [None]:
x.unique()

In [None]:
x.asfreq()

In [None]:
x.value_counts()

In [None]:
df3.keys()

In [None]:
# Parse DATE and add its year, month, day-of-month and hour as separate
# columns (no scaling is applied in this cell)
df3.DATE = pd.to_datetime(df3.DATE)
df3['year'] = df3.DATE.dt.year
df3['month'] = df3.DATE.dt.month
df3['day'] = df3.DATE.dt.day
df3['hour'] = df3.DATE.dt.hour

In [None]:
df3.keys()

In [None]:
# Drop the first four columns. df3 is rebound to the result, so running this
# cell again drops four more columns.
df3 = df3.iloc[:, 4:]

In [None]:
df3.keys()

In [None]:
# Select the date parts and the hourly measurement columns listed below
df_go_time = df3[[
    'year', 
    'month',
    'day',
    'hour',
    'HOURLYWindDirection',
    'HOURLYWindGustSpeed',
    'HOURLYVISIBILITY',
    'HOURLYDRYBULBTEMPF',
    'HOURLYWETBULBTEMPF',
    'HOURLYDewPointTempF',
    'HOURLYRelativeHumidity',
    'HOURLYAltimeterSetting',
    'HOURLYSeaLevelPressure']]

In [None]:
df_go_time.keys()

In [None]:
# Element count (rows x columns) of the selection
df_go_time.size