# Encoder Decoder; Min gas thresholded

We are training a separate model for each timestep in the lookahead window.

In [52]:
import pandas as pd
import numpy as np
from numpy import array
from numpy import mean
from numpy import std
import keras
from keras.models import Sequential
from tensorflow.keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [53]:
def split_sequence(sequence, n_steps_in, n_steps_out, step_interval, n_step_lookahead):
    """Cut a sequence into paired (input window, target window) examples.

    Window ``k`` starts at ``k * step_interval`` and covers ``n_steps_in``
    items. Its target covers ``n_steps_out`` items and begins
    ``n_step_lookahead - 1`` positions after the input window ends, so a
    lookahead of 1 places the target immediately after the input.
    Collection stops at the first window whose target would extend past the
    end of ``sequence``.

    Returns a tuple ``(X, y)`` of numpy arrays built from the collected input
    and target slices.
    """
    total_len = len(sequence)
    windows_in, windows_out = [], []
    for k in range(int(total_len / step_interval)):
        start = k * step_interval
        in_stop = start + n_steps_in
        out_start = in_stop + n_step_lookahead - 1
        out_stop = out_start + n_steps_out
        if out_stop > total_len:
            # the target window would run off the end of the data
            break
        windows_in.append(sequence[start:in_stop])
        windows_out.append(sequence[out_start:out_stop])
    return array(windows_in), array(windows_out)

In [54]:
# Demonstrate split_sequence on a toy sequence: each 5-step input window is
# paired with the single value that lies 5 positions after the window ends.
# (A dead `n_steps_in = 1` assignment that was immediately overwritten has
# been removed.)
# NOTE: these are notebook-level names; the configuration cell further down
# assigns the values actually used for training.
sequence = range(0, 13)
n_steps_in = 5
n_steps_out = 1
step_interval = 1
n_step_lookahead = 5
split_sequence(sequence, n_steps_in, n_steps_out, step_interval, n_step_lookahead)

(array([[0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6],
        [3, 4, 5, 6, 7]]),
 array([[ 9],
        [10],
        [11],
        [12]]))

In [55]:
def outliers_thresholded(feature, data):
    """Clamp one column of ``data`` to its mean +/- 2 standard deviations.

    Parameters
    ----------
    feature : label of the column to clamp.
    data : pandas DataFrame that contains ``feature``.

    Returns
    -------
    The same ``data`` object, with ``data[feature]`` limited to
    ``[mean - 2*std, mean + 2*std]``. Values already inside that band are
    left as they are.
    """
    column = data[feature]
    # numpy's mean/std imported at the top of the notebook (std uses ddof=0)
    centre, spread = mean(column), std(column)
    cut_off = spread * 2
    lower, upper = centre - cut_off, centre + cut_off
    # Assign the clipped column back explicitly. Writing to the row objects
    # yielded by DataFrame.iterrows() is not guaranteed to modify the frame
    # (pandas documents that the rows may be copies), so a row-by-row update
    # can silently leave the outliers in place.
    data[feature] = column.clip(lower=lower, upper=upper)
    return data

Load data, datetime to index, downsample with left edge label, convert wei to gwei

In [153]:
def generate_training_val_examples(csv_path=r'C:\Users\conal\Desktop\MCM\Practicum\data\ETH,gas,usage merged 11-26 to 01-26.csv'):
    """Build thresholded, min-max scaled train/validation examples.

    Reads the merged CSV, resamples it, adds a lagged min-gas-price column,
    clamps outliers in every input column, min-max scales the inputs and
    windows the result with ``split_sequence``. The first 70% of the windows
    (in order) form the training set and the rest the validation set.

    Reads these notebook-level names: ``resample_rate``, ``inputs``,
    ``start_date``, ``end_date``, ``n_steps_in``, ``n_steps_out``,
    ``step_interval`` and ``n_step_lookahead``.

    Parameters
    ----------
    csv_path : path of the CSV to load. Defaults to the location that was
        previously hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    ``(X_train, y_train, X_val, y_val)``; X arrays are reshaped to
    ``(examples, n_steps_in, len(inputs))`` and y arrays to
    ``(examples, n_steps_out, 1)``.
    """
    # Load data as float, with the datetime column as the index
    data = pd.read_csv(csv_path, header=0)
    data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')
    data = data.set_index('datetime')
    # squeeze() only has an effect when an axis has length 1
    data = data.squeeze()
    data = data.astype('float')

    # Resample; each bin's mean is labelled with the bin's left edge
    data = data.resample(resample_rate).mean()

    # Add 24hr lag for min gas price.
    # NOTE(review): 288 rows equal 24 hours only at a 5-minute resample rate;
    # this assumes resample_rate == '5T'.
    data['min_gas_price_24hr_lag'] = data['min_gas_price'].shift(288)
    # Drop the first 288 rows (they have no lag value); copy so later column
    # assignments act on an independent frame rather than a slice.
    data = data[288:].copy()

    # Clamp outliers in every input column
    for feature in inputs:
        data = outliers_thresholded(feature, data)

    # Keep only the input columns, then scale them. The explicit copy avoids
    # assigning into a slice of another DataFrame (SettingWithCopyWarning).
    # NOTE(review): the clamp limits and the scaler are computed on the whole
    # frame, before the train/validation split below.
    data = data[inputs].copy()
    scaler = MinMaxScaler()  # local only; it is not returned to the caller
    data[inputs] = scaler.fit_transform(data[inputs])

    # Create input:output examples
    data = data[start_date:end_date].to_numpy()
    X, y = split_sequence(data, n_steps_in, n_steps_out, step_interval, n_step_lookahead)
    # Only the first column of `inputs` is used as the forecast target
    y = y[:, :, :1]

    # 70/30 split in window order, using one shared index for X and y
    split_at = int(0.7 * len(X))
    X_train, X_val = np.split(X, [split_at])
    y_train, y_val = np.split(y, [split_at])

    # Reshape to 3D for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(inputs)))
    y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], len(inputs)))
    y_val = y_val.reshape((y_val.shape[0], y_val.shape[1], 1))

    return X_train, y_train, X_val, y_val



In [154]:
#Create Training Examples for all lookaheads
# Notebook-level configuration read by the generate_training_val_examples*
# functions and by the model-building cell.
# Resampling frequency passed to DataFrame.resample ('5T' = 5-minute bins)
resample_rate = '5T'
# Bounds of the datetime-index slice taken before windowing
start_date='2021-11-26 00:00:00'
end_date='2022-01-26 23:55:00' 
# Model input columns; the first entry is the one kept as the forecast target
inputs = ['min_gas_price', 'max_gas_price', 'min_gas_price_24hr_lag', 'Open']
#No of timesteps behind to forecast on, no of timesteps to forecast ahead
# (at 5-minute bins: 2016 steps = 7 days of history, 12 steps = 1 hour ahead)
n_steps_in =  2016
n_steps_out = 12
#How many timesteps between start of training examples
step_interval = 1
# 1 = the target window starts immediately after the input window
n_step_lookahead = 1

In [155]:
# Build the thresholded, min-max scaled train/validation arrays from the
# configuration set in the previous cell.
X_train, y_train, X_val, y_val = generate_training_val_examples()

In [156]:
# Encoder-decoder LSTM: a five-layer encoder compresses the input window to a
# single vector, RepeatVector copies it once per output step, and a four-layer
# decoder plus a per-timestep Dense(1) produces the forecast.
model = Sequential()
# Encoder. Only the first layer needs input_shape; later layers infer their
# input from the layer before them. The removed input_shape arguments also
# declared len(inputs) features where the real input is 128 wide.
model.add(LSTM(128, activation='tanh', input_shape=(n_steps_in, len(inputs)), return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
# The last encoder layer returns only its final state (no return_sequences)
model.add(LSTM(128, activation='tanh'))
model.add(RepeatVector(n_steps_out))
# Decoder
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(LSTM(128, activation='tanh', return_sequences=True))
model.add(TimeDistributed(Dense(units=1)))
model.compile(optimizer='adam', loss='mse')
# Keep the History object so the loss curves can be inspected afterwards.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt
# during epoch 8, and no random seed is set, so results are not reproducible.
history = model.fit(X_train, y_train, epochs=10, verbose=1, batch_size=100, validation_data=(X_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 11/109 [==>...........................] - ETA: 51s - loss: 0.0154

KeyboardInterrupt: 

In [123]:
# Persist the model under 'encoder_decoder_12_thresholded'.
# NOTE(review): this cell's execution count (123) is lower than the training
# cell's (156), so the saved artefact may not be the model trained above —
# re-run after training to be sure.
model.save('encoder_decoder_12_thresholded')




INFO:tensorflow:Assets written to: encoder_decoder_12_thresholded\assets


INFO:tensorflow:Assets written to: encoder_decoder_12_thresholded\assets


## Evaluation metrics


In [81]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error



In [157]:
# Forecast every validation window with the current in-memory model.
# NOTE(review): the recorded training run was interrupted, so these
# predictions may come from a partially trained model.
yhat = model.predict(X_val, verbose=1)



In [158]:
def return_metrics(y_val, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_val : true values, array-like accepted by scikit-learn metrics.
    y_pred : predicted values with the same shape as ``y_val``.

    Returns
    -------
    ``(RMSE, MAE, MAPE, R2, MSE)`` in that order.
    """
    # One per-output MSE call feeds both MSE and RMSE. Averaging afterwards
    # matches scikit-learn's default 'uniform_average' aggregation, and avoids
    # the `squared=False` argument, which is deprecated in recent releases.
    per_output_mse = mean_squared_error(y_val, y_pred, multioutput='raw_values')
    MSE = np.average(per_output_mse)
    RMSE = np.average(np.sqrt(per_output_mse))
    MAE = mean_absolute_error(y_val, y_pred)
    MAPE = mean_absolute_percentage_error(y_val, y_pred)
    R2 = r2_score(y_val, y_pred)
    return RMSE, MAE, MAPE, R2, MSE

In [162]:
# Re-create the min-max scaler at notebook level so predictions can be
# transformed back to original units. generate_training_val_examples() keeps
# its scaler local, so the same preprocessing steps are repeated here.

# Load data as float, datetime to index
data = pd.read_csv (r'C:\Users\conal\Desktop\MCM\Practicum\data\ETH,gas,usage merged 11-26 to 01-26.csv', header=0)
data['datetime'] = pd.to_datetime(data['datetime'], format = '%Y-%m-%d %H:%M:%S')
data = data.set_index('datetime')
data = data.squeeze()
data = data.astype('float')

# Resample; each bin's mean is labelled with the bin's left edge
data = data.resample(resample_rate).mean()

# Add 24hr lag for min gas price (288 rows = 24 hours at 5-minute bins)
data['min_gas_price_24hr_lag'] = data['min_gas_price'].shift(288)
# Copy so the column assignments below act on an independent frame
data = data[288:].copy()

# Set outlier limit
for i in inputs:
    data = outliers_thresholded(i, data)

# Filter inputs, scale. The explicit copy avoids assigning into a slice of
# another DataFrame (SettingWithCopyWarning).
data = data[inputs].copy()
scaler = MinMaxScaler()
data[inputs] = scaler.fit_transform(data[inputs])
    

In [165]:
# Per-window error metrics in original units, for thresholded targets.
# The scaler was fit on len(inputs) columns, so each single-column window is
# tiled across that many columns, inverse-transformed, and only the first
# column (the forecast target) is kept.
# NOTE(review): R2 is computed per n_steps_out-long window; the mean_R2
# printed below (about -3.9e26) suggests some windows make it blow up.
RMSE_list, MAE_list, MAPE_list, R2_list, MSE_list = [], [], [], [], []
for i in range(len(y_val)):
    pred_tiled = array([yhat[i]] * len(inputs)).transpose()[0]
    val_tiled = array([y_val[i]] * len(inputs)).transpose()[0]
    pred_descaled = scaler.inverse_transform(pred_tiled)[:, :1]
    val_descaled = scaler.inverse_transform(val_tiled)[:, :1]
    RMSE, MAE, MAPE, R2, MSE = return_metrics(val_descaled, pred_descaled)
    for bucket, value in zip(
        (RMSE_list, MAE_list, MAPE_list, R2_list, MSE_list),
        (RMSE, MAE, MAPE, R2, MSE),
    ):
        bucket.append(value)

In [168]:
# Average each per-window metric over all validation windows
print('mean_RMSE', mean(RMSE_list))
print('mean_MAE', mean(MAE_list))
print('mean_MAPE', mean(MAPE_list))
print('mean_R2', mean(R2_list))



mean_RMSE 33.08741830117068
mean_MAE 29.753615534783247
mean_MAPE 0.23471605569308118
mean_R2 -3.884154610419752e+26


In [None]:
def outliers_thresholded(feature, data):
    """Clamp one column of ``data`` to its mean +/- 2 standard deviations.

    NOTE: this cell redefines a function that is also defined earlier in the
    notebook; when executed, this definition shadows the earlier one.

    Parameters
    ----------
    feature : label of the column to clamp.
    data : pandas DataFrame that contains ``feature``.

    Returns
    -------
    The same ``data`` object, with ``data[feature]`` limited to
    ``[mean - 2*std, mean + 2*std]``. Values already inside that band are
    left as they are.
    """
    column = data[feature]
    # numpy's mean/std imported at the top of the notebook (std uses ddof=0)
    centre, spread = mean(column), std(column)
    cut_off = spread * 2
    lower, upper = centre - cut_off, centre + cut_off
    # Assign the clipped column back explicitly. Writing to the row objects
    # yielded by DataFrame.iterrows() is not guaranteed to modify the frame
    # (pandas documents that the rows may be copies), so a row-by-row update
    # can silently leave the outliers in place.
    data[feature] = column.clip(lower=lower, upper=upper)
    return data

In [102]:
def generate_training_val_examples_no_thresh(csv_path=r'C:\Users\conal\Desktop\MCM\Practicum\data\ETH,gas,usage merged 11-26 to 01-26.csv'):
    """Build train/validation examples with no outlier clamping and no scaling.

    Reads the merged CSV, resamples it, adds a lagged min-gas-price column,
    keeps the input columns and windows them with ``split_sequence``. The
    first 70% of the windows (in order) form the training set and the rest
    the validation set. Values stay in the units found in the CSV.

    Reads these notebook-level names: ``resample_rate``, ``inputs``,
    ``start_date``, ``end_date``, ``n_steps_in``, ``n_steps_out``,
    ``step_interval`` and ``n_step_lookahead``.

    Parameters
    ----------
    csv_path : path of the CSV to load. Defaults to the location that was
        previously hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    ``(X_train, y_train, X_val, y_val)``; X arrays are reshaped to
    ``(examples, n_steps_in, len(inputs))`` and y arrays to
    ``(examples, n_steps_out, 1)``.
    """
    # Load data as float, with the datetime column as the index
    data = pd.read_csv(csv_path, header=0)
    data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')
    data = data.set_index('datetime')
    # squeeze() only has an effect when an axis has length 1
    data = data.squeeze()
    data = data.astype('float')

    # Resample; each bin's mean is labelled with the bin's left edge
    data = data.resample(resample_rate).mean()

    # Add 24hr lag for min gas price.
    # NOTE(review): 288 rows equal 24 hours only at a 5-minute resample rate;
    # this assumes resample_rate == '5T'.
    data['min_gas_price_24hr_lag'] = data['min_gas_price'].shift(288)
    data = data[288:]

    # Keep only the input columns (no thresholding, no scaling)
    data = data[inputs]

    # Create input:output examples
    data = data[start_date:end_date].to_numpy()
    X, y = split_sequence(data, n_steps_in, n_steps_out, step_interval, n_step_lookahead)
    # Only the first column of `inputs` is used as the forecast target
    y = y[:, :, :1]

    # 70/30 split in window order, using one shared index for X and y
    split_at = int(0.7 * len(X))
    X_train, X_val = np.split(X, [split_at])
    y_train, y_val = np.split(y, [split_at])

    # Reshape to 3D for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(inputs)))
    y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], len(inputs)))
    y_val = y_val.reshape((y_val.shape[0], y_val.shape[1], 1))

    return X_train, y_train, X_val, y_val

In [103]:
# Build the unthresholded, unscaled examples; y_val2 serves as the reference
# for scoring predictions against the original targets.
X_train2, y_train2, X_val2, y_val2 = generate_training_val_examples_no_thresh() 

In [124]:
# Per-window error metrics against the unthresholded, unscaled targets.
# Predictions are inverse-scaled with the same tiling trick (the scaler was
# fit on len(inputs) columns) and only the first column is kept.
# NOTE(review): the loop length comes from y_val while the targets come from
# y_val2 — this assumes both have the same number of windows.
# NOTE(review): the 1000000000 factor is presumably a unit conversion tied to
# the scaler that was in memory when this cell ran; confirm it still applies.
RMSE_list, MAE_list, MAPE_list, R2_list, MSE_list = [], [], [], [], []
for i in range(len(y_val)):
    pred_tiled = array([yhat[i]] * len(inputs)).transpose()[0]
    pred_descaled = scaler.inverse_transform(pred_tiled)[:, :1]
    RMSE, MAE, MAPE, R2, MSE = return_metrics(y_val2[i], pred_descaled * 1000000000)
    for bucket, value in zip(
        (RMSE_list, MAE_list, MAPE_list, R2_list, MSE_list),
        (RMSE, MAE, MAPE, R2, MSE),
    ):
        bucket.append(value)

In [105]:
# Average each per-window metric over all validation windows
print('mean_RMSE', mean(RMSE_list))
print('mean_MAE', mean(MAE_list))
print('mean_MAPE', mean(MAPE_list))
print('mean_R2', mean(R2_list))

mean_RMSE 34.61227458614174
mean_MAE 27.160770241910676
mean_MAPE 0.17230442706705165
mean_R2 -1.9373244759557138


In [121]:
# Largest target value in the unthresholded validation set
y_val2.max()

1173.8338214285714

In [137]:
# Spot check: inverse-scale one thresholded validation window (index 8) by
# tiling it across len(inputs) columns and keeping the first column, then
# show its maximum.
# NOTE(review): the 1000000000 factor is presumably a unit conversion tied to
# the scaler that was in memory when this cell ran; confirm it still applies.
val_descaled= (scaler.inverse_transform(array([y_val[8],]*(len(inputs))).transpose()[0]))[:, :1]
(val_descaled*1000000000).max()

241.0422666666666

In [None]:
def outliers_thresholded(feature, data):
    """Clamp one column of ``data`` to its mean +/- 2 standard deviations.

    NOTE: this cell redefines a function that is also defined earlier in the
    notebook; when executed, this definition shadows the earlier ones.

    Parameters
    ----------
    feature : label of the column to clamp.
    data : pandas DataFrame that contains ``feature``.

    Returns
    -------
    The same ``data`` object, with ``data[feature]`` limited to
    ``[mean - 2*std, mean + 2*std]``. Values already inside that band are
    left as they are.
    """
    column = data[feature]
    # numpy's mean/std imported at the top of the notebook (std uses ddof=0)
    centre, spread = mean(column), std(column)
    cut_off = spread * 2
    lower, upper = centre - cut_off, centre + cut_off
    # Assign the clipped column back explicitly. Writing to the row objects
    # yielded by DataFrame.iterrows() is not guaranteed to modify the frame
    # (pandas documents that the rows may be copies), so a row-by-row update
    # can silently leave the outliers in place.
    data[feature] = column.clip(lower=lower, upper=upper)
    return data

In [150]:
def generate_training_val_examples_no_min_max(csv_path=r'C:\Users\conal\Desktop\MCM\Practicum\data\ETH,gas,usage merged 11-26 to 01-26.csv'):
    """Build thresholded but unscaled train/validation examples.

    Reads the merged CSV, resamples it, adds a lagged min-gas-price column,
    clamps outliers in every input column and windows the result with
    ``split_sequence``. No min-max scaling is applied. The first 70% of the
    windows (in order) form the training set and the rest the validation set.

    Reads these notebook-level names: ``resample_rate``, ``inputs``,
    ``start_date``, ``end_date``, ``n_steps_in``, ``n_steps_out``,
    ``step_interval`` and ``n_step_lookahead``.

    Parameters
    ----------
    csv_path : path of the CSV to load. Defaults to the location that was
        previously hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    ``(X_train, y_train, X_val, y_val)``; X arrays are reshaped to
    ``(examples, n_steps_in, len(inputs))`` and y arrays to
    ``(examples, n_steps_out, 1)``.
    """
    # Load data as float, with the datetime column as the index
    data = pd.read_csv(csv_path, header=0)
    data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')
    data = data.set_index('datetime')
    # squeeze() only has an effect when an axis has length 1
    data = data.squeeze()
    data = data.astype('float')

    # Resample; each bin's mean is labelled with the bin's left edge
    data = data.resample(resample_rate).mean()

    # Add 24hr lag for min gas price.
    # NOTE(review): 288 rows equal 24 hours only at a 5-minute resample rate;
    # this assumes resample_rate == '5T'.
    data['min_gas_price_24hr_lag'] = data['min_gas_price'].shift(288)
    # Drop the first 288 rows (they have no lag value); copy so the clamping
    # step acts on an independent frame rather than a slice.
    data = data[288:].copy()

    # Clamp outliers in every input column.
    # NOTE(review): the clamp limits are computed on the whole frame, before
    # the train/validation split below.
    for feature in inputs:
        data = outliers_thresholded(feature, data)

    # Keep only the input columns (no scaling in this variant)
    data = data[inputs]

    # Create input:output examples
    data = data[start_date:end_date].to_numpy()
    X, y = split_sequence(data, n_steps_in, n_steps_out, step_interval, n_step_lookahead)
    # Only the first column of `inputs` is used as the forecast target
    y = y[:, :, :1]

    # 70/30 split in window order, using one shared index for X and y
    split_at = int(0.7 * len(X))
    X_train, X_val = np.split(X, [split_at])
    y_train, y_val = np.split(y, [split_at])

    # Reshape to 3D for LSTM
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], len(inputs)))
    y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], len(inputs)))
    y_val = y_val.reshape((y_val.shape[0], y_val.shape[1], 1))

    return X_train, y_train, X_val, y_val

In [151]:
# Build the thresholded but unscaled examples, to inspect clamped targets in
# original units.
X_train3, y_train3, X_val3, y_val3=generate_training_val_examples_no_min_max()

In [152]:
# Largest target value in the thresholded validation set; compare it with
# y_val2.max() to see the effect of the outlier clamp.
y_val3.max()

235.41319339294142