# Oil Predictor

In [1]:
# Initial imports
import os
import numpy as np
import pandas as pd
import alpaca_trade_api as tradeapi
from pathlib import Path
from dotenv import load_dotenv
%matplotlib inline

In [2]:
# Set the random seed for reproducibility
# Note: This is used for model prototyping, but it is good practice to comment this out and run multiple experiments to evaluate your model.
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

### Data Loading

In this activity, we will use closing prices from different stocks to make predictions of future closing prices based on the temporal data of each stock.

In [3]:
alpaca_api_key = 'PK55DVY40BM8OTB4HSVX'
alpaca_secret_key = 'VDBV4ac8Cu1MiLfxYgKSh7zJ1H7u4ifXXtKNylW6'

api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version = "v2"
)

In [4]:
print(alpaca_api_key)

PK55DVY40BM8OTB4HSVX


In [5]:
# Set timeframe to '1D'
timeframe = "1D"

# Set start and end datetimes between now and 3 years ago.
start_date = pd.Timestamp("2018-09-11", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-11-08", tz="America/New_York").isoformat()

# Set the ticker information
tickers = ["UNG", "GAZ", "UNL", "KOLD", "BOIL" , "TIP"]

# Get 3 year's worth of historical price data for Microsoft and Coca-Cola
raw_commod = api.get_barset(
    tickers,
    timeframe,
    start=start_date,
    end=end_date,
    limit=1000,
).df

# Display sample data
raw_commod.head()

Unnamed: 0_level_0,BOIL,BOIL,BOIL,BOIL,BOIL,GAZ,GAZ,GAZ,GAZ,GAZ,...,UNG,UNG,UNG,UNG,UNG,UNL,UNL,UNL,UNL,UNL
Unnamed: 0_level_1,open,high,low,close,volume,open,high,low,close,volume,...,open,high,low,close,volume,open,high,low,close,volume
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-11 00:00:00-04:00,26.4,27.06,26.2,26.94,64304,,,,,,...,22.96,23.27,22.87,23.22,1350867,9.1366,9.1366,9.1366,9.1366,100.0
2018-09-12 00:00:00-04:00,27.32,27.45,26.7,26.75,45356,,,,,,...,23.48,23.51,23.14,23.21,1522494,,,,,
2018-09-13 00:00:00-04:00,27.01,27.14,26.34,26.37,39465,,,,,,...,23.33,23.4,23.03,23.06,1467542,9.1521,9.1521,9.1521,9.1521,400.0
2018-09-14 00:00:00-04:00,25.89,25.9,25.46,25.53,111158,,,,,,...,22.86,22.86,22.67,22.7,1835799,9.041,9.041,9.03,9.0375,2115.0
2018-09-17 00:00:00-04:00,25.85,26.3,25.8,26.01,39346,,,,,,...,22.86,23.09,22.84,22.95,1661151,9.1,9.1,9.1,9.1,1600.0


In [6]:
commod_data = pd.DataFrame()

for c in raw_commod.columns:
    if c[1] in "close":
        commod_data[c[0]] = raw_commod[c[0]][c[1]]

In [7]:
commod_data.index= commod_data.index.date
commod_data= commod_data.dropna()
commod_data

Unnamed: 0,BOIL,GAZ,KOLD,TIP,UNG,UNL
2018-11-19,59.8500,63.4900,11.7000,109.27,37.180,11.590
2018-11-21,57.0100,59.5500,12.2300,109.06,36.300,11.200
2018-11-28,61.1700,61.1200,10.9900,108.62,37.740,11.590
2018-11-29,58.9000,58.5600,11.3300,109.23,37.080,11.480
2018-12-04,56.3000,59.8300,11.6900,109.65,36.310,11.520
...,...,...,...,...,...,...
2021-11-02,65.8100,28.5787,7.0500,128.54,18.420,14.095
2021-11-03,73.4900,30.0000,6.2899,128.39,19.440,14.570
2021-11-04,70.5000,29.5300,6.5550,129.22,19.020,14.470
2021-11-05,68.4100,29.0500,6.7200,129.67,18.760,14.420


In [8]:
def window_data(commod_data, window, feature_col_number, target_col_number):
    """
    This function accepts the column number for the features (X) and the target (y).
    It chunks the data up with a rolling window of Xt - window to predict Xt.
    It returns two numpy arrays of X and y.
    """
    X = []
    y = []
    for i in range(len(commod_data) - window):
        features = commod_data.iloc[i : (i + window), feature_col_number]
        target = commod_data.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [9]:
# Creating the features (X) and target (y) data using the window_data() function.
window_size = 5

feature_column = 2
target_column = 2
X, y = window_data(commod_data, window_size, feature_column, target_column)
print (f"X sample values:\n{X[:5]} \n")
print (f"y sample values:\n{y[:5]}")

X sample values:
[[11.7    12.23   10.99   11.33   11.69  ]
 [12.23   10.99   11.33   11.69   12.31  ]
 [10.99   11.33   11.69   12.31   11.5901]
 [11.33   11.69   12.31   11.5901 12.23  ]
 [11.69   12.31   11.5901 12.23   13.43  ]] 

y sample values:
[[12.31  ]
 [11.5901]
 [12.23  ]
 [13.43  ]
 [13.285 ]]


### Training

In [10]:
# Use 70% of the data for training and the remainder for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]
X_train

array([[11.7   , 12.23  , 10.99  , 11.33  , 11.69  ],
       [12.23  , 10.99  , 11.33  , 11.69  , 12.31  ],
       [10.99  , 11.33  , 11.69  , 12.31  , 11.5901],
       ...,
       [40.825 , 46.42  , 46.132 , 47.29  , 48.97  ],
       [46.42  , 46.132 , 47.29  , 48.97  , 43.26  ],
       [46.132 , 47.29  , 48.97  , 43.26  , 42.079 ]])

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the training feature data X_train
X_scaler.fit(X_train)

# Scale the features training and testing sets
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# Fit the MinMaxScaler object with the training target data y_train
y_scaler.fit(y_train)

# Scale the target training and testing sets
y_train = y_scaler.transform(y_train)
y_test = y_scaler.transform(y_test)

In [12]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
print (f"X_train sample values:\n{X_train[:5]} \n")
print (f"X_test sample values:\n{X_test[:5]}")

X_train sample values:
[[[0.00942895]
  [0.01646746]
  [0.        ]
  [0.        ]
  [0.00133735]]

 [[0.01646746]
  [0.        ]
  [0.00451527]
  [0.00480256]
  [0.00963723]]

 [[0.        ]
  [0.00451527]
  [0.00929615]
  [0.01307364]
  [0.        ]]

 [[0.00451527]
  [0.00929615]
  [0.01752988]
  [0.00346985]
  [0.00856628]]

 [[0.00929615]
  [0.01752988]
  [0.00796946]
  [0.0120064 ]
  [0.02463056]]] 

X_test sample values:
[[[0.48207171]
  [0.50438247]
  [0.42855246]
  [0.41020544]
  [0.38862033]]

 [[0.50438247]
  [0.42855246]
  [0.41286853]
  [0.39074173]
  [0.38580908]]

 [[0.42855246]
  [0.41286853]
  [0.3934927 ]
  [0.38794023]
  [0.43226162]]

 [[0.41286853]
  [0.3934927 ]
  [0.39070385]
  [0.43423159]
  [0.3157956 ]]

 [[0.3934927 ]
  [0.39070385]
  [0.43678619]
  [0.31816969]
  [0.30816507]]]


### Build and Train the LSTM RNN

In [13]:
# Import required Keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [14]:
# Define the LSTM RNN model.
model = Sequential()

number_units = 5
dropout_fraction = 0.1

# Layer 1
model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

### Compiling the LSTM RNN Model

In [15]:
# Compile the model
model.compile(optimizer="adam", loss="mean_squared_error")

In [16]:
# Summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5, 5)              140       
_________________________________________________________________
dropout (Dropout)            (None, 5, 5)              0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 5)              220       
_________________________________________________________________
dropout_1 (Dropout)          (None, 5, 5)              0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5)                 220       
_________________________________________________________________
dropout_2 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense (Dense)                (None, 1)                 6

### Training the Model

In [17]:
# Train the model
model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23f1e93bcc8>

### Model Performance

In [18]:
# Evaluate the model
model.evaluate(X_test, y_test)



0.013459761627018452

### Making Predictions

In [19]:
# Make some predictions
predicted = model.predict(X_test)

In [20]:
# Recover the original prices instead of the scaled version
predicted_prices = y_scaler.inverse_transform(predicted)
real_prices = y_scaler.inverse_transform(y_test.reshape(-1, 1))

In [21]:
commod = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
    }, index = commod_data.index[-len(real_prices): ])
commod.head()

Unnamed: 0,Real,Predicted
2021-01-28,40.41,45.361488
2021-01-29,43.88,44.598927
2021-02-01,35.18,43.451637
2021-02-02,34.61,42.971142
2021-02-03,36.24,42.374527


### Prepare data for forcasting

In [22]:
commod[-10:]["Real"]

2021-10-26    6.3000
2021-10-27    5.9750
2021-10-28    6.6100
2021-10-29    7.1100
2021-11-01    7.9100
2021-11-02    7.0500
2021-11-03    6.2899
2021-11-04    6.5550
2021-11-05    6.7200
2021-11-08    7.3100
Name: Real, dtype: float64

In [23]:
x_future, _ = window_data(pd.DataFrame(commod[-10:]["Real"]), 5, 0, 0)
x_future = X_scaler.transform(x_future)
x_future = x_future.reshape((x_future.shape[0],x_future.shape[1], 1))

### Forcast commodity movement

In [24]:
last = model.predict(x_future)[-1:]
last = y_scaler.inverse_transform(last)

In [25]:
commod[-9:]["Real"].append(pd.Series(last[0]))

2021-10-27     5.975000
2021-10-28     6.610000
2021-10-29     7.110000
2021-11-01     7.910000
2021-11-02     7.050000
2021-11-03     6.289900
2021-11-04     6.555000
2021-11-05     6.720000
2021-11-08     7.310000
0             19.364838
dtype: float64

In [26]:
from datetime import timedelta

def forecast_lstm(model, 
                  data,  
                  x_scaler, 
                  y_scaler, 
                  num_of_obs = 10 ,
                  steps_ahead = 15, 
                  window_size = 5,
                  target_col="Real"
                 ):
    """
    This function requires window_data to be defined
    """
    for i in range(0, steps_ahead):
        x_future, _ = window_data(pd.DataFrame(data[-num_of_obs:][target_col]), window_size, 0, 0)
        x_future = X_scaler.transform(x_future)
        x_future = x_future.reshape((x_future.shape[0],x_future.shape[1], 1))
        last = model.predict(x_future)[-1:]
        last = y_scaler.inverse_transform(last)
        #print(last)
        new_data = data[-(num_of_obs-1):].copy()
        
        last_date = new_data.iloc[[-1]].index
        last_date = last_date + timedelta(days=1)
        last_date = pd.to_datetime(last_date[0])
        
        new_data = new_data.append(pd.DataFrame(index=[last_date]))
        new_data.iloc[-1][target_col] = last
        data = new_data
        #print(data)
        #break
    return data

In [27]:
x_future, _ = window_data(pd.DataFrame(commod[-10:]["Real"]), 5, 0, 0)

In [28]:
forecast_lstm(model, pd.DataFrame(commod["Real"]), X_scaler, y_scaler)

Unnamed: 0,Real
2021-11-14,24.844378
2021-11-15,27.994965
2021-11-16,28.391823
2021-11-17,29.328947
2021-11-18,30.748384
2021-11-19,32.368649
2021-11-20,33.888592
2021-11-21,35.055843
2021-11-22,35.776554
2021-11-23,36.711872


In [29]:
first15= forecast_lstm(model, pd.DataFrame(commod["Real"]), X_scaler, y_scaler)
