# Oil Predictor

In [1]:
# Initial imports
import os
import numpy as np
import pandas as pd
import alpaca_trade_api as tradeapi
from pathlib import Path
from dotenv import load_dotenv
%matplotlib inline

In [2]:
# Set the random seed for reproducibility
# Note: This is used for model prototyping, but it is good practice to comment this out and run multiple experiments to evaluate your model.
from numpy.random import seed

seed(1)
from tensorflow import random

random.set_seed(2)

### Data Loading

In this activity, we will use closing prices from different stocks to make predictions of future closing prices based on the temporal data of each stock.

In [3]:
alpaca_api_key = 'PK55DVY40BM8OTB4HSVX'
alpaca_secret_key = 'VDBV4ac8Cu1MiLfxYgKSh7zJ1H7u4ifXXtKNylW6'

api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version = "v2"
)

In [4]:
print(alpaca_api_key)

PK55DVY40BM8OTB4HSVX


In [5]:
# Set timeframe to '1D'
timeframe = "1D"

# Set start and end datetimes between now and 3 years ago.
start_date = pd.Timestamp("2018-09-11", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-11-08", tz="America/New_York").isoformat()

# Set the ticker information
tickers = ["USO", "UCO", "DBO", "USL", "OIL", "SCO" , "TIP"]

# Get 3 year's worth of historical price data for Microsoft and Coca-Cola
raw_commod = api.get_barset(
    tickers,
    timeframe,
    start=start_date,
    end=end_date,
    limit=1000,
).df

# Display sample data
raw_commod.head()

Unnamed: 0_level_0,DBO,DBO,DBO,DBO,DBO,OIL,OIL,OIL,OIL,OIL,...,USL,USL,USL,USL,USL,USO,USO,USO,USO,USO
Unnamed: 0_level_1,open,high,low,close,volume,open,high,low,close,volume,...,open,high,low,close,volume,open,high,low,close,volume
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-11 00:00:00-04:00,12.41,12.7,12.4,12.675,219738,,,,,,...,25.15,25.63,25.15,25.59,4257.0,14.27,14.63,14.23,14.6,10567587
2018-09-12 00:00:00-04:00,12.78,12.921,12.74,12.83,427687,,,,,,...,25.78,26.01,25.7799,25.9,24730.0,14.75,14.96,14.6867,14.775,16562274
2018-09-13 00:00:00-04:00,12.72,12.77,12.53,12.61,422843,,,,,,...,25.677,25.677,25.34,25.45,9563.0,14.61,14.67,14.39,14.48,17497577
2018-09-14 00:00:00-04:00,12.61,12.78,12.475,12.635,425976,,,,,,...,25.51,25.52,25.4875,25.49,2682.0,14.48,14.71,14.31,14.5,15650639
2018-09-17 00:00:00-04:00,12.72,12.78,12.585,12.62,343214,,,,,,...,25.67,25.7201,25.417,25.43,12276.0,14.61,14.66,14.42,14.47,13367750


In [6]:
commod_data = pd.DataFrame()

for c in raw_commod.columns:
    if c[1] in "close":
        commod_data[c[0]] = raw_commod[c[0]][c[1]]

In [7]:
commod_data.index= commod_data.index.date
commod_data= commod_data.dropna()
commod_data

Unnamed: 0,DBO,OIL,SCO,TIP,UCO,USL,USO
2018-11-19,10.705,59.4500,21.170,109.27,21.780,21.8525,12.140
2018-11-20,9.970,55.2600,24.220,109.11,18.760,20.3500,11.280
2018-11-21,10.190,56.4035,23.180,109.06,19.540,20.8400,11.520
2018-11-23,9.595,53.1972,25.850,108.88,17.265,19.6700,10.850
2018-11-26,9.670,53.6300,25.210,108.77,17.490,19.7900,10.945
...,...,...,...,...,...,...,...
2021-09-03,12.680,21.0000,17.520,128.62,74.070,25.2500,48.700
2021-09-07,12.510,20.7100,17.955,128.33,72.194,24.9100,48.040
2021-09-08,12.660,21.0000,17.620,128.83,73.550,25.1550,48.600
2021-09-09,12.425,20.6000,18.190,129.51,71.150,24.7500,47.730


In [8]:
def window_data(commod_data, window, feature_col_number, target_col_number):
    """
    This function accepts the column number for the features (X) and the target (y).
    It chunks the data up with a rolling window of Xt - window to predict Xt.
    It returns two numpy arrays of X and y.
    """
    X = []
    y = []
    for i in range(len(commod_data) - window):
        features = commod_data.iloc[i : (i + window), feature_col_number]
        target = commod_data.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [9]:
# Creating the features (X) and target (y) data using the window_data() function.
window_size = 5

feature_column = 2
target_column = 2
X, y = window_data(commod_data, window_size, feature_column, target_column)
print (f"X sample values:\n{X[:5]} \n")
print (f"y sample values:\n{y[:5]}")

X sample values:
[[21.17 24.22 23.18 25.85 25.21]
 [24.22 23.18 25.85 25.21 25.14]
 [23.18 25.85 25.21 25.14 26.49]
 [25.85 25.21 25.14 26.49 25.46]
 [25.21 25.14 26.49 25.46 26.11]] 

y sample values:
[[25.14]
 [26.49]
 [25.46]
 [26.11]
 [23.61]]


### Training

In [10]:
# Use 70% of the data for training and the remainder for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]
X_train

array([[21.17  , 24.22  , 23.18  , 25.85  , 25.21  ],
       [24.22  , 23.18  , 25.85  , 25.21  , 25.14  ],
       [23.18  , 25.85  , 25.21  , 25.14  , 26.49  ],
       ...,
       [12.99  , 13.09  , 13.31  , 13.61  , 13.34  ],
       [13.09  , 13.31  , 13.61  , 13.34  , 13.1778],
       [13.31  , 13.61  , 13.34  , 13.1778, 12.88  ]])

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the training feature data X_train
X_scaler.fit(X_train)

# Scale the features training and testing sets
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

# Fit the MinMaxScaler object with the training target data y_train
y_scaler.fit(y_train)

# Scale the target training and testing sets
y_train = y_scaler.transform(y_train)
y_test = y_scaler.transform(y_test)

In [12]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
print (f"X_train sample values:\n{X_train[:5]} \n")
print (f"X_test sample values:\n{X_test[:5]}")

X_train sample values:
[[[0.22819733]
  [0.299509  ]
  [0.27519289]
  [0.33761983]
  [0.32265607]]

 [[0.299509  ]
  [0.27519289]
  [0.33761983]
  [0.32265607]
  [0.32101941]]

 [[0.27519289]
  [0.33761983]
  [0.32265607]
  [0.32101941]
  [0.35258359]]

 [[0.33761983]
  [0.32265607]
  [0.32101941]
  [0.35258359]
  [0.32850129]]

 [[0.32265607]
  [0.32101941]
  [0.35258359]
  [0.32850129]
  [0.34369885]]] 

X_test sample values:
[[[0.05143792]
  [0.04512509]
  [0.04133271]
  [0.03436989]
  [0.03607201]]

 [[0.04512509]
  [0.04133271]
  [0.03436989]
  [0.03607201]
  [0.03577274]]

 [[0.04133271]
  [0.03436989]
  [0.03607201]
  [0.03577274]
  [0.03670797]]

 [[0.03436989]
  [0.03607201]
  [0.03577274]
  [0.03670797]
  [0.02291326]]

 [[0.03607201]
  [0.03577274]
  [0.03670797]
  [0.02291326]
  [0.02595277]]]


### Build and Train the LSTM RNN

In [13]:
# Import required Keras modules
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [14]:
# Define the LSTM RNN model.
model = Sequential()

number_units = 5
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

### Compiling the LSTM RNN Model

In [15]:
# Compile the model
model.compile(optimizer="adam", loss="mean_squared_error")

In [16]:
# Summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5, 5)              140       
_________________________________________________________________
dropout (Dropout)            (None, 5, 5)              0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 5)              220       
_________________________________________________________________
dropout_1 (Dropout)          (None, 5, 5)              0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5)                 220       
_________________________________________________________________
dropout_2 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense (Dense)                (None, 1)                 6

### Training the Model

In [17]:
# Train the model
model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1531ca9d648>

### Model Performance

In [18]:
# Evaluate the model
model.evaluate(X_test, y_test)



0.00461718812584877

### Making Predictions

In [19]:
# Make some predictions
predicted = model.predict(X_test)

In [20]:
# Recover the original prices instead of the scaled version
predicted_prices = y_scaler.inverse_transform(predicted)
real_prices = y_scaler.inverse_transform(y_test.reshape(-1, 1))

In [21]:
commod = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
    }, index = commod_data.index[-len(real_prices): ])
commod.head()

Unnamed: 0,Real,Predicted
2020-12-08,12.94,13.912395
2020-12-09,12.98,13.789457
2020-12-10,12.39,13.711695
2020-12-11,12.52,13.643787
2020-12-14,12.32,13.613734


### Prepare data for forcasting

In [22]:
commod[-10:]["Real"]

2021-08-27    18.000
2021-08-30    17.810
2021-08-31    18.050
2021-09-01    18.130
2021-09-02    17.370
2021-09-03    17.520
2021-09-07    17.955
2021-09-08    17.620
2021-09-09    18.190
2021-09-10    17.389
Name: Real, dtype: float64

In [23]:
x_future, _ = window_data(pd.DataFrame(commod[-10:]["Real"]), 5, 0, 0)
x_future = X_scaler.transform(x_future)
x_future = x_future.reshape((x_future.shape[0],x_future.shape[1], 1))

### Forcast commodity movement

In [24]:
last = model.predict(x_future)[-1:]
last = y_scaler.inverse_transform(last)

In [25]:
commod[-9:]["Real"].append(pd.Series(last[0]))

2021-08-30    17.810000
2021-08-31    18.050000
2021-09-01    18.130000
2021-09-02    17.370000
2021-09-03    17.520000
2021-09-07    17.955000
2021-09-08    17.620000
2021-09-09    18.190000
2021-09-10    17.389000
0             16.955908
dtype: float64

In [26]:
from datetime import timedelta

def forecast_lstm(model, 
                  data,  
                  x_scaler, 
                  y_scaler, 
                  num_of_obs = 10 ,
                  steps_ahead = 15, 
                  window_size = 5,
                  target_col="Real"
                 ):
    """
    This function requires window_data to be defined
    """
    for i in range(0, steps_ahead):
        x_future, _ = window_data(pd.DataFrame(data[-num_of_obs:][target_col]), window_size, 0, 0)
        x_future = X_scaler.transform(x_future)
        x_future = x_future.reshape((x_future.shape[0],x_future.shape[1], 1))
        last = model.predict(x_future)[-1:]
        last = y_scaler.inverse_transform(last)
        #print(last)
        new_data = data[-(num_of_obs-1):].copy()
        
        last_date = new_data.iloc[[-1]].index
        last_date = last_date + timedelta(days=1)
        last_date = pd.to_datetime(last_date[0])
        
        new_data = new_data.append(pd.DataFrame(index=[last_date]))
        new_data.iloc[-1][target_col] = last
        data = new_data
        #print(data)
        #break
    return data

In [27]:
x_future, _ = window_data(pd.DataFrame(commod[-10:]["Real"]), 5, 0, 0)

In [28]:
forecast_lstm(model, pd.DataFrame(commod["Real"]), X_scaler, y_scaler)

Unnamed: 0,Real
2021-09-16,16.583788
2021-09-17,16.49165
2021-09-18,16.471148
2021-09-19,16.402922
2021-09-20,16.306273
2021-09-21,16.217421
2021-09-22,16.131212
2021-09-23,16.082558
2021-09-24,16.036493
2021-09-25,15.977832


In [29]:
first15= forecast_lstm(model, pd.DataFrame(commod["Real"]), X_scaler, y_scaler)
