In [1]:
import pandas as pd
import numpy as np
import os
from pmdarima.arima import ADFTest
from pmdarima.arima.utils import ndiffs
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

  from pandas.core import (


train and test data are adjusted to account for stock split which happened to Google, Amazon and Tesla. At the time the data was downloaded, the stock split has taken place while when the paper was published, it still used the pre-split stock data. The adjustment was done by multiplying prices of Google, Amazon and Tesla by 20, 20 and 3, respectively.

In [2]:
wk_dir = 'C:\\Users\\Lenovo\\Desktop\\Reproducible Research\\Project'
os.chdir(wk_dir)

train = pd.read_csv('train_adjusted.csv')
test = pd.read_csv('test_adjusted.csv')

***Perform ADF test to identify the order of integration***

In [4]:
adf_test = ADFTest(alpha=0.05)

adf_results = {}
for column in train.columns:
    if column != 'Date':
        result = adf_test.should_diff(train[column])
        adf_results[column] = {'Should_Diff': result[1], 'p-value': result[0]}
adf_test_results = pd.DataFrame(adf_results).T
print(adf_test_results)

     Should_Diff   p-value
AAPL        True  0.939979
META        True  0.436833
MSFT        True  0.902401
AMZN        True  0.536282
GOOG        True  0.055217
TSLA        True  0.390992


***The result suggests that the current time series are not stationary. With that said, we perform additional test to check how many differencing operation is needed for each time series to convert them to startionary processes.***

In [5]:
n_diff_results = {}
for column in train.columns:
    if column != 'Date':
        n_diff = ndiffs(train[column])
        n_diff_results[column] = {'Number of Differences': n_diff}
n_diff_results = pd.DataFrame(n_diff_results).T
print(n_diff_results)

      Number of Differences
AAPL                      1
META                      1
MSFT                      1
AMZN                      1
GOOG                      1
TSLA                      1


***We knows that first order difference suffices for these time series which are anticipated for stock price data. The next step, we use auto_arima with fixed differencing parameter capped at 1 as a result of the previous test***

In [6]:
arima_model_results = {}
for column in train.columns:
    if column != 'Date':
        auto_arima_result = auto_arima(train[column], start_p=1, d=1, q=0,
                      max_p=5, max_d=1, max_q=5, seasonal=False, trace = True,
                      supress_warnings=True, stepwise = True,
                      random_state=1, n_fits=50)
        arima_model_results[column]=auto_arima_result
        
arima_model_results

Performing stepwise search to minimize aic
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=2375.231, Time=0.67 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=2373.772, Time=0.14 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=2375.667, Time=0.12 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=2375.653, Time=0.15 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=2376.144, Time=0.04 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=2377.399, Time=1.06 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0] intercept
Total fit time: 2.194 seconds
Performing stepwise search to minimize aic
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=6062.912, Time=0.92 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=6063.987, Time=0.04 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=6065.541, Time=0.10 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=6065.525, Time=0.19 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=6063.746, Time=0.04 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=6067.115, Time=0.22 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=606

       with_intercept=False)}

In [7]:
mae_results = {}
rmse_results = {}
for column in test.columns:
    if column != 'Date':
        stock_hist = train[column]
        predict = pd.Series()
        for i in range(0, len(test), 1):
            output = arima_model_results[column].fit_predict(stock_hist, n_periods=1)
            obs = test[column][i]
            stock_hist.loc[len(stock_hist)]=obs
            predict.loc[len(predict)]=output    
        actual = test[column]
        mae = mean_absolute_error(actual, predict)
        rmse = np.sqrt(mean_squared_error(actual, predict))
        mae_results[column]=mae
        rmse_results[column]=rmse

  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


{'AAPL': 2.57018331849275,
 'META': 6.18670410298312,
 'MSFT': 4.456804306595927,
 'TSLA': 20.499440973945017,
 'AMZN': 60.457461059254776,
 'GOOG': 34.69949780577345}

In [8]:
mae_results

{'AAPL': 1.8819874358816384,
 'META': 4.552589651933048,
 'MSFT': 3.268965759311277,
 'TSLA': 13.83666348502994,
 'AMZN': 45.33258599389168,
 'GOOG': 24.834809606539057}

In [9]:
rmse_results

{'AAPL': 2.57018331849275,
 'META': 6.18670410298312,
 'MSFT': 4.456804306595927,
 'TSLA': 20.499440973945017,
 'AMZN': 60.457461059254776,
 'GOOG': 34.69949780577345}