# Time Series Forecasting Assignment

In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from fbprophet import Prophet

<IPython.core.display.Javascript object>

### Import the Microsoft stock price data set (MSFT_data.csv) into a Pandas dataframe.

In [3]:
data = pd.read_csv("../data/MSFT_data.csv")
data.tail()

Unnamed: 0,date,open,high,low,close,volume,Name
1254,2018-02-01,94.79,96.07,93.5813,94.26,47227882,MSFT
1255,2018-02-02,93.64,93.97,91.5,91.78,47867753,MSFT
1256,2018-02-05,90.56,93.24,88.0,88.0,51031465,MSFT
1257,2018-02-06,86.89,91.475,85.25,91.33,67998564,MSFT
1258,2018-02-07,90.49,91.77,89.2,89.61,41107592,MSFT


<IPython.core.display.Javascript object>

In [4]:
data["date"] = pd.to_datetime(data["date"])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1259 non-null   datetime64[ns]
 1   open    1259 non-null   float64       
 2   high    1259 non-null   float64       
 3   low     1259 non-null   float64       
 4   close   1259 non-null   float64       
 5   volume  1259 non-null   int64         
 6   Name    1259 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 69.0+ KB


<IPython.core.display.Javascript object>

### Generate a line chart showing the observed values (closing prices).

In [18]:
px.line(data, "date", "close")

<IPython.core.display.Javascript object>

### Decompose the time series and check it for stationarity. If the data is not stationary, difference the observations and store the results in a new Diff column.

In [6]:
series = data.set_index("date")
series = series["close"]
decomposition = sm.tsa.seasonal_decompose(series, model="additive", period=132)


trend = decomposition.trend
seasonality = decomposition.seasonal
resid = decomposition.resid

merged = data.merge(trend, on="date")
merged = merged.merge(seasonality, on="date")
merged = merged.merge(resid, on="date")
merged

Unnamed: 0,date,open,high,low,close,volume,Name,trend,seasonal,resid
0,2013-02-08,27.35,27.710,27.3100,27.55,33318306,MSFT,,-1.136786,
1,2013-02-11,27.65,27.920,27.5000,27.86,32247549,MSFT,,-0.485164,
2,2013-02-12,27.88,28.000,27.7500,27.88,35990829,MSFT,,-0.392365,
3,2013-02-13,27.93,28.110,27.8800,28.03,41715530,MSFT,,-0.408956,
4,2013-02-14,27.92,28.060,27.8700,28.04,32663174,MSFT,,-0.154011,
...,...,...,...,...,...,...,...,...,...,...
1254,2018-02-01,94.79,96.070,93.5813,94.26,47227882,MSFT,,0.973451,
1255,2018-02-02,93.64,93.970,91.5000,91.78,47867753,MSFT,,1.065573,
1256,2018-02-05,90.56,93.240,88.0000,88.00,51031465,MSFT,,1.121785,
1257,2018-02-06,86.89,91.475,85.2500,91.33,67998564,MSFT,,1.022222,


<IPython.core.display.Javascript object>

In [None]:
px.line(merged, )

In [7]:
adf_test = sm.tsa.stattools.adfuller(data["close"])

results = pd.Series(
    adf_test[0:4],
    index=["ADF Test Statistic", "P-Value", "# Lags Used", "# Observations Used"],
)

for key, value in adf_test[4].items():
    results["Critical Value (%s)" % key] = value

print("Augmented Dickey-Fuller Test Results:\n")
print(results)

Augmented Dickey-Fuller Test Results:

ADF Test Statistic         0.415655
P-Value                    0.982071
# Lags Used                0.000000
# Observations Used     1258.000000
Critical Value (1%)       -3.435559
Critical Value (5%)       -2.863840
Critical Value (10%)      -2.567995
dtype: float64


<IPython.core.display.Javascript object>

In [8]:
data["diff"] = data["close"].diff()
data["diff"][0] = 0
adf_test = sm.tsa.stattools.adfuller(data["diff"][1:])

results = pd.Series(
    adf_test[0:4],
    index=["ADF Test Statistic", "P-Value", "# Lags Used", "# Observations Used"],
)

for key, value in adf_test[4].items():
    results["Critical Value (%s)" % key] = value

print("Augmented Dickey-Fuller Test Results:\n")
print(results)

Augmented Dickey-Fuller Test Results:

ADF Test Statistic       -36.480256
P-Value                    0.000000
# Lags Used                0.000000
# Observations Used     1257.000000
Critical Value (1%)       -3.435563
Critical Value (5%)       -2.863842
Critical Value (10%)      -2.567996
dtype: float64




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



<IPython.core.display.Javascript object>

### Forecast the time series 60 days into the future using double and triple exponential smoothing models.

In [9]:
model = sm.tsa.ExponentialSmoothing(data["close"], trend="add").fit()
double_exp = model.forecast(60)

<IPython.core.display.Javascript object>

In [10]:
model = sm.tsa.ExponentialSmoothing(
    data["close"], trend="add", seasonal="add", seasonal_periods=4
).fit()
triple_exp = model.forecast(60)

<IPython.core.display.Javascript object>

### Forecast the time series 60 days into the future using ARMA, ARIMA, and SARIMA models.

In [11]:
def forecast(data, field, model, periods):
    model_results = model.forecast(periods)[0]
    current_value = data[field].iloc[-1]
    forecasts = []

    for result in model_results:
        forecast = current_value + result
        forecasts.append(forecast)
        current_value = forecast

    forecasts = pd.Series(forecasts)
    forecasts.index = [x + periods for x in list(data[field].tail(periods).index)]

    return forecasts

<IPython.core.display.Javascript object>

In [17]:
model = ARMA(data["diff"], order=(3, 1)).fit()
arma_forecasts = forecast(data, "close", model, 60)

model = ARIMA(data["diff"], order=(1, 1, 1)).fit()
arima_forecasts = forecast(data, "close", model, 60)

model = SARIMAX(data["close"], order=(2, 1, 1), seasonal_order=(1, 1, 1, 12)).fit()
sarima_forecasts = model.forecast(60)

<IPython.core.display.Javascript object>

### Forecast the time series 60 days into the future using the Facebook Prophet model.

In [13]:
# series = data[["date", "close"]]
# series.columns = ["ds", "y"]

# model = Prophet()
# model.fit(series)

# future = model.make_future_dataframe(60)
# results = model.predict(future)
# prophet_forecasts = results.iloc[-60:]['yhat']

<IPython.core.display.Javascript object>

### Combine the observed values and all the forecasts into a single data frame and generate a line chart to visually compare the different models.

In [14]:
future_dates = pd.DataFrame(
    {
        "date": pd.date_range(
            start=data["date"].iloc[-1], periods=61, freq="D", closed="right"
        )
    }
)

fcast_df = pd.concat([pd.DataFrame(data["date"]), future_dates], ignore_index=True)
fcast_df = fcast_df.merge(data[["date", "close"]], on="date", how="left")

fcast_df["DoubleExp"] = double_exp
fcast_df["TripleExp"] = triple_exp
fcast_df["ARMA"] = arma_forecasts
fcast_df["ARIMA"] = arima_forecasts
fcast_df["SARIMA"] = sarima_forecasts

fcast_df

Unnamed: 0,date,close,DoubleExp,TripleExp,ARMA,ARIMA,SARIMA
0,2013-02-08,27.55,,,,,
1,2013-02-11,27.86,,,,,
2,2013-02-12,27.88,,,,,
3,2013-02-13,28.03,,,,,
4,2013-02-14,28.04,,,,,
...,...,...,...,...,...,...,...
1314,2018-04-04,,92.430703,92.427968,92.399478,94.611570,92.839257
1315,2018-04-05,,92.480073,92.465076,92.448789,94.701500,92.876495
1316,2018-04-06,,92.529442,92.536360,92.498100,94.791488,92.947921
1317,2018-04-07,,92.578812,92.645962,92.547411,94.881533,93.058341


<IPython.core.display.Javascript object>

In [15]:
melted = pd.melt(
    fcast_df, id_vars="date", value_vars=fcast_df.drop(columns="date").columns
)

px.line(melted, "date", "value", color="variable")

<IPython.core.display.Javascript object>