In [1]:
# !pip install seaborn
# !pip install keras
# !pip install tensorflow
# !pip install xgboost


In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import pmdarima as pm
from pmdarima.arima import auto_arima
from pylab import rcParams
import requests
import json
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from xgboost import XGBRegressor

In [3]:
# with open("tingo_api.json", "r") as file:
#     tingo_api = json.load(file)
# api_key = tingo_api['api_key']

In [4]:
# headers = {
#     'Content-Type': 'application/json',
#     'Authorization': f'Token {api_key}'
# }
# url =  "https://api.tiingo.com/tiingo/daily/MSFT/prices?startDate=2017-06-13&endDate=2025-06-13&resampleFreq=1min"


# response = requests.get(url, headers=headers)
# data = response.json()

# ARIMA

In [5]:
# Request hourly bars for Microsoft with period="max", i.e. as much hourly
# history as the data provider returns.
msft = yf.Ticker("MSFT")
msft_df = msft.history(period="max", interval="1h")

# Plain-text preview of the first rows of the downloaded frame.
print(msft_df.head())

                                 Open        High         Low       Close  \
Datetime                                                                    
2023-06-27 09:30:00-04:00  331.859985  334.114410  331.119995  332.000000   
2023-06-27 10:30:00-04:00  331.989990  332.107300  329.299988  331.873413   
2023-06-27 11:30:00-04:00  331.920013  332.839996  331.269989  331.970001   
2023-06-27 12:30:00-04:00  331.970001  333.510010  331.700012  333.312897   
2023-06-27 13:30:00-04:00  333.320007  335.815002  333.320007  335.480011   

                            Volume  Dividends  Stock Splits  
Datetime                                                     
2023-06-27 09:30:00-04:00  4718321        0.0           0.0  
2023-06-27 10:30:00-04:00  2911379        0.0           0.0  
2023-06-27 11:30:00-04:00  1904459        0.0           0.0  
2023-06-27 12:30:00-04:00  2752173        0.0           0.0  
2023-06-27 13:30:00-04:00  2018662        0.0           0.0  


In [None]:
# Line chart of the MSFT close price over the whole downloaded history.
plt.figure(figsize=(14, 7))
plt.plot(msft_df.index, msft_df['Close'], label='MSFT Close Price')
# The frame is fetched with period="max" and interval="1h", so the title
# describes hourly data rather than a 60-day / 5-minute window.
plt.title('Microsoft (MSFT) Hourly Close Price')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend();

# Testing For Stationarity

In [None]:
def test_stationarity(timeseries, window=50):
    """Plot rolling statistics and print an Augmented Dickey-Fuller report.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to inspect.
    window : int, default 50
        Number of observations used for the rolling mean and rolling
        standard deviation.

    Returns
    -------
    None
        The three lines are drawn on the current matplotlib figure and the
        test summary is printed.
    """
    rolmean = timeseries.rolling(window).mean()
    rolstd = timeseries.rolling(window).std()

    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation')

    print("Results of dickey fuller test")
    # Run the test on the non-missing observations only, so that inputs with
    # NaN gaps (e.g. differenced or rolled series) can be checked as well.
    adft = adfuller(timeseries.dropna())
    # The first four entries of the result are the summary statistics named
    # below; entry 4 is the mapping of critical values.
    output = pd.Series(
        adft[0:4],
        index=['Test Statistics', 'p-value', 'No. of lags used', 'Number of observations used'],
    )
    for key, values in adft[4].items():
        output['critical value (%s)' % key] = values
    print(output)
# Check the untransformed close price series for stationarity.
test_stationarity(msft_df['Close'])

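Interpretation (Augmented Dickey-Fuller test):
* Null Hypothesis: The time series has a unit root, i.e. it is non-stationary.
* Alternative Hypothesis: The time series is stationary.
* Decision rule: reject the null hypothesis when the p-value is below the chosen significance level (e.g. 0.05), or equivalently when the test statistic is more negative than the corresponding critical value.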

In [None]:
# Kernel density estimate of the close price distribution.
msft_df["Close"].plot(kind="kde");

In [None]:
# Multiplicative decomposition of the close price, plotted as one figure.
# The series is hourly, so one seasonal cycle (a trading day) is 7 bars;
# 78 is the number of bars per session at 5-minute resolution.
# NOTE(review): 7 assumes the regular 09:30-15:30 hourly bars - confirm.
result = seasonal_decompose(msft_df['Close'], model='multiplicative', period=7)
# result.plot() builds and returns its own figure, so no plt.figure() is
# opened beforehand (it only produced an extra, empty canvas).
fig = result.plot()
fig.set_size_inches(16, 9);

# Reducing Trend

In [None]:
rcParams['figure.figsize'] = 10, 6

# Log-transform the close price, then compute 7-bar rolling mean and
# standard deviation of the logged series.
msft_df['Log_Close'] = np.log(msft_df['Close'])
msft_df['Rolling_Log_Avg'] = msft_df['Log_Close'].rolling(7).mean()
msft_df['Rolling_Log_Std'] = msft_df['Log_Close'].rolling(7).std()

plt.title('Moving Average')
plt.plot(msft_df['Log_Close'], color='blue', label='Log Price')
plt.plot(msft_df['Rolling_Log_Avg'], color="red", label="Rolling Log Mean")
plt.plot(msft_df['Rolling_Log_Std'], color="black", label="Rolling Log Standard Deviation")
plt.xlabel('Dates')
plt.ylabel('Log Price')
# The legend is built once, after the labelled lines exist; requesting it
# before anything is plotted only emits a "no artists with labels" warning.
plt.legend()
plt.show()

# Train/Test 

In [None]:
# Remove every row that still contains a NaN (this modifies msft_df itself).
msft_df.dropna(inplace=True)

# Chronological split of the smoothed log price: the first 99% of the rows
# form the training set, the remaining rows the test set.
train_size = int(len(msft_df) * 0.99)
train_data = msft_df['Rolling_Log_Avg'].iloc[:train_size]
test_data = msft_df['Rolling_Log_Avg'].iloc[train_size:]

# Show both segments on one chart.
plt.figure(figsize=(10,6))
plt.plot(train_data, 'green', label='Train data')
plt.plot(test_data, 'blue', label='Test data')
plt.xlabel('Dates')
plt.ylabel('Rolling Log Closing Prices')
plt.grid(True)
plt.legend();

# HyperParameter Tuning: P, D, Q

In [None]:
# Stepwise search for the ARIMA order on the training series.
model_autoARIMA = auto_arima(
    train_data,
    # Search range for the AR (p) and MA (q) orders.
    start_p=0,
    max_p=3,
    start_q=0,
    max_q=3,
    # d is not fixed here; test='adf' names the test used to pick it.
    d=None,
    test='adf',
    # Seasonal terms are switched off; the seasonal arguments are passed
    # through unchanged.
    seasonal=False,
    m=1,
    start_P=0,
    D=0,
    # Search behaviour and logging.
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

# Text summary followed by the diagnostic plots of the selected model.
print(model_autoARIMA.summary())
model_autoARIMA.plot_diagnostics(figsize=(15,8))
plt.show()

## Test residual autocorrelation

In [None]:
# Residuals of the model selected by auto_arima, plotted in order.
residuals = model_autoARIMA.resid()

plt.figure(figsize=(10,5))
plt.plot(residuals, label="Residuals")
plt.title("Residuals Over Time")
plt.legend()
plt.show()

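Ljung-Box test hypotheses
* Null Hypothesis: The residuals are independently distributed (i.e., no autocorrelation).
* Alternative Hypothesis: The residuals are not independent (i.e., they are autocorrelated).
* Decision rule: a p-value below the chosen significance level (e.g. 0.05) rejects the null hypothesis and indicates autocorrelation left in the residuals.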

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Ljung-Box test on the model residuals at lag 1, returned as a DataFrame.
ljung_box_results = acorr_ljungbox(
    residuals,
    lags=[1],
    return_df=True,
)
print(ljung_box_results)

# Fit Model with Optimal Parameters

In [None]:
# Walk-forward evaluation: for each of the 100 expanding-window folds, refit
# ARIMA with the order found by auto_arima, forecast one step ahead, and
# compare it with the first observation of the test fold.  The last training
# value is recorded as a naive "no change" baseline.
tscv = TimeSeriesSplit(n_splits=100)
best_p, best_d, bestq = model_autoARIMA.order

actual_values = []
predicted_values = []
simple_values = []


for i, (train_idx, test_idx) in enumerate(tscv.split(msft_df)):
    print("fold:", i)
    print("Train dates:", msft_df.index[train_idx][[0, -1]])
    print("Test dates:", msft_df.index[test_idx][[0, -1]])
    train, test = msft_df['Rolling_Log_Avg'].iloc[train_idx], msft_df['Rolling_Log_Avg'].iloc[test_idx]
    arima = ARIMA(train, order=(best_p, best_d, bestq))
    arima_fit = arima.fit()
    # forecast() with no arguments returns the single next-step prediction.
    forecast = arima_fit.forecast()
    # Use .iloc for positional access: the series are labelled by timestamps,
    # and plain [0] / [-1] on such a Series relies on a deprecated fallback
    # that treats integer keys as positions.
    actual_values.append(test.iloc[0])
    predicted_values.append(forecast.values[0])
    simple_values.append(train.iloc[-1])
print("Actual Vs Predicted:", mean_squared_error(actual_values, predicted_values))
print("Actual Vs Simple:", mean_squared_error(actual_values, simple_values))


### ARIMA Results

In [16]:
# One-row summary table: MSE of the ARIMA forecasts and of the naive
# last-value baseline against the actual values.
results_df = pd.DataFrame([
    {
        "Actual_vs_Predicted": mean_squared_error(actual_values, predicted_values),
        "Actual_Vs_Simple": mean_squared_error(actual_values, simple_values),
    }
])
results_df

Unnamed: 0,Actual_vs_Predicted,Actual_Vs_Simple
0,8.230505e-08,7.346659e-07


In [17]:
# plt.figure(figsize=(14,7))
# plt.plot(train_data.index, train_data,  label='training data')
# plt.plot(test_data.index, test_data, color = 'blue', label='Actual Stock Price')
# plt.plot(test_data.index, forecast, color = 'orange',label='Predicted Stock Price')
# plt.title('Microsoft Stock Price Prediction')
# plt.xlabel('Time')
# plt.ylabel('MSFT Stock Price')
# plt.legend(loc='upper left', fontsize=8)
# #plt.savefig("3mo.png", dpi=300, bbox_inches="tight")
# plt.show()

In [18]:
# forecast = arima_fitted.forecast(steps=len(test_data))
# plt.figure(figsize=(20,10))
# plt.plot(train_data.index, train_data,  label='training data')
# plt.plot(test_data.index, test_data, color = 'blue', label='Actual Stock Price')
# plt.plot(test_data.index, forecast, color = 'orange',label='Predicted Stock Price')
# plt.title('Microsoft Stock Price Prediction')
# plt.xlabel('Time')
# plt.ylabel('MSFT Stock Price')
# plt.legend(loc='upper left', fontsize=8)
# #plt.savefig("3mo.png", dpi=300, bbox_inches="tight")
# plt.show()

In [19]:
# print("Mean Squared Error:", mean_squared_error(test_data, forecast))
# print("Root Mean Square Error:", np.sqrt(mean_squared_error(test_data, forecast)))
# print("Mean Absolute Error:", mean_absolute_error(test_data, forecast))
# print("Mean Absolute Percentage Error:", mean_absolute_percentage_error(test_data, forecast))

In [20]:
# timestamps = test_data.index[-4:]  
# actual_values = np.exp(test_data.iloc[-4:])
# predicted_values = np.exp(forecast[-4:])

# df_results = pd.DataFrame({
#     'Timestamp': timestamps,
#     'Actual Value': actual_values.values, 
#     'Predicted Value': predicted_values
# })

# print(df_results)

In [21]:
# msft_df['price_lag_1hour'] = msft_df['Close'].shift(1)
# msft_df['price_lag_2hour'] = msft_df['Close'].shift(2)
# msft_df['price_lag_3hour'] = msft_df['Close'].shift(3)
# msft_df['price_lag_4hour'] = msft_df['Close'].shift(4)
# msft_df['price_lag_1day'] = msft_df['Close'].shift(24)
# msft_df['price_lag_2day'] = msft_df['Close'].shift(48)
# msft_df['price_lag_2day'] = msft_df['Close'].shift(72)
# msft_df['price_lag_1week'] = msft_df['Close'].shift(168)