In [1]:
# !pip install ta
# !pip install pandas pandas_ta

In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ta
import pandas_ta as ta

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import pmdarima as pm
from pmdarima.arima import auto_arima
from pylab import rcParams
import requests
import json
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from xgboost import XGBRegressor

In [3]:
# Pull the longest hourly OHLCV history Yahoo Finance will serve for MSFT.
msft = yf.Ticker("MSFT")
history_kwargs = {"period": "max", "interval": "1h"}
msft_df = msft.history(**history_kwargs)

# Quick sanity check of the first rows.
print(msft_df.head())

                                 Open        High         Low       Close  \
Datetime                                                                    
2023-06-21 09:30:00-04:00  336.369995  337.730011  333.630005  334.320007   
2023-06-21 10:30:00-04:00  334.329987  334.825989  332.070007  332.980103   
2023-06-21 11:30:00-04:00  332.980011  334.480011  332.299988  333.690002   
2023-06-21 12:30:00-04:00  333.660004  334.570007  333.480011  333.669891   
2023-06-21 13:30:00-04:00  333.640015  335.220001  333.579987  335.209991   

                            Volume  Dividends  Stock Splits  
Datetime                                                     
2023-06-21 09:30:00-04:00  5706332        0.0           0.0  
2023-06-21 10:30:00-04:00  3318595        0.0           0.0  
2023-06-21 11:30:00-04:00  2518807        0.0           0.0  
2023-06-21 12:30:00-04:00  1501630        0.0           0.0  
2023-06-21 13:30:00-04:00  1314182        0.0           0.0  


In [4]:
# Display the hourly frame to inspect its date range and columns.
msft_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-06-21 09:30:00-04:00,336.369995,337.730011,333.630005,334.320007,5706332,0.0,0.0
2023-06-21 10:30:00-04:00,334.329987,334.825989,332.070007,332.980103,3318595,0.0,0.0
2023-06-21 11:30:00-04:00,332.980011,334.480011,332.299988,333.690002,2518807,0.0,0.0
2023-06-21 12:30:00-04:00,333.660004,334.570007,333.480011,333.669891,1501630,0.0,0.0
2023-06-21 13:30:00-04:00,333.640015,335.220001,333.579987,335.209991,1314182,0.0,0.0
...,...,...,...,...,...,...,...
2025-06-18 11:30:00-04:00,479.220001,479.690002,478.779999,478.989990,986629,0.0,0.0
2025-06-18 12:30:00-04:00,478.970001,479.799988,478.429993,478.429993,816997,0.0,0.0
2025-06-18 13:30:00-04:00,478.470001,480.339996,477.980011,479.315002,1352118,0.0,0.0
2025-06-18 14:30:00-04:00,479.195007,480.670013,477.850006,478.540009,1689336,0.0,0.0


google sites or sights

# Feature Engineering

## Lagged Prices

In [5]:
# Lag every OHLCV column by 1..6 bars; shift(k) puts the value from k rows
# earlier on the current row.
price_lags = range(1, 7)  # within 1 day range
for source_col in ('Open', 'High', 'Low', 'Close', 'Volume'):
    source_series = msft_df[source_col]
    for k in price_lags:
        msft_df[f'{source_col}_price_lag_{k}H'] = source_series.shift(k)

## Lagged Returns

#### Hourly

In [6]:
# One-bar simple return of the close, followed by its 1..6 bar lags.
msft_df['Hourly_return'] = msft_df['Close'].pct_change(periods=1)
lag_returns_hourly = [1, 2, 3, 4, 5, 6]  # range within 1 day
hourly_lagged = {
    f'Hourly_return_lag_{k}H': msft_df['Hourly_return'].shift(k)
    for k in lag_returns_hourly
}
for label, series in hourly_lagged.items():
    msft_df[label] = series

#### Daily

In [7]:
# Return over 7 bars, then the same series lagged by multiples of 7 bars.
bars_per_day = 7
daily_ret = msft_df['Close'].pct_change(periods=bars_per_day)
msft_df['Daily_return'] = daily_ret
lag_returns_daily = [7, 14, 21, 28]  # 1-4 day range
for k in lag_returns_daily:
    column_name = f'Daily_return_lag_{k}H'
    msft_df[column_name] = msft_df['Daily_return'].shift(k)

#### Weekly

In [8]:
# Return over 35 bars, then the same series lagged by multiples of 35 bars.
msft_df['Weekly_return'] = msft_df['Close'].pct_change(periods=35)
lag_returns_weekly = [35, 70, 105]  # range within 3 weeks
for lag in lag_returns_weekly:
    # Capitalised to match the base column and the Hourly_/Daily_ lag families,
    # so prefix-based column selection picks up these lags as well.
    msft_df[f'Weekly_return_lag_{lag}H'] = msft_df['Weekly_return'].shift(lag)

## Moving Averages

#### Simple

In [9]:
# Simple moving averages of the close over several bar windows.
sma_windows = [5, 7, 10, 21, 35, 50, 105, 140]
sma_columns = {
    f'SMA_{n}H': msft_df['Close'].rolling(window=n).mean()
    for n in sma_windows
}
for label, series in sma_columns.items():
    msft_df[label] = series

#### Exponential

In [10]:
# Exponential moving averages of the close (recursive form, adjust=False).
ema_windows = [5, 7, 10, 21, 35, 50, 105, 140]
close_px = msft_df['Close']
for span in ema_windows:
    ema_series = close_px.ewm(span=span, adjust=False).mean()
    msft_df[f'EMA_{span}H'] = ema_series

#### Golden Crosses (Bullish Signals)

EMAs are used here because the data is intraday. For daily, weekly, or monthly bars, the SMA is the preferred method.

In [11]:
# Flag bars where the 21-bar EMA is above the 50-bar EMA now but was at or
# below it on the previous bar.
fast_ema, slow_ema = msft_df['EMA_21H'], msft_df['EMA_50H']
fast_above_now = fast_ema > slow_ema
fast_not_above_before = fast_ema.shift(1) <= slow_ema.shift(1)
msft_df['Golden_Cross'] = (fast_above_now & fast_not_above_before).astype(int)

#### Death Crosses (Bearish Signal)

In [12]:
# Flag bars where the 21-bar EMA drops below the 50-bar EMA: it is below now
# and was at or ABOVE it on the previous bar. The previous-bar test must be
# `>=`; with `<=` the flag fires on most bars where the fast EMA is already
# below the slow one rather than only on the crossing bar.
msft_df['Death_Cross'] = (
    (msft_df['EMA_21H'] < msft_df['EMA_50H'])
    & (msft_df['EMA_21H'].shift(1) >= msft_df['EMA_50H'].shift(1))
).astype(int)

## Volatility

#### Standard Deviation

Compute the standard deviation on returns rather than raw prices, because returns are closer to stationary.

In [13]:
# Rolling standard deviation of the hourly return over several bar windows.
volatility_window = [3, 7, 14, 35]
hourly_ret = msft_df['Hourly_return']
rolling_std = {
    f'Volatility_StdDEV_{n}H': hourly_ret.rolling(window=n).std()
    for n in volatility_window
}
for label, series in rolling_std.items():
    msft_df[label] = series

The 'true' volatility of the stock price over the window, using an EMA.

## Technical Indicators

#### Average True Range (Volatility)

In [14]:
# Average True Range over several look-back lengths.
# `ta` here is pandas_ta (the later import rebinds the name), whose `atr`
# takes the look-back as `length`. The previous `window=` keyword belongs to
# the other `ta` package and was swallowed by **kwargs, so every ATR_* column
# was computed with the default length and came out identical.
atr_periods = [7, 14, 35, 70]
for period in atr_periods:
    msft_df[f'ATR_{period}H'] = ta.atr(
        high=msft_df['High'],
        low=msft_df['Low'],
        # Pass the unshifted close: the true range already compares each bar's
        # high/low with the PREVIOUS close internally, so pre-shifting it made
        # the calculation use the close from two bars back.
        close=msft_df['Close'],
        length=period,
    )

#### RSI (Momentum Indicator)

In [15]:
# 14-bar RSI of the close plus binary flags for the 70 / 30 thresholds.
msft_df['RSI_14'] = ta.rsi(msft_df['Close'], length=14)
rsi_14 = msft_df['RSI_14']
rsi_flags = (
    ('RSI_Overbought', rsi_14 > 70),
    ('RSI_Oversold', rsi_14 < 30),
)
for flag_name, is_extreme in rsi_flags:
    msft_df[flag_name] = is_extreme.astype(int)

#### MACD (Momentum Indicator)

In [16]:
# MACD (12/26/9) on the close; the first three result columns are copied, by
# position, into MACD, MACD_Histogram and MACD_Signal respectively.
macd_results = ta.macd(msft_df['Close'], fast=12, slow=26, signal=9)
macd_targets = ('MACD', 'MACD_Histogram', 'MACD_Signal')
for target, source in zip(macd_targets, macd_results.columns):
    msft_df[target] = macd_results[source]

In [17]:
# Previous-bar MACD and signal values, kept as columns of their own.
for current_col in ('MACD', 'MACD_Signal'):
    msft_df[f'{current_col}_Prev'] = msft_df[current_col].shift(1)

# A cross is a change of side between the previous bar and the current one.
macd_above_now = msft_df['MACD'] > msft_df['MACD_Signal']
macd_below_now = msft_df['MACD'] < msft_df['MACD_Signal']
was_at_or_below = msft_df['MACD_Prev'] <= msft_df['MACD_Signal_Prev']
was_at_or_above = msft_df['MACD_Prev'] >= msft_df['MACD_Signal_Prev']
msft_df['MACD_Cross_Up'] = (macd_above_now & was_at_or_below).astype(int)
msft_df['MACD_Cross_Down'] = (macd_below_now & was_at_or_above).astype(int)

#### Bollinger Bands (Volatility Indicator)

In [18]:
# Bollinger Bands on the close with a 20-bar window and 2 standard deviations.
# The window was previously 2 bars: with only two observations the latest
# close always lies inside mean +/- 2 sigma, so the derived band-breakout
# flags could never fire.
bbands_results = ta.bbands(msft_df['Close'], length=20, std=2)

# Result columns are consumed by position: lower, middle, upper, bandwidth, %B.
bollinger_targets = (
    'Bollinger_Lower',
    'Bollinger_Middle',
    'Bollinger_Upper',
    'Bollinger_Bandwidth_Raw',
    'Bollinger_PercentageB',
)
for target, source in zip(bollinger_targets, bbands_results.columns):
    msft_df[target] = bbands_results[source]

In [19]:
# Rescale the raw bandwidth by 1/100 and flag closes outside either band.
raw_bandwidth = msft_df['Bollinger_Bandwidth_Raw']
msft_df['Bollinger_Bandwidth'] = raw_bandwidth / 100

close_px = msft_df['Close']
closed_above_upper = close_px > msft_df['Bollinger_Upper']
closed_below_lower = close_px < msft_df['Bollinger_Lower']
msft_df['Price_Above_Upper_BB'] = closed_above_upper.astype(int)
msft_df['Price_Below_Lower_BB'] = closed_below_lower.astype(int)

#### Stochastic Oscillator (Momentum Indicator)

In [20]:
# Stochastic oscillator (k=14, d=3); the first two result columns are taken,
# by position, as %K and %D.
stoch_results = ta.stoch(msft_df['High'], msft_df['Low'], msft_df['Close'], k=14, d=3)
msft_df['Stochastic_K'] = stoch_results[stoch_results.columns[0]]
msft_df['Stochastic_D'] = stoch_results[stoch_results.columns[1]]

# Binary flags for the 80 / 20 thresholds on %K. The overbought column was
# misspelled 'Stochsatic_Overbought'; it is now spelled like its sibling so
# prefix-based column selection finds both flags.
msft_df['Stochastic_Overbought'] = (msft_df['Stochastic_K'] > 80).astype(int)
msft_df['Stochastic_Oversold'] = (msft_df['Stochastic_K'] < 20).astype(int)

#### ADX (Trend/Momentum Indicator)

In [21]:
# 14-bar ADX; the first three result columns are copied, by position, into
# ADX_14, Positive_DI and Negative_DI.
adx_results = ta.adx(msft_df['High'], msft_df['Low'], msft_df['Close'], length=14)
adx_targets = ('ADX_14', 'Positive_DI', 'Negative_DI')
for target, source in zip(adx_targets, adx_results.columns):
    msft_df[target] = adx_results[source]

# Binary flag for ADX above 25.
adx_above_25 = msft_df['ADX_14'] > 25
msft_df['Trend_Strong_ADX'] = adx_above_25.astype(int)


  msft_df['ADX_14'] = adx_results[adx_results.columns[0]]
  msft_df['Positive_DI'] = adx_results[adx_results.columns[1]]
  msft_df['Negative_DI'] = adx_results[adx_results.columns[2]]
  msft_df['Trend_Strong_ADX'] = (msft_df['ADX_14'] > 25).astype(int)


## Time Stamps

In [22]:
# Calendar fields read straight off the datetime index.
bar_times = msft_df.index
calendar_fields = {
    'Hour': 'hour',
    'Day_of_Week': 'dayofweek',
    'Day_of_Month': 'day',
    'Month': 'month',
    'Year': 'year',
}
for column_name, index_attr in calendar_fields.items():
    msft_df[column_name] = getattr(bar_times, index_attr)

# ISO week number, cast to a plain int.
iso_week = bar_times.isocalendar().week
msft_df['Week_of_Year'] = iso_week.astype(int)

  msft_df['Hour'] = msft_df.index.hour
  msft_df['Day_of_Week'] = msft_df.index.dayofweek
  msft_df['Day_of_Month'] = msft_df.index.day
  msft_df['Month'] = msft_df.index.month
  msft_df['Year'] = msft_df.index.year
  msft_df['Week_of_Year'] = msft_df.index.isocalendar().week.astype(int)


In [None]:
# Sine/cosine encodings so hour and weekday are represented on a circle.
two_pi = 2 * np.pi
for field, cycle_len in (('Hour', 24), ('Day_of_Week', 7)):
    phase = two_pi * msft_df[field] / cycle_len
    msft_df[f'{field}_sin'] = np.sin(phase)
    msft_df[f'{field}_cos'] = np.cos(phase)

# One-hot encode weekday and month. get_dummies replaces the source column,
# so this cell needs 'Day_of_Week' and 'Month' to still be present.
for field, dummy_prefix in (('Day_of_Week', 'Day'), ('Month', 'Month')):
    msft_df = pd.get_dummies(msft_df, columns=[field], prefix=dummy_prefix, drop_first=False)

  msft_df['Hour_sin'] = np.sin(2 * np.pi * msft_df['Hour'] / 24)
  msft_df['Hour_cos'] = np.cos(2 * np.pi * msft_df['Hour'] / 24)
  msft_df['Day_of_Week_sin'] = np.sin(2 * np.pi * msft_df['Day_of_Week'] / 7)
  msft_df['Day_of_Week_cos'] = np.cos(2 * np.pi * msft_df['Day_of_Week'] / 7)


## Volume Based

#### Moving Average

In [26]:
# Rolling mean volume, bar-over-bar volume change, and volume relative to its
# 20-bar average.
volume = msft_df['Volume']
for n in (5, 10, 20):
    msft_df[f'Volume_SMA_{n}H'] = volume.rolling(window=n).mean()
msft_df['Volume_Change'] = volume.pct_change()
msft_df['Volume_Ratio_20H'] = volume / msft_df['Volume_SMA_20H']

#### On-Balance Volume (Momentum Indicator)

In [None]:
# On-balance volume from close and volume, plus a 9-bar EMA of it.
obv_series = ta.obv(msft_df['Close'], msft_df['Volume'])
msft_df['OBV'] = obv_series
obv_smoother = msft_df['OBV'].ewm(span=9, adjust=False)
msft_df['OBV_EMA_9H'] = obv_smoother.mean()

#### Price Action Patterns

In [None]:
# Bar range (high - low) and signed body (close - open), in price units and
# as percentages of the close and the open respectively.
bar_range = msft_df['High'] - msft_df['Low']
bar_body = msft_df['Close'] - msft_df['Open']
msft_df['High_Low_Range'] = bar_range
msft_df['Open_Close_Range'] = bar_body
msft_df['High_Low_Range_Pct'] = bar_range / msft_df['Close'] * 100
msft_df['Open_Close_Range_Pct'] = bar_body / msft_df['Open'] * 100

### Other

In [30]:
# Interaction feature: RSI multiplied element-wise by the bar's volume.
rsi_values = msft_df['RSI_14']
msft_df['RSI_x_Volume'] = rsi_values.mul(msft_df['Volume'])

In [31]:
# (rows, columns) of the feature frame after all engineering steps.
msft_df.shape

(3487, 138)

In [34]:
# (rows, columns) once rows containing any NaN are dropped; the frame itself
# is left unchanged because the dropna() result is not assigned.
msft_df.dropna().shape

(3347, 138)

In [None]:
# decision tree - for feature importance
# human intelligence -

SyntaxError: invalid syntax (2139997356.py, line 1)