In [2]:
import os
from pathlib import Path
import pandas as pd

## $\text{Data Loading}$

In [3]:
# This notebook is expected to run from the 'notebooks' directory, so the dataset
# is looked up in the sibling 'data' directory one level above the working directory.
data_path = Path.cwd().parent / "data" / "HistoricalPrices.csv"
spx_500 = pd.read_csv(data_path)

## $\text{Data Exploration}$

In [4]:
# (rows, columns) of the raw frame as loaded from the CSV
spx_500.shape

(2768, 5)

In [5]:
# Per-column dtype and non-null count, to spot missing values and mistyped columns
spx_500.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2768 entries, 0 to 2767
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2768 non-null   object 
 1    Open   2768 non-null   float64
 2    High   2768 non-null   float64
 3    Low    2768 non-null   float64
 4    Close  2768 non-null   float64
dtypes: float64(4), object(1)
memory usage: 108.3+ KB


In [6]:
# Count rows that are exact duplicates of an earlier row
spx_500.duplicated().sum()

np.int64(0)

### $\text{Observation}$
$\text{The historical S\&P 500 index dataset has 2768 entries, with no missing data in any column.}$  <br>
$\text{The dataframe columns \textbf{\textit{Open}}, \textbf{\textit{High}},  \textbf{\textit{Low}}, and \textbf{\textit{Close}} are properly formatted to the correct data type \textbf{float64}, but \textbf{\textit{Date}} is not in a \textit{datetime} format.}$

In [7]:
# Preview the raw frame; the rows arrive newest-first and `Date` is still a string
spx_500

Unnamed: 0,Date,Open,High,Low,Close
0,09/30/24,5726.52,5765.14,5703.53,5762.48
1,09/27/24,5755.36,5763.78,5727.34,5738.17
2,09/26/24,5762.22,5767.37,5721.01,5745.37
3,09/25/24,5733.65,5741.03,5712.06,5722.26
4,09/24/24,5727.66,5735.32,5698.99,5732.93
...,...,...,...,...,...
2763,10/07/13,1687.15,1687.15,1674.70,1676.12
2764,10/04/13,1678.79,1691.94,1677.33,1690.50
2765,10/03/13,1692.35,1692.35,1670.36,1678.66
2766,10/02/13,1691.90,1693.87,1680.34,1693.87


In [8]:
# Parse the US-style "MM/DD/YY" strings into datetime values.
# The explicit `format` alone decides how the fields are read, so the contradictory
# (and, with `format` given, ineffective) `yearfirst=True` hint is not passed.
spx_500["Date"] = pd.to_datetime(spx_500["Date"], format="%m/%d/%y")

In [9]:
# Reorder the frame chronologically (oldest session first) and renumber the rows from 0
spx_500.sort_values("Date", inplace=True, ignore_index=True)

In [10]:
# CHECK: the dates must be monotonically increasing after the sort
spx_500["Date"].is_monotonic_increasing

True

In [11]:
# Preview the frame after date parsing and sorting (oldest session first)
spx_500

Unnamed: 0,Date,Open,High,Low,Close
0,2013-10-01,1682.41,1696.55,1682.07,1695.00
1,2013-10-02,1691.90,1693.87,1680.34,1693.87
2,2013-10-03,1692.35,1692.35,1670.36,1678.66
3,2013-10-04,1678.79,1691.94,1677.33,1690.50
4,2013-10-07,1687.15,1687.15,1674.70,1676.12
...,...,...,...,...,...
2763,2024-09-24,5727.66,5735.32,5698.99,5732.93
2764,2024-09-25,5733.65,5741.03,5712.06,5722.26
2765,2024-09-26,5762.22,5767.37,5721.01,5745.37
2766,2024-09-27,5755.36,5763.78,5727.34,5738.17


## $\text{Preprocessing \& Feature Engineering}$


$\text{The following features, taken or derived from the historical SPX data, are considered:}$

- $\text{50 and 200-day Moving Averages (MOV\_AVG 50/200D)}$
- $\text{14-day Relative Strength Index (RSI\_14D)}$
- $\text{Open and Closing Prices (PX\_OPEN/CLOSE)}$
- $\text{High and Low Prices (PX\_HIGH/LOW)}$
- $\text{Daily Price High-Low Difference (PX\_HIGH\_LOW\_DIFFERENCE)}$
- $\text{Daily Volume (PX\_VOLUME)}$
- $\text{30-day Volatility (VOLATILITY\_30D)}$
- $\text{Beta (BETA\_ADJ\_OVERRIDABLE)}$
  
$\text{Along with the historical SPX data, the following additional metrics are considered:}$

- $\textbf{SPX Ratios:}$
  + $\text{Price-to-Earnings Ratio (PE\_RATIO)}$
  + $\text{Price-to-Book Ratio (PX\_TO\_BOOK\_RATIO)}$
  + $\text{Price-to-Sales Ratio (PX\_TO\_SALES\_RATIO)}$
  + $\text{Earnings Yield (EARN\_YLD)}$
  
- $\textbf{Market Metrics:}$
  + $\text{Volatility Index (VIX)}$
  + $\text{10-Year Treasury Yield (USGG10YR)}$
  + $\text{NAPM Manufacturing PMI (NAPMPMI)}$
  + $\text{Consumer Confidence Index (CONCCONF)}$

In [12]:
# Inspect the raw column labels (the output shows leading spaces on the price columns)
spx_500.columns

Index(['Date', ' Open', ' High', ' Low', ' Close'], dtype='object')

In [13]:
# FIX: The price column labels (`Open`, `High`, `Low`, `Close`) carry leading whitespace;
# strip every column label in place.
spx_500.rename(mapper=str.strip, axis="columns", inplace=True)

In [14]:
# FIX: Relabel the price columns with the names used in the research paper
col_names_to_sub = dict(
    Open="PX_OPEN",
    Close="PX_CLOSE",
    High="PX_HIGH",
    Low="PX_LOW",
)
spx_500.rename(mapper=col_names_to_sub, axis="columns", inplace=True)

In [15]:
# Confirm the column labels after stripping and renaming
spx_500.columns

Index(['Date', 'PX_OPEN', 'PX_HIGH', 'PX_LOW', 'PX_CLOSE'], dtype='object')

In [16]:
# NEW COLUMN (PX_HIGH_LOW_DIFFERENCE): daily range, i.e. `PX_HIGH` minus `PX_LOW`
spx_500["PX_HIGH_LOW_DIFFERENCE"] = spx_500["PX_HIGH"].sub(spx_500["PX_LOW"])

In [17]:
# Preview the first rows to check the new high-low difference column
spx_500.head()

Unnamed: 0,Date,PX_OPEN,PX_HIGH,PX_LOW,PX_CLOSE,PX_HIGH_LOW_DIFFERENCE
0,2013-10-01,1682.41,1696.55,1682.07,1695.0,14.48
1,2013-10-02,1691.9,1693.87,1680.34,1693.87,13.53
2,2013-10-03,1692.35,1692.35,1670.36,1678.66,21.99
3,2013-10-04,1678.79,1691.94,1677.33,1690.5,14.61
4,2013-10-07,1687.15,1687.15,1674.7,1676.12,12.45


### $\text{Moving Average (50\&200-days)}$

$\textbf{formula:}$
$$\text{MOV\_AVG}=\frac{1}{N}\sum_{i=0}^{N-1}\textit{Close Price}_{i}$$

In [18]:
# NEW COLUMN (MOVING_AVG_50D): mean closing price over a trailing window of 50 SPX market days
# (rows without a full 50-day window are left as NaN)
spx_500["MOVING_AVG_50D"] = spx_500["PX_CLOSE"].rolling(window=50, min_periods=50).mean()

In [19]:
# NEW COLUMN (MOVING_AVG_200D): mean closing price over a trailing window of 200 SPX market days
# (rows without a full 200-day window are left as NaN)
spx_500["MOVING_AVG_200D"] = spx_500["PX_CLOSE"].rolling(window=200, min_periods=200).mean()

### $\text{Relative Strength Index (14-days)}$

$\textbf{formula:}$

$$\text{RSI}=100-\left(\frac{100}{1+RS}\right)$$

$\textbf{where}$

$$\text{RS}=\frac{\text{Average Gain Over 14 days}}{\text{Average Loss Over 14 days}}$$



In [20]:
def compute_rsi(df: "pd.DataFrame | None" = None, window: int = 14):
    """Add a Relative Strength Index column to ``df`` in place.

    RSI = 100 - 100 / (1 + RS), where RS is the simple rolling mean of the
    gains divided by the simple rolling mean of the losses (as positive
    magnitudes) over ``window`` consecutive closing-price changes.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a ``PX_CLOSE`` column ordered oldest to newest. When
        omitted, the notebook-level ``spx_500`` frame is used; it is looked up
        at call time so that a re-loaded frame is picked up.
    window : int, default 14
        Number of consecutive price changes averaged.

    Returns
    -------
    None
        The result is written to ``df["RSI_{window}D"]``. The first ``window``
        rows are NaN (a full window needs ``window`` price changes, i.e.
        ``window + 1`` prices). A window with no losses yields 100; a window
        with neither gains nor losses yields NaN.
    """
    if df is None:
        # Late binding: a default of `spx_500` in the signature would be frozen
        # to whatever object existed when this cell was executed.
        df = spx_500

    delta = df["PX_CLOSE"].diff()

    # `clip` keeps the undefined first difference as NaN instead of counting it
    # as a zero move, and losses are negated so both averages are non-negative.
    gains = delta.clip(lower=0)
    losses = (-delta).clip(lower=0)

    avg_gain = gains.rolling(window=window).mean()
    avg_loss = losses.rolling(window=window).mean()

    RS = avg_gain / avg_loss
    df[f"RSI_{window}D"] = 100 - (100 / (1 + RS))

In [21]:
# Preview the frame with the engineered columns.
# NOTE(review): the output below shows no RSI column — `compute_rsi` does not appear
# to have been called before this cell; confirm whether that is intended.
spx_500

Unnamed: 0,Date,PX_OPEN,PX_HIGH,PX_LOW,PX_CLOSE,PX_HIGH_LOW_DIFFERENCE,MOVING_AVG_50D,MOVING_AVG_200D
0,2013-10-01,1682.41,1696.55,1682.07,1695.00,14.48,,
1,2013-10-02,1691.90,1693.87,1680.34,1693.87,13.53,,
2,2013-10-03,1692.35,1692.35,1670.36,1678.66,21.99,,
3,2013-10-04,1678.79,1691.94,1677.33,1690.50,14.61,,
4,2013-10-07,1687.15,1687.15,1674.70,1676.12,12.45,,
...,...,...,...,...,...,...,...,...
2763,2024-09-24,5727.66,5735.32,5698.99,5732.93,36.33,5522.2118,5211.58160
2764,2024-09-25,5733.65,5741.03,5712.06,5722.26,28.97,5523.3130,5217.26495
2765,2024-09-26,5762.22,5767.37,5721.01,5745.37,46.36,5526.4550,5222.96995
2766,2024-09-27,5755.36,5763.78,5727.34,5738.17,36.44,5530.3266,5228.54860


In [25]:
# Persist the engineered frame alongside the raw data, without writing the row index.
# NOTE(review): no call to `compute_rsi` is visible before this cell, so the saved
# file may lack the RSI column — confirm against the cells not shown here.
spx_500.to_csv(path_or_buf="../data/PreprocessedHistoricalPrices.csv", index=False)