In [26]:
import numpy as np
import pandas as pd
import os

In [27]:
import warnings
warnings.filterwarnings("ignore")

# Read raw data

In [28]:
dir_main = os.getcwd()
dir_raw_data = os.path.join(dir_main)

## S&P500

In [29]:
sp = pd.read_csv(os.path.join(dir_raw_data, "sp500.csv"),
                sep=",",
                header=0,
                dtype={"Date": object, "Close/Last": float,
                       "Open": float, "High": float, "Low": float})

In [30]:
# Rename columns for readability
sp.rename(columns={"Date": 'date', "Close/Last": 'close',
                   "Open": 'open', "High": 'high', "Low": 'low'},
                   inplace=True)

# Set date column's data type
sp['date'] = pd.to_datetime(sp['date'])

# Sort rows by ascending order of date
sp = sp.sort_values('date')

In [31]:
# Count null values
sp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2536 entries, 2535 to 0
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2536 non-null   datetime64[ns]
 1   close   2536 non-null   float64       
 2   open    2536 non-null   float64       
 3   high    2536 non-null   float64       
 4   low     2536 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 118.9 KB


In [32]:
# Confirm first and last date
print("First date:\t", sp['date'].iloc[0], sep="")
print("Latest date:\t", sp['date'].iloc[-1], sep="")

First date:	2013-11-13 00:00:00
Latest date:	2023-11-10 00:00:00


## Effective Federal Reserve Fund Rate

In [33]:
effr = pd.read_csv(os.path.join(dir_raw_data, "EFFR.csv"),
                   sep=",",
                   header=0,
                   dtype={"DATE": object, "EFFR": float},
                   na_values=["."]
                   )

In [34]:
# Rename columns for readability
effr.rename(columns={"DATE": 'date', "EFFR": 'effr'}, inplace=True)

# Set date column's data type
effr['date'] = pd.to_datetime(effr['date'])

# Sort rows by ascending order of date
effr = effr.sort_values('date')

In [35]:
# Count null values
effr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24733 entries, 0 to 24732
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    24733 non-null  datetime64[ns]
 1   effr    24712 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 386.6 KB


In [36]:
# Get list of null dates
null_dates = effr['date'][effr['effr'].isna()]

In [37]:
# Confirm first and last date
print("First date:\t", effr['date'].iloc[0], sep="")
print("Latest date:\t", effr['date'].iloc[-1], sep="")

First date:	1954-07-01 00:00:00
Latest date:	2023-11-30 00:00:00


## US GDP

In [38]:
gdp = pd.read_csv(os.path.join(dir_raw_data, "GDP.csv"),
                  sep=",",
                  header=0,
                  dtype={"DATE": object, "GDP": float})

In [39]:
# Rename columns for readability
gdp.rename(columns={"DATE": "date", "GDP": "gdp"}, inplace=True)

# Set date column's data type
gdp['date'] = pd.to_datetime(gdp['date'])

# Sort rows by ascending order of date
gdp = gdp.sort_values('date')

In [40]:
# Count null values
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    307 non-null    datetime64[ns]
 1   gdp     307 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.9 KB


In [41]:
# Confirm first and last date
print("First date:\t", gdp['date'].iloc[0], sep="")
print("Latest date:\t", gdp['date'].iloc[-1], sep="")

First date:	1947-01-01 00:00:00
Latest date:	2023-07-01 00:00:00


## MSCI ACWI ETF

In [42]:
acwi = pd.read_csv(os.path.join(dir_raw_data, "ACWI.csv"),
                  sep=",",
                  header=0,
                  dtype={"Date": object, "Open": float, "High": float,
                         "Low": float, "Close": float,
                         "Adj Close": float, "Volume": float})

In [43]:
# Remove unnecessary columns
acwi = acwi[['Date', 'Adj Close']]

# Rename columns for readability
acwi.rename(columns={"Date": "date", "Adj Close": "acwi_adj_close"}, inplace=True)

# Set date column's data type
acwi['date'] = pd.to_datetime(acwi['date'])

# Sort rows by ascending order of date
acwi = acwi.sort_values('date')

In [44]:
acwi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            251 non-null    datetime64[ns]
 1   acwi_adj_close  251 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 4.1 KB


In [45]:
# Confirm first and last date
print("First date:\t", acwi['date'].iloc[0], sep="")
print("Latest date:\t", acwi['date'].iloc[-1], sep="")

First date:	2022-11-14 00:00:00
Latest date:	2023-11-13 00:00:00


## Dataset summary

#### S&P500 Dataset
* 2538 rows
* From `2013-11-11` to `2023-11-10`
* `Daily` basis
* Schema: `['date', 'close', 'open', 'high', 'low']`
* no null value

#### Effective Federal Reserve Fund Rate
* 2608 rows
* From `2013-11-12` to `2023-11-09`
* `Daily` basis
* Schema: `['date', 'effr']`
* effr has 96 null values for holidays or other days.
    * These dates will be removed from every datasets.

#### US GDP
* 40 rows
* From `2013-10-01` to `2023-07-01`
* `Quarterly` basis
    * Latest quarterly GDP value will be used for each given date.
* Schema: `['date', 'gdp']`
* no null value

#### MSCI ACWI ETF
* 2547 rows
* From `2013-10-01` to `2023-11-10`
* `Daily` basis
* Schema: `['date', 'acwi_adj_close']`
* no null value

# Combine(Join) Dataset

## Schema of cleaned data
`['date', 'close', 'open', 'high', 'low', 'effr', 'gdp', 'acwi_adj_close']`
* `date`: every date in `S&P500`, but from `2013-11-12` to `2023-11-09`
    * The earliest and latest dates are bounded by EFFR data
* `close`, `open`, `high`, `low`: data from `S&P500`
* `effr`: data from `Effective Federal Reserve Fund Rate`
* `gdp`: data from `US GDP`
* `acwi_adj_close`: Adjusted closing price from `Adjusted Close` in `MSCI ACWI`


In [46]:
# Generate valid date pd.series
valid_date = sp['date']
mask_validDate = (valid_date >= pd.to_datetime('2013-11-12')) &\
                 (valid_date <= pd.to_datetime('2023-11-09')) &\
                 (~valid_date.isin(null_dates))
valid_date = valid_date[mask_validDate]

In [47]:
# Change US GDP from quarterly to daily values


In [48]:
# Merge with S&P500
dataset = pd.merge(valid_date, sp, on='date', how='inner')

# Merge with US GDP; apply latest quarterly GDP at any given date
dataset = pd.merge_asof(dataset, gdp, on='date', direction='backward')

# Merge with Effective Federal Fund Rate
dataset = pd.merge(dataset, effr, on='date', how='inner')

# Merge with MSCI ACWI ETF
dataset = pd.merge(dataset, acwi, on='date', how='inner')

### Normalize each column

In [49]:
from sklearn.preprocessing import StandardScaler

In [50]:
# Standardization for every column except date
scaler = StandardScaler()
columns_to_standardize = dataset.columns.drop('date')
dataset[columns_to_standardize] = scaler.fit_transform(dataset[columns_to_standardize])

In [51]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            248 non-null    datetime64[ns]
 1   close           248 non-null    float64       
 2   open            248 non-null    float64       
 3   high            248 non-null    float64       
 4   low             248 non-null    float64       
 5   gdp             248 non-null    float64       
 6   effr            248 non-null    float64       
 7   acwi_adj_close  248 non-null    float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 15.6 KB


## Save/Read Dataset

In [52]:
dir_main = os.getcwd()
dir_prepared_data = os.path.join(dir_main)

### Save dataset

In [53]:
dataset.to_csv(os.path.join(dir_prepared_data, "dataset.csv"),
               index=False)

### Read dataset

In [54]:
dataset = pd.read_csv(os.path.join(dir_prepared_data, "dataset.csv"),
                      sep=",",
                      header=0,
                      dtype={"date": object, "close": float, "open": float,
                             "high": float, "low": float,
                             "gdp": float, "effr": float,
                             "acwi_adj_close": float})
dataset['date'] = pd.to_datetime(dataset['date'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            248 non-null    datetime64[ns]
 1   close           248 non-null    float64       
 2   open            248 non-null    float64       
 3   high            248 non-null    float64       
 4   low             248 non-null    float64       
 5   gdp             248 non-null    float64       
 6   effr            248 non-null    float64       
 7   acwi_adj_close  248 non-null    float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 15.6 KB


Linear Regression Model 

In [56]:
import statsmodels.formula.api as smf 
small_interaction_term_model = smf.ols(formula='close~open+high+effr+low+gdp+acwi_adj_close',
                                       data=dataset).fit()
small_interaction_term_model.summary().tables[1]


0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.722e-16,0.004,4.74e-14,1.000,-0.007,0.007
open,-0.6562,0.049,-13.360,0.000,-0.753,-0.559
high,0.7484,0.049,15.171,0.000,0.651,0.846
effr,-0.0009,0.009,-0.090,0.928,-0.020,0.018
low,0.8304,0.052,16.023,0.000,0.728,0.933
gdp,0.0099,0.011,0.868,0.386,-0.013,0.032
acwi_adj_close,0.0673,0.018,3.746,0.000,0.032,0.103


Elastic net Lin reg 

In [57]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(dataset,
                                     test_size=0.1,
                                     random_state=102)
X_train = df_train.drop(['close','date'], axis=1)
y_train=df_train['close']
X_test = df_test.drop(['close','date'], axis=1)
y_test=df_test['close']


0.01

In [58]:
from sklearn.linear_model import ElasticNet
en_mod_1 = ElasticNet(alpha=.01, l1_ratio=0.7)
en_mod_1.fit(X_train, y_train)
en_mod_1.score(X_test, y_test)

0.9956621670912759

0.025

In [59]:
en_mod_2 = ElasticNet(alpha=.025, l1_ratio=0.7)
en_mod_2.fit(X_train, y_train)
en_mod_2.score(X_test, y_test)

0.9953441781495647

0.05

In [60]:
en_mod_3 = ElasticNet(alpha=.05, l1_ratio=0.7)
en_mod_3.fit(X_train, y_train)
en_mod_3.score(X_test, y_test)

0.9937317985630174

0.1

In [61]:
en_mod_4 = ElasticNet(alpha=.1, l1_ratio=0.7)
en_mod_4.fit(X_train, y_train)
en_mod_4.score(X_test, y_test)

0.9886807003642093

In [62]:
df_slopes = pd.DataFrame({'lin_reg_mod': small_interaction_term_model, 
                         'en_mod_1': en_mod_1.coef_.T,
                         'en_mod_2': en_mod_2.coef_.T,
                         'en_mod_3': en_mod_3.coef_.T,
                         'en_mod_4': en_mod_4.coef_.T}, index=X_train.columns)
df_slopes

Unnamed: 0,lin_reg_mod,en_mod_1,en_mod_2,en_mod_3,en_mod_4
open,<statsmodels.regression.linear_model.Regressio...,0.0,0.0,0.073872,0.142449
high,<statsmodels.regression.linear_model.Regressio...,0.403721,0.394213,0.341876,0.29069
low,<statsmodels.regression.linear_model.Regressio...,0.415814,0.401804,0.349887,0.296793
gdp,<statsmodels.regression.linear_model.Regressio...,0.005356,0.006238,0.004226,0.0
effr,<statsmodels.regression.linear_model.Regressio...,0.0,0.0,0.0,0.0
acwi_adj_close,<statsmodels.regression.linear_model.Regressio...,0.170072,0.181042,0.194102,0.195458


Log model 

In [63]:
import tensorflow as tf

2023-11-16 20:02:14.849211: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
