In [32]:
import numpy as np
import pandas as pd
import os

In [33]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown for
# the rest of the session.
# NOTE(review): this also hides pandas diagnostics (e.g. chained-assignment
# warnings) raised by later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Read raw data

In [34]:
# Raw CSV files are expected under <current working directory>/dataset/raw
raw_subdirs = ("dataset", "raw")
dir_main = os.getcwd()
dir_raw_data = os.path.join(dir_main, *raw_subdirs)

## S&P500

In [35]:
# Load the raw S&P 500 price history. The date column is kept as plain
# strings here; every price column is read as float.
sp_price_columns = ["Close/Last", "Open", "High", "Low"]
sp = pd.read_csv(
    os.path.join(dir_raw_data, "sp500.csv"),
    sep=",",
    header=0,
    dtype={"Date": object, **{column: float for column in sp_price_columns}},
)

In [36]:
# Clean the S&P 500 frame in one pass:
#   1. rename the columns to short lowercase names,
#   2. parse the date strings into datetimes,
#   3. order the rows from oldest to newest (the original index is kept).
sp = (
    sp
    .rename(columns={"Date": 'date', "Close/Last": 'close',
                     "Open": 'open', "High": 'high', "Low": 'low'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

In [37]:
# Print the frame summary (dtypes and non-null count per column);
# a non-null count below the number of entries reveals missing values.
sp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2538 entries, 2537 to 0
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2538 non-null   datetime64[ns]
 1   close   2538 non-null   float64       
 2   open    2538 non-null   float64       
 3   high    2538 non-null   float64       
 4   low     2538 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 119.0 KB


In [38]:
# Report the dates in the first and last rows
# (assumes `sp` is already sorted by date, so these are the range limits)
sp_first_date, sp_last_date = sp['date'].iloc[[0, -1]]
print("First date:\t", sp_first_date, sep="")
print("Latest date:\t", sp_last_date, sep="")

First date:	2013-11-11 00:00:00
Latest date:	2023-11-10 00:00:00


## Effective Federal Funds Rate (EFFR)

In [39]:
# Load the raw EFFR series. A lone "." in the file is read as a missing value.
effr_csv_path = os.path.join(dir_raw_data, "EFFR.csv")
effr = pd.read_csv(
    effr_csv_path,
    sep=",",
    header=0,
    na_values=["."],
    dtype={"DATE": object, "EFFR": float},
)

In [40]:
# Clean the EFFR frame: lowercase column names, datetime dates,
# rows ordered from oldest to newest.
effr = (
    effr
    .rename(columns={"DATE": 'date', "EFFR": 'effr'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

In [41]:
# Print the frame summary (dtypes and non-null count per column);
# a non-null count below the number of entries reveals missing values.
effr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2608 entries, 0 to 2607
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2608 non-null   datetime64[ns]
 1   effr    2512 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 40.9 KB


In [42]:
# Collect the dates whose EFFR value is missing (NaN)
effr_is_missing = effr['effr'].isna()
null_dates = effr.loc[effr_is_missing, 'date']

In [43]:
# Report the dates in the first and last rows
# (assumes `effr` is already sorted by date, so these are the range limits)
effr_first_row_date, effr_last_row_date = effr['date'].iloc[[0, -1]]
print("First date:\t", effr_first_row_date, sep="")
print("Latest date:\t", effr_last_row_date, sep="")

First date:	2013-11-12 00:00:00
Latest date:	2023-11-09 00:00:00


## US GDP

In [44]:
# Load the raw US GDP series; dates stay as strings, GDP is read as float.
gdp_csv_path = os.path.join(dir_raw_data, "GDP.csv")
gdp_column_types = {"DATE": object, "GDP": float}
gdp = pd.read_csv(gdp_csv_path, sep=",", header=0, dtype=gdp_column_types)

In [45]:
# Clean the GDP frame: lowercase column names, datetime dates,
# rows ordered from oldest to newest.
gdp = (
    gdp
    .rename(columns={"DATE": "date", "GDP": "gdp"})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

In [46]:
# Print the frame summary (dtypes and non-null count per column);
# a non-null count below the number of entries reveals missing values.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    40 non-null     datetime64[ns]
 1   gdp     40 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 772.0 bytes


In [47]:
# Report the dates in the first and last rows
# (assumes `gdp` is already sorted by date, so these are the range limits)
gdp_first_date, gdp_last_date = gdp['date'].iloc[[0, -1]]
print("First date:\t", gdp_first_date, sep="")
print("Latest date:\t", gdp_last_date, sep="")

First date:	2013-10-01 00:00:00
Latest date:	2023-07-01 00:00:00


## MSCI ACWI ETF

In [48]:
# Load the raw MSCI ACWI ETF history; the date column stays as strings and
# every other column is read as float.
acwi_float_columns = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
acwi = pd.read_csv(
    os.path.join(dir_raw_data, "ACWI.csv"),
    sep=",",
    header=0,
    dtype={"Date": object, **{column: float for column in acwi_float_columns}},
)

In [49]:
# Clean the ACWI frame as one chain in which every step returns a new frame:
#   1. keep only the date and the opening price,
#   2. rename them to 'date' / 'acwi_open',
#   3. parse the date strings into datetimes,
#   4. order the rows from oldest to newest.
# The column subset is never modified in place (no `inplace=True`, no column
# assignment on the slice), which is what triggers pandas'
# SettingWithCopyWarning when a selection of another frame is mutated.
acwi = (
    acwi[['Date', 'Open']]
    .rename(columns={"Date": "date", "Open": "acwi_open"})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

In [50]:
# Print the frame summary (dtypes and non-null count per column);
# a non-null count below the number of entries reveals missing values.
acwi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2547 entries, 0 to 2546
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       2547 non-null   datetime64[ns]
 1   acwi_open  2547 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 39.9 KB


In [51]:
# Report the dates in the first and last rows
# (assumes `acwi` is already sorted by date, so these are the range limits)
acwi_first_date, acwi_last_date = acwi['date'].iloc[[0, -1]]
print("First date:\t", acwi_first_date, sep="")
print("Latest date:\t", acwi_last_date, sep="")

First date:	2013-10-01 00:00:00
Latest date:	2023-11-10 00:00:00


## Dataset summary

#### S&P500 Dataset
* 2538 rows
* From `2013-11-11` to `2023-11-10`
* `Daily` basis
* Schema: `['date', 'close', 'open', 'high', 'low']`
* no null value

#### Effective Federal Funds Rate (EFFR)
* 2608 rows
* From `2013-11-12` to `2023-11-09`
* `Daily` basis
* Schema: `['date', 'effr']`
* `effr` has 96 null values, on holidays or other days.
    * These dates will be removed from every dataset.

#### US GDP
* 40 rows
* From `2013-10-01` to `2023-07-01`
* `Quarterly` basis
    * Latest quarterly GDP value will be used for each given date.
* Schema: `['date', 'gdp']`
* no null value

#### MSCI ACWI ETF
* 2547 rows
* From `2013-10-01` to `2023-11-10`
* `Daily` basis
* Schema: `['date', 'acwi_open']`
* no null value

# Combine (Join) Datasets

## Schema of cleaned data
`['date', 'close', 'open', 'high', 'low', 'gdp', 'effr', 'acwi_open']`
* `date`: every date in `S&P500`, but from `2013-11-12` to `2023-11-09`
    * The earliest and latest dates are bounded by EFFR data
* `close`, `open`, `high`, `low`: data from `S&P500`
* `effr`: data from `Effective Federal Funds Rate`
* `gdp`: data from `US GDP`
* `acwi_open`: Opening price from `Open` in `MSCI ACWI`

In [52]:
# Build the Series of dates that end up in the combined dataset:
# S&P 500 trading days that
#   * lie inside the period covered by the EFFR series, and
#   * are not among the dates whose EFFR value is missing (`null_dates`).
# The period limits are read from `effr` itself rather than typed in by hand,
# so the filter stays correct when the raw files are refreshed.
effr_first_date = effr['date'].min()
effr_last_date = effr['date'].max()

valid_date = sp['date']
# `between` is inclusive on both ends, matching the former >= / <= comparison
mask_validDate = (
    valid_date.between(effr_first_date, effr_last_date)
    & ~valid_date.isin(null_dates)
)
valid_date = valid_date[mask_validDate]

In [53]:
# Quarterly US GDP is expanded to daily values by `pd.merge_asof` in the next cell


In [54]:
# Assemble the combined dataset, one join per source, all keyed on 'date'.
dataset = (
    # S&P 500 rows restricted to the valid dates
    valid_date.to_frame()
    .merge(sp, on='date', how='inner')
    # US GDP: for each day take the latest quarterly row dated on or before it
    .pipe(pd.merge_asof, gdp, on='date', direction='backward')
    # Effective Federal Funds Rate published for the same day
    .merge(effr, on='date', how='inner')
    # MSCI ACWI ETF opening price for the same day
    .merge(acwi, on='date', how='inner')
)

### Standardize each column

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
# Standardize every column except the date: the scaler is fitted on the
# selected columns and the scaled values are written back into `dataset`.
columns_to_standardize = dataset.columns.drop('date')
scaler = StandardScaler()
standardized_values = scaler.fit_transform(dataset[columns_to_standardize])
dataset[columns_to_standardize] = standardized_values

In [57]:
# Print the summary of the combined frame (row count, dtypes and non-null
# count per column) to check that no missing values remain after the joins.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498 entries, 0 to 2497
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       2498 non-null   datetime64[ns]
 1   close      2498 non-null   float64       
 2   open       2498 non-null   float64       
 3   high       2498 non-null   float64       
 4   low        2498 non-null   float64       
 5   gdp        2498 non-null   float64       
 6   effr       2498 non-null   float64       
 7   acwi_open  2498 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 156.3 KB


## Save/Read Dataset

In [58]:
# The prepared dataset lives under <current working directory>/dataset/prepared
prepared_subdirs = ("dataset", "prepared")
dir_main = os.getcwd()
dir_prepared_data = os.path.join(dir_main, *prepared_subdirs)

### Save dataset

In [59]:
# Write the combined dataset to <dir_prepared_data>/dataset.csv without the
# index column. The output folder is created first: `to_csv` does not create
# missing directories, so on a fresh checkout the save would otherwise fail.
os.makedirs(dir_prepared_data, exist_ok=True)
dataset.to_csv(os.path.join(dir_prepared_data, "dataset.csv"),
               index=False)

### Read dataset

In [60]:
# Reload the prepared dataset from disk: the date is read as a string and
# converted afterwards, every other column is read as float.
prepared_float_columns = ["close", "open", "high", "low",
                          "gdp", "effr", "acwi_open"]
prepared_dtypes = {"date": object}
prepared_dtypes.update({column: float for column in prepared_float_columns})

dataset = pd.read_csv(
    os.path.join(dir_prepared_data, "dataset.csv"),
    sep=",",
    header=0,
    dtype=prepared_dtypes,
)
dataset['date'] = pd.to_datetime(dataset['date'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498 entries, 0 to 2497
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       2498 non-null   datetime64[ns]
 1   close      2498 non-null   float64       
 2   open       2498 non-null   float64       
 3   high       2498 non-null   float64       
 4   low        2498 non-null   float64       
 5   gdp        2498 non-null   float64       
 6   effr       2498 non-null   float64       
 7   acwi_open  2498 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 156.3 KB


# Linear regression

In [62]:
import statsmodels.formula.api as smf

# Ordinary least squares with `close` as the response and every other
# non-date column of `dataset` as a regressor.
full_formula = 'close~open+high+effr+low+gdp+acwi_open'
lin_reg_model = smf.ols(formula=full_formula, data=dataset).fit()
lin_reg_model.summary()


0,1,2,3
Dep. Variable:,close,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,2418000.0
Date:,"Wed, 29 Nov 2023",Prob (F-statistic):,0.0
Time:,22:11:59,Log-Likelihood:,7284.5
No. Observations:,2498,AIC:,-14560.0
Df Residuals:,2491,BIC:,-14510.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.518e-16,0.000,5.78e-13,1.000,-0.001,0.001
open,-0.7196,0.016,-44.126,0.000,-0.752,-0.688
high,0.9034,0.016,57.440,0.000,0.873,0.934
effr,-0.0008,0.001,-1.183,0.237,-0.002,0.000
low,0.8136,0.012,65.486,0.000,0.789,0.838
gdp,0.0017,0.002,0.853,0.394,-0.002,0.006
acwi_open,0.0013,0.002,0.594,0.552,-0.003,0.006

0,1,2,3
Omnibus:,456.021,Durbin-Watson:,2.096
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8386.911
Skew:,0.306,Prob(JB):,0.0
Kurtosis:,11.956,Cond. No.,179.0


In [63]:
# Reduced specification: same response, but without the `high` and `low`
# regressors. Note that this rebinds `lin_reg_model` to the new fit.
reduced_formula = 'close~open+effr+gdp+acwi_open'
lin_reg_model = smf.ols(formula=reduced_formula, data=dataset).fit()
lin_reg_model.summary()

0,1,2,3
Dep. Variable:,close,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.999
Method:,Least Squares,F-statistic:,646000.0
Date:,"Wed, 29 Nov 2023",Prob (F-statistic):,0.0
Time:,22:11:59,Log-Likelihood:,5129.3
No. Observations:,2498,AIC:,-10250.0
Df Residuals:,2493,BIC:,-10220.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.518e-16,0.001,2.44e-13,1.000,-0.001,0.001
open,0.9542,0.008,116.931,0.000,0.938,0.970
effr,-0.0052,0.001,-3.758,0.000,-0.008,-0.002
gdp,0.0238,0.004,5.380,0.000,0.015,0.033
acwi_open,0.0256,0.005,4.931,0.000,0.015,0.036

0,1,2,3
Omnibus:,324.47,Durbin-Watson:,2.114
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1993.425
Skew:,-0.445,Prob(JB):,0.0
Kurtosis:,7.285,Cond. No.,29.6
