In [4]:
import numpy as np
import pandas as pd
import os

In [5]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning
# raised by later cells — confirm blanket suppression is intended.
warnings.filterwarnings("ignore")

In [6]:
# Raw input files live in <working directory>/dataset/raw
dir_main = os.getcwd()
dir_raw_data = os.path.join(os.path.join(dir_main, "dataset"), "raw")

In [7]:
# Load the raw S&P 500 file; the date column is kept as text here and
# converted to datetime in the next cell, price columns are read as floats.
sp = pd.read_csv(
    os.path.join(dir_raw_data, "sp500.csv"),
    sep=",",
    header=0,
    dtype={
        "Date": object,
        **dict.fromkeys(["Close/Last", "Open", "High", "Low"], float),
    },
)

In [8]:
# Single pipeline: readable column names -> datetime dates -> ascending date order
sp = (
    sp.rename(columns={"Date": 'date', "Close/Last": 'close',
                       "Open": 'open', "High": 'high', "Low": 'low'})
      .assign(date=lambda frame: pd.to_datetime(frame['date']))
      .sort_values('date')
)

In [9]:
# Count null values: info() reports the non-null count and dtype of each column
sp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2538 entries, 2537 to 0
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2538 non-null   datetime64[ns]
 1   close   2538 non-null   float64       
 2   open    2538 non-null   float64       
 3   high    2538 non-null   float64       
 4   low     2538 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 119.0 KB


In [10]:
# Report the date range covered; rows were sorted by date, so the first
# and last positions hold the earliest and latest dates
for label, position in (("First date:\t", 0), ("Latest date:\t", -1)):
    print(label, sp['date'].iloc[position], sep="")

First date:	2013-11-11 00:00:00
Latest date:	2023-11-10 00:00:00


In [11]:
# Load the raw EFFR file; a lone "." in the file is read as a missing value
effr = pd.read_csv(
    os.path.join(dir_raw_data, "EFFR.csv"),
    sep=",",
    header=0,
    na_values=["."],
    dtype={"DATE": object, "EFFR": float},
)

In [12]:
# Single pipeline: readable column names -> datetime dates -> ascending date order
effr = (
    effr.rename(columns={"DATE": 'date', "EFFR": 'effr'})
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values('date')
)

In [13]:
# Count null values: info() reports the non-null count and dtype of each column
effr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2608 entries, 0 to 2607
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    2608 non-null   datetime64[ns]
 1   effr    2512 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 61.1 KB


In [14]:
# Dates on which the EFFR value is missing (used later to drop those days)
null_dates = effr.loc[effr['effr'].isna(), 'date']

In [15]:
# Report the date range covered; rows were sorted by date, so the first
# and last positions hold the earliest and latest dates
for label, position in (("First date:\t", 0), ("Latest date:\t", -1)):
    print(label, effr['date'].iloc[position], sep="")

First date:	2013-11-12 00:00:00
Latest date:	2023-11-09 00:00:00


In [16]:
# Load the raw GDP file; the date column stays text until the next cell parses it
gdp = pd.read_csv(
    os.path.join(dir_raw_data, "GDP.csv"),
    sep=",",
    header=0,
    dtype={
        "DATE": object,
        "GDP": float,
    },
)

In [17]:
# Single pipeline: readable column names -> datetime dates -> ascending date order
gdp = (
    gdp.rename(columns={"DATE": "date", "GDP": "gdp"})
       .assign(date=lambda frame: pd.to_datetime(frame['date']))
       .sort_values('date')
)

In [18]:
# Count null values: info() reports the non-null count and dtype of each column
gdp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    40 non-null     datetime64[ns]
 1   gdp     40 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 960.0 bytes


In [19]:
# Report the date range covered; rows were sorted by date, so the first
# and last positions hold the earliest and latest dates
for label, position in (("First date:\t", 0), ("Latest date:\t", -1)):
    print(label, gdp['date'].iloc[position], sep="")

First date:	2013-10-01 00:00:00
Latest date:	2023-07-01 00:00:00


In [20]:
# Load the raw ACWI file; the date column stays text until the next cell
# parses it, every other listed column is read as a float.
acwi = pd.read_csv(
    os.path.join(dir_raw_data, "ACWI.csv"),
    sep=",",
    header=0,
    dtype={
        "Date": object,
        **dict.fromkeys(
            ["Open", "High", "Low", "Close", "Adj Close", "Volume"], float
        ),
    },
)

In [21]:
# Keep only the date and adjusted close, then rename, parse dates and sort.
# Done as one chain on an explicit copy: renaming in place and assigning a
# column on a column-slice of the raw frame is what pandas flags with
# SettingWithCopyWarning.
acwi = (
    acwi[['Date', 'Adj Close']]
    .copy()
    .rename(columns={"Date": "date", "Adj Close": "acwi_adj_close"})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
)

In [22]:
# Count null values: info() reports the non-null count and dtype of each column
acwi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2547 entries, 0 to 2546
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            2547 non-null   datetime64[ns]
 1   acwi_adj_close  2547 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 59.7 KB


In [23]:
# Report the date range covered; rows were sorted by date, so the first
# and last positions hold the earliest and latest dates
for label, position in (("First date:\t", 0), ("Latest date:\t", -1)):
    print(label, acwi['date'].iloc[position], sep="")

First date:	2013-10-01 00:00:00
Latest date:	2023-11-10 00:00:00


In [24]:
# Build the series of usable dates: S&P 500 dates that fall inside the
# window 2013-11-12 .. 2023-11-09 (the first/last EFFR dates printed above,
# both ends included) and on which EFFR is not missing.
valid_date = sp['date']
in_effr_window = valid_date.between(pd.to_datetime('2013-11-12'),
                                    pd.to_datetime('2023-11-09'))
has_effr_value = ~valid_date.isin(null_dates)
mask_validDate = in_effr_window & has_effr_value
valid_date = valid_date[mask_validDate]

In [25]:
dataset = (
    # Start from the valid dates and attach the S&P 500 prices
    pd.merge(valid_date, sp, on='date', how='inner')
    # US GDP: each date takes the latest GDP row dated on or before it
    .pipe(pd.merge_asof, gdp, on='date', direction='backward')
    # Effective Federal Funds Rate
    .merge(effr, on='date', how='inner')
    # MSCI ACWI ETF
    .merge(acwi, on='date', how='inner')
)

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Standardize every column except date.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down — confirm that sharing scaling statistics is acceptable.
columns_to_standardize = dataset.columns.drop('date')
scaler = StandardScaler().fit(dataset[columns_to_standardize])
dataset[columns_to_standardize] = scaler.transform(dataset[columns_to_standardize])

In [28]:
# Check the merged, standardized frame: row count, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2498 entries, 0 to 2497
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            2498 non-null   datetime64[ns]
 1   close           2498 non-null   float64       
 2   open            2498 non-null   float64       
 3   high            2498 non-null   float64       
 4   low             2498 non-null   float64       
 5   gdp             2498 non-null   float64       
 6   effr            2498 non-null   float64       
 7   acwi_adj_close  2498 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 175.6 KB


In [29]:
# Prepared output files live in <working directory>/dataset/prepared
dir_main = os.getcwd()
dir_prepared_data = os.path.join(os.path.join(dir_main, "dataset"), "prepared")

In [30]:
# Persist the prepared dataset without the index column.
# Create the output folder first so a fresh checkout (no dataset/prepared
# directory yet) does not fail on the write.
os.makedirs(dir_prepared_data, exist_ok=True)
dataset.to_csv(os.path.join(dir_prepared_data, "dataset.csv"),
               index=False)

In [31]:
# Reload the prepared CSV: dates come in as text and are parsed right away,
# every other column is read as a float.
dataset = pd.read_csv(
    os.path.join(dir_prepared_data, "dataset.csv"),
    sep=",",
    header=0,
    dtype={
        "date": object,
        **dict.fromkeys(
            ["close", "open", "high", "low", "gdp", "effr", "acwi_adj_close"],
            float,
        ),
    },
).assign(date=lambda frame: pd.to_datetime(frame['date']))
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498 entries, 0 to 2497
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            2498 non-null   datetime64[ns]
 1   close           2498 non-null   float64       
 2   open            2498 non-null   float64       
 3   high            2498 non-null   float64       
 4   low             2498 non-null   float64       
 5   gdp             2498 non-null   float64       
 6   effr            2498 non-null   float64       
 7   acwi_adj_close  2498 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 156.2 KB


In [32]:
# Preview the first rows of the reloaded dataset
dataset.head()

Unnamed: 0,date,close,open,high,low,gdp,effr,acwi_adj_close
0,2013-11-12,-1.345435,-1.34297,-1.348965,-1.341466,-1.356437,-0.762869,-1.287741
1,2013-11-13,-1.329387,-1.348735,-1.337583,-1.34333,-1.356437,-0.762869,-1.272056
2,2013-11-14,-1.319719,-1.328121,-1.32697,-1.321207,-1.356437,-0.756076,-1.259139
3,2013-11-15,-1.311241,-1.31925,-1.31952,-1.309411,-1.356437,-0.756076,-1.240225
4,2013-11-18,-1.318699,-1.310099,-1.314943,-1.312417,-1.356437,-0.756076,-1.242993


In [33]:

# Make sure rows are in ascending date order before building the label
dataset = dataset.sort_values(by=["date"], ascending=True)
dataset.head(5)

Unnamed: 0,date,close,open,high,low,gdp,effr,acwi_adj_close
0,2013-11-12,-1.345435,-1.34297,-1.348965,-1.341466,-1.356437,-0.762869,-1.287741
1,2013-11-13,-1.329387,-1.348735,-1.337583,-1.34333,-1.356437,-0.762869,-1.272056
2,2013-11-14,-1.319719,-1.328121,-1.32697,-1.321207,-1.356437,-0.756076,-1.259139
3,2013-11-15,-1.311241,-1.31925,-1.31952,-1.309411,-1.356437,-0.756076,-1.240225
4,2013-11-18,-1.318699,-1.310099,-1.314943,-1.312417,-1.356437,-0.756076,-1.242993


In [34]:
# Label each row 1 when its ACWI adjusted close is strictly above the previous
# row's value, else 0. The first row has no predecessor: shift(1) gives NaN
# there and the comparison is False, so it is labelled 0.
# Vectorized rather than looping with dataset['increase'].iloc[i] = True:
# that chained assignment is flagged by pandas (SettingWithCopyWarning) and
# does not update the frame when copy-on-write is enabled.
previous_close = dataset['acwi_adj_close'].shift(1)
dataset['increase'] = (dataset['acwi_adj_close'] > previous_close).astype(int)

In [35]:
# Preview the first rows with the new 'increase' label column
dataset.head()

Unnamed: 0,date,close,open,high,low,gdp,effr,acwi_adj_close,increase
0,2013-11-12,-1.345435,-1.34297,-1.348965,-1.341466,-1.356437,-0.762869,-1.287741,0
1,2013-11-13,-1.329387,-1.348735,-1.337583,-1.34333,-1.356437,-0.762869,-1.272056,1
2,2013-11-14,-1.319719,-1.328121,-1.32697,-1.321207,-1.356437,-0.756076,-1.259139,1
3,2013-11-15,-1.311241,-1.31925,-1.31952,-1.309411,-1.356437,-0.756076,-1.240225,1
4,2013-11-18,-1.318699,-1.310099,-1.314943,-1.312417,-1.356437,-0.756076,-1.242993,0


In [36]:
# Predictors and target for the classifiers.
# NOTE(review): the predictors are values from the same row as the label, and
# the label itself is derived from that row's acwi_adj_close — confirm this is
# the intended setup rather than using prior-day values.
feature_col = ['open', 'high', 'effr', 'low', 'gdp', 'acwi_adj_close']
x = dataset.loc[:, feature_col]
y = dataset.loc[:, 'increase']

In [37]:
import statsmodels.formula.api as smf

# Build the formula from feature_col so this model always uses the same
# predictors (and order) as the x matrix given to the scikit-learn model;
# it expands to 'increase~open+high+effr+low+gdp+acwi_adj_close'.
logit_formula = 'increase~' + '+'.join(feature_col)
log_mod_full = smf.logit(formula=logit_formula, data=dataset).fit()
log_mod_full.summary()

Optimization terminated successfully.
         Current function value: 0.447026
         Iterations 8


0,1,2,3
Dep. Variable:,increase,No. Observations:,2498.0
Model:,Logit,Df Residuals:,2491.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 22 Nov 2023",Pseudo R-squ.:,0.352
Time:,20:21:32,Log-Likelihood:,-1116.7
converged:,True,LL-Null:,-1723.1
Covariance Type:,nonrobust,LLR p-value:,7.572e-259

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1691,0.061,2.782,0.005,0.050,0.288
open,-159.3939,7.239,-22.017,0.000,-173.583,-145.205
high,80.7166,5.424,14.883,0.000,70.087,91.347
effr,-0.0553,0.153,-0.360,0.719,-0.356,0.246
low,74.7371,4.613,16.200,0.000,65.695,83.779
gdp,0.3304,0.455,0.727,0.467,-0.561,1.221
acwi_adj_close,3.6860,0.736,5.011,0.000,2.244,5.128


In [38]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; random_state pins the split so re-runs match.
# NOTE(review): per the scikit-learn docs train_test_split shuffles rows by default,
# so test rows are interleaved in time with training rows — confirm a random
# split is intended for this date-ordered data.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=16)

In [39]:
from sklearn.linear_model import LogisticRegression

# Fit on the training split (fit returns the estimator itself), then
# predict class labels for the held-out rows
logreg = LogisticRegression(random_state=16).fit(X_train, y_train)
y_pred = logreg.predict(X_test)


In [40]:
from sklearn import metrics

# Cross-tabulate true test labels against the predicted labels
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[117, 178],
       [ 61, 269]], dtype=int64)

In [41]:
from sklearn.metrics import roc_auc_score

# Fitted parameters of the scikit-learn model
coefficients, intercept = logreg.coef_, logreg.intercept_

# Second column of predict_proba is used as the score for the ROC AUC
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)

print("Coefficients:", coefficients)
print("Intercept:", intercept)
print("AUC:", auc_roc)

Coefficients: [[-6.33018026  0.46761162 -0.08043321  2.22308747  0.43884825  3.27290028]]
Intercept: [0.18356843]
AUC: 0.7052593733949666


The AUC of about 0.71 shows that the model has fairly accurate predictions; however, it is not the greatest, since it falls short of the ideal 0.8-0.9 range.