In [85]:
import yfinance as yf
from matplotlib import pyplot as plt

Get SP500 history (1927 -> present day)

In [86]:
# "^GSPC" is the Yahoo Finance symbol for the S&P 500 index.
sp500 = yf.Ticker("^GSPC")

In [87]:
# Build the Ticker here instead of reading it from `sp500`, so this cell can be
# re-run after `sp500` has been rebound to the price-history DataFrame.
sp500 = yf.Ticker("^GSPC").history(period="max")

In [88]:
# Show the frame; pandas abbreviates long frames to their first and last rows.
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,0.0,0.0
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,0.0,0.0
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,0.0,0.0
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-10-23 00:00:00-04:00,5834.500000,5834.850098,5762.410156,5797.419922,3532650000,0.0,0.0
2024-10-24 00:00:00-04:00,5817.799805,5817.799805,5784.919922,5809.859863,3543030000,0.0,0.0
2024-10-25 00:00:00-04:00,5826.750000,5862.819824,5799.979980,5808.120117,3501280000,0.0,0.0
2024-10-28 00:00:00-04:00,5833.930176,5842.919922,5823.080078,5823.520020,3691280000,0.0,0.0


Remove the last two columns (Dividends and Stock Splits) because they do not apply to an index

In [89]:
# Drop the columns that carry no information for an index.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

Add and fill a new column, Tomorrow (the next trading day's closing price)

In [90]:
# Next row's close aligned onto the current row; the final row has no successor and gets NaN.
sp500 = sp500.assign(Tomorrow=sp500["Close"].shift(-1))

Add and fill a new column, Target (1 if tomorrow's close is higher than today's close, otherwise 0)

In [91]:
# 1 when the next close is strictly higher than today's close, else 0.
# A comparison against NaN is False, so the final row (Tomorrow is NaN) is labelled 0.
sp500["Target"] = sp500["Tomorrow"].gt(sp500["Close"]).astype(int)

Keep only data from 1990 onward, since much older market behavior is less relevant to today's market

In [92]:
# Keep rows from the cut-off date onward; .copy() yields an independent frame
# so later column assignments do not touch the full-history object.
START_DATE = "1990-01-01"
sp500 = sp500.loc[START_DATE:].copy()

Use a Random Forest because it can capture non-linear relationships and, by averaging many trees, is less prone to over-fitting than a single decision tree

In [93]:
from sklearn.ensemble import RandomForestClassifier

In [94]:
from sklearn.metrics import precision_score
import pandas as pd

Back Testing Intuition: 
- train off first 10 years of data, predict 11th year
- train off first 11 years of data, predict 12th year
- train off first 12 years of data, predict 13th year...

In [95]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and return thresholded buy signals for ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ``predictors`` columns and a ``"Target"`` column.
    predictors : list of str
        Names of the feature columns passed to the model.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    threshold : float, default 0.6
        Minimum probability in column 1 of ``predict_proba`` required to emit
        a 1 ("buy"); anything below it becomes 0.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``test`` with two columns: ``"Target"`` (copied from
        ``test``) and ``"Predictions"`` (1.0 or 0.0).
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 is the probability of the second class, i.e. Target == 1
    # (assumes both classes occur in `train`).
    proba = model.predict_proba(test[predictors])[:, 1]
    # Only signal a buy when the model is at least `threshold` confident.
    # Cast to float so the column holds 1.0 / 0.0.
    signals = (proba >= threshold).astype(float)
    predictions = pd.Series(signals, index=test.index, name="Predictions")
    return pd.concat([test["Target"], predictions], axis=1)

In [96]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk-forward backtest with an expanding training window.

    The first fold trains on rows ``[0, start)`` and predicts rows
    ``[start, start + step)``; each later fold extends the training window by
    ``step`` rows and predicts the next ``step`` rows (the last fold may be
    shorter).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in chronological order, containing ``predictors`` and "Target".
    model : estimator
        Passed through to ``predict``; it is re-fitted on every fold.
    predictors : list of str
        Feature column names.
    start : int, default 2500
        Number of rows in the first training window.
    step : int, default 250
        Number of rows predicted per fold.

    Returns
    -------
    pandas.DataFrame
        The per-fold outputs of ``predict`` concatenated in order.

    Raises
    ------
    ValueError
        If ``step`` is not positive, or ``data`` has no rows beyond ``start``
        (so no test fold can be formed).
    """
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step}")

    n_rows = data.shape[0]
    if n_rows <= start:
        # Without this check pd.concat([]) fails with an opaque
        # "No objects to concatenate".
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test fold, "
            f"got {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        # predict() only reads its inputs, so plain slices are enough here.
        train = data.iloc[:i]
        test = data.iloc[i:i + step]
        all_predictions.append(predict(train, test, predictors, model))

    return pd.concat(all_predictions)

In [97]:
# Look-back windows in trading days (about 2 days, 1 week, 3 months, 1 year, 4 years).
horizons = [2, 5, 60, 250, 1000]

new_predictors = []

# Target on the previous row says whether today's close beat yesterday's,
# which is already known today. Shifting by one row keeps today's own
# (still unknown) Target out of the features.
known_targets = sp500["Target"].shift(1)

for horizon in horizons:
    # Today's close relative to its mean over the last `horizon` rows (today included).
    # Rolling is applied to the Close Series only, not the whole frame.
    ratio_columns = f"Close_Ratio_{horizon}"
    sp500[ratio_columns] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    # Number of up-days among the `horizon` rows that precede today.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = known_targets.rolling(horizon).sum()

    new_predictors += [ratio_columns, trend_column]

In [98]:
# Drop every row containing NaN. This includes the leading rows where the longest
# (1000-day) rolling windows are not yet full, and the final row, whose Tomorrow is NaN.
sp500 = sp500.dropna()

In [99]:
model = RandomForestClassifier(
    n_estimators=200,      # number of trees in the forest
    min_samples_split=50,  # a node needs at least 50 samples before it may be split
    random_state=1,        # fixed seed so repeated runs give the same forest
)

In [100]:
# Run the walk-forward backtest on the engineered features, using backtest's default start/step.
predictions = backtest(sp500, model, new_predictors)

In [101]:
# Count the days the model signalled "buy" (1.0) versus "stay out" (0.0).
predictions["Predictions"].value_counts()  # in the recorded run, 16.1% of days are traded

Predictions
0.0    4424
1.0     849
Name: count, dtype: int64

In [102]:
# Precision: of the days the model predicted 1, the fraction whose Target was actually 1.
precision_score(predictions["Target"], predictions["Predictions"])

np.float64(0.574793875147232)

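If you buy the S&P 500 on every day of the backtested period (the 5,273 trading days covered by `predictions`, not the full history since 1990-01-01, because the earliest rows are used only for rolling features and the initial training window):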

- 54.542 % of days it will increase

If you buy the S&P 500 only on days the model said to buy:

- 57.479 % of days it will increase 

In [103]:
# Baseline: share of backtested days with each Target value (1 = the next close was higher).
predictions["Target"].value_counts() / predictions.shape[0]

Target
1    0.54542
0    0.45458
Name: count, dtype: float64

Notes for the future:

- Some exchanges outside the United States trade overnight (US time). Price moves on those exchanges may influence or correlate with the S&P 500, so gathering that data and using it to train the model may improve performance

- Sentiment analysis is said to be a huge factor in stock prices, and web-scraping Twitter and other platforms may give the model a big advantage.

- Scraping general data from News Articles, especially about general macro-economic conditions / inflation

- Maybe also consider data from key stocks/sectors of the S&P 500 (e.g. if the tech sector is trending downward, maybe the S&P 500 will decrease in 6 months)

- This model uses daily data. Perhaps hourly, minute-level, or tick data would lead to more accurate positions

- Incorporate RSI based data

- Maybe build some relative volume indicator

- Maybe build a model based entirely on the semi-strong efficient market hypothesis (prices already reflect all publicly known and available information, so past information about price, volume, and returns cannot be used to predict future prices).