In [6]:
import yfinance as yf
import pandas as pd
import os

In [10]:
# Load the full daily price history of the S&P 500 index (^GSPC).
# The download is cached in sp500.csv so that re-running the notebook does not
# hit the network again and keeps working on the same snapshot of the data;
# delete the file to force a fresh download.
SP500_CSV = "sp500.csv"

if os.path.exists(SP500_CSV):
    sp500 = pd.read_csv(SP500_CSV, index_col=0)
    # The CSV stores the timestamps as strings with mixed UTC offsets
    # (-05:00 / -04:00); parse them via UTC and convert back to New York time
    # so the index matches what a fresh download returns.
    sp500.index = pd.to_datetime(sp500.index, utc=True).tz_convert("America/New_York")
else:
    # Keep the Ticker handle under its own name instead of reusing `sp500`
    # for two different kinds of object.
    sp500_ticker = yf.Ticker("^GSPC")
    sp500 = sp500_ticker.history(period="max")
    sp500.to_csv(SP500_CSV)

In [11]:
# Display the downloaded price history (long frames are truncated in the rendered output).
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0,0.0,0.0
1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0,0.0,0.0
1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0,0.0,0.0
1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-08-26 00:00:00-04:00,5639.660156,5651.620117,5602.339844,5616.839844,2938570000,0.0,0.0
2024-08-27 00:00:00-04:00,5602.890137,5631.180176,5593.479980,5625.799805,2798990000,0.0,0.0
2024-08-28 00:00:00-04:00,5624.509766,5627.029785,5560.950195,5592.180176,3053450000,0.0,0.0
2024-08-29 00:00:00-04:00,5607.299805,5646.950195,5583.709961,5591.959961,3065640000,0.0,0.0


In [12]:
# Inspect the row index: one timezone-aware timestamp per row, named "Date".
sp500.index

DatetimeIndex(['1927-12-30 00:00:00-05:00', '1928-01-03 00:00:00-05:00',
               '1928-01-04 00:00:00-05:00', '1928-01-05 00:00:00-05:00',
               '1928-01-06 00:00:00-05:00', '1928-01-09 00:00:00-05:00',
               '1928-01-10 00:00:00-05:00', '1928-01-11 00:00:00-05:00',
               '1928-01-12 00:00:00-05:00', '1928-01-13 00:00:00-05:00',
               ...
               '2024-08-19 00:00:00-04:00', '2024-08-20 00:00:00-04:00',
               '2024-08-21 00:00:00-04:00', '2024-08-22 00:00:00-04:00',
               '2024-08-23 00:00:00-04:00', '2024-08-26 00:00:00-04:00',
               '2024-08-27 00:00:00-04:00', '2024-08-28 00:00:00-04:00',
               '2024-08-29 00:00:00-04:00', '2024-08-30 00:00:00-04:00'],
              dtype='datetime64[ns, America/New_York]', name='Date', length=24283, freq=None)

In [13]:
# Line chart of the "Close" column against the frame's index.
# Needs matplotlib installed: pandas' default plotting backend raises ImportError without it.
sp500.plot.line(y="Close", use_index=True)

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [14]:
# Remove the "Dividends" and "Stock Splits" columns when they are present.
# The membership check keeps the cell safe to re-run after they are gone.
for unused_column in ('Dividends', 'Stock Splits'):
    if unused_column in sp500.columns:
        del sp500[unused_column]

In [15]:
# "Tomorrow" holds the next row's closing price (Close shifted up by one row);
# the final row has no successor and therefore gets NaN.
sp500["Tomorrow"] = sp500["Close"].shift(periods=-1)

In [16]:
# "Target" is what we want to predict: 1 when tomorrow's close is higher than
# today's close, 0 otherwise.
# The most recent row has no "Tomorrow" value yet (NaN). A comparison against
# NaN is False, which would silently label that day as 0, so rows without a
# known next-day close are dropped before the label is computed.
sp500 = sp500.dropna(subset=["Tomorrow"]).copy()
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)

In [17]:
# Keep only rows dated 1990-01-01 or later. The .copy() detaches the result
# from the full-history frame so later column assignments act on an
# independent object.
history_from_1990 = sp500.loc["1990-01-01":]
sp500 = history_from_1990.copy()

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Columns fed to the model as features.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Positional hold-out split: the last 100 rows are the test set and every
# row before them is used for training.
train = sp500.iloc[:-100]
test = sp500.iloc[-100:]

# n_estimators      - number of individual decision trees in the forest
#                     (more trees generally help accuracy, up to a point).
# min_samples_split - higher values protect against overfitting.
# random_state      - fixed seed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# NOTE(review): this builds a new, unfitted classifier and discards it; it is never
# assigned. It looks like the repr of the fitted model pasted into a cell —
# confirm and consider deleting the cell.
RandomForestClassifier(min_samples_split=100,random_state=1)

In [None]:
# precision_score is the metric used to evaluate the predictions.
from sklearn.metrics import precision_score

# Predicted class for every row of the hold-out set.
preds = model.predict(X=test[predictors])

In [None]:
import pandas as pd
# Convert the NumPy array of predictions into a pandas Series that shares the
# test set's index, so it lines up row-for-row with test["Target"].
# The Series is named so that, once concatenated with the targets, its column
# is labelled "Predictions" rather than the default 0.
preds = pd.Series(preds, index=test.index, name="Predictions")

In [None]:
# Precision of the hold-out predictions against the actual targets.
precision_score(y_true=test["Target"], y_pred=preds)

In [None]:
# Put actual targets and predictions side by side, one column each.
combined = pd.concat(objs=[test["Target"], preds], axis="columns")

In [None]:
# Plot actual targets and predictions together for a visual comparison.
# NOTE(review): pandas plotting presumably needs matplotlib here as well — an earlier plot cell failed with ImportError.
combined.plot()

In [None]:
def predict(train, test, predictors, model):
    """Fit `model` on `train` and predict the "Target" column for `test`.

    Parameters:
        train: DataFrame with the `predictors` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of feature column names.
        model: estimator exposing `fit(X, y)` and `predict(X)`.

    Returns:
        DataFrame indexed like `test` with two columns: "Target" (actual
        values) and "Predictions" (the model's output).
    """
    label = "Target"
    model.fit(train[predictors], train[label])
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test[label], predicted], axis=1)
        

In [None]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through `data`, retraining and predicting one window at a time.

    Each fold trains on every row before a cut point and predicts the next
    `step` rows; the cut point starts at `start` and advances by `step`.
    Per the original notes, the defaults correspond to: train on the first
    10 years and predict year eleven, then train on 11 years and predict
    year twelve, and so on.

    Parameters:
        data: DataFrame holding the `predictors` columns and "Target".
        model: estimator passed through to `predict`.
        predictors: list of feature column names.
        start: number of leading rows in the first training window.
        step: number of rows predicted per fold.

    Returns:
        The per-fold frames returned by `predict`, concatenated in order.
    """
    cut_points = range(start, data.shape[0], step)
    all_predictions = [
        predict(
            data.iloc[0:cut].copy(),            # everything before the cut
            data.iloc[cut:(cut + step)].copy(),  # the next `step` rows
            predictors,
            model,
        )
        for cut in cut_points
    ]
    return pd.concat(all_predictions)

In [None]:
# Walk-forward backtest of the baseline model on the raw price/volume predictors.
predictions = backtest(data=sp500, model=model, predictors=predictors)

In [None]:
# How often each predicted value occurs across the backtest.
predictions["Predictions"].value_counts()

In [None]:
# Precision of the backtest predictions against the actual targets.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Share of each actual target value among the backtested rows — the base rate
# to compare the precision against.
predictions["Target"].value_counts() / len(predictions)

In [None]:
# Add rolling-window predictors for several look-back horizons (in rows).
horizons = [2,5,60,250,1000]
new_predictors = []

for horizon in horizons:
    # Ratio of the current close to the mean close over the last `horizon`
    # rows. Only the "Close" column is rolled, rather than every column of the
    # frame (which would also re-average the features added in earlier passes).
    rolling_close = sp500["Close"].rolling(horizon).mean()

    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_close

    # Sum of "Target" over the `horizon` rows before the current one.
    # shift(1) keeps the row's own Target — which encodes tomorrow's move —
    # out of its features.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

# The leading rows do not have a full window behind them, so their new
# predictors are NaN; drop them so the model is never trained or evaluated on
# missing features. Because this reassigns `sp500`, re-running this cell alone
# trims more rows; rebuild the frame from the earlier cells before re-running.
sp500 = sp500.dropna(subset=new_predictors).copy()
    

In [None]:
# Display the frame again to inspect the newly added Close_Ratio_* and Trend_* columns.
sp500

In [None]:
# Replace the baseline model with a larger forest (200 trees) that may split
# smaller nodes (min_samples_split=50); the seed stays fixed for repeatable runs.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [None]:
def predict(train, test, predictors, model, threshold=.6):
    """Fit `model` on `train` and predict "up" days in `test` above a confidence threshold.

    Instead of the model's default 0/1 output, the predicted probability of
    the second class is compared against `threshold`, so a row is only
    predicted as 1 when the model is sufficiently confident.

    Parameters:
        train: DataFrame with the `predictors` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of feature column names.
        model: estimator exposing `fit(X, y)` and `predict_proba(X)`.
        threshold: minimum probability required to predict 1 (default 0.6).

    Returns:
        DataFrame indexed like `test` with "Target" (actual values) and
        "Predictions" (integer 0/1 labels).
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 of predict_proba is taken as the probability of Target == 1;
    # this assumes both classes occur in `train` — TODO confirm for short windows.
    proba_up = model.predict_proba(test[predictors])[:,1]
    # Build integer labels from a comparison rather than overwriting the
    # probability array in place, so the dtype matches the integer "Target".
    preds = (proba_up >= threshold).astype(int)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [None]:
# Re-run the walk-forward backtest using only the rolling ratio/trend predictors.
predictions = backtest(data=sp500, model=model, predictors=new_predictors)

In [None]:
# Precision of the thresholded backtest predictions against the actual targets.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])