In [None]:
import yfinance as yf

# Keep the Ticker handle and the price table under separate names so that
# `sp500` only ever refers to the DataFrame, including when this cell is re-run.
sp500_ticker = yf.Ticker("^GSPC")
# period="max" asks yfinance for the longest history it offers for this symbol.
sp500 = sp500_ticker.history(period="max")
sp500

In [None]:
sp500.index

In [None]:
# Closing price against the frame's index. The title and y-label let the
# figure stand on its own; the trailing ';' hides the return value's repr.
ax = sp500.plot.line(y="Close", use_index=True, figsize=(10, 6))
ax.set(title="S&P 500 closing price", ylabel="Close");

# Data wrangling

In [None]:
# Remove the two columns that are never used as model inputs.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError (which `del` would raise).
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [None]:
# Align each row with the close of the row that follows it (shift by -1).
# The final row has no successor, so its "Tomorrow" value is NaN.
sp500["Tomorrow"] = sp500.loc[:, "Close"].shift(periods=-1)

In [None]:
sp500

In [None]:
# 1 when the following close is above the current close, otherwise 0.
# Comparisons against NaN are False, so a row without a "Tomorrow" value
# is labelled 0.
sp500["Target"] = sp500["Tomorrow"].gt(sp500["Close"]).astype(int)

In [None]:
sp500

In [None]:
# Keep rows from 1990 onward and drop any row whose next close is unknown.
# The most recent row has Tomorrow = NaN and therefore a fabricated Target of 0
# (NaN > Close evaluates to False); leaving it in would feed a made-up label
# into the held-out test rows and the backtest.
sp500 = sp500.loc["1990-01-01":].dropna(subset=["Tomorrow"]).copy()

In [None]:
sp500

# Training an initial machine learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest; random_state is fixed so repeated runs build the
# same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)

# Raw price and volume columns used as features for the baseline.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Positional split: the final 100 rows are held out, everything before trains.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

model.fit(train[predictors], train["Target"])

In [None]:
from sklearn.metrics import precision_score

# Predicted labels for the held-out rows, using the same feature columns the
# model was fitted on.
preds = model.predict(test.loc[:, predictors])

In [None]:
import pandas as pd

# Re-attach the test index to the raw predictions and name the Series, so it
# becomes a labelled "Predictions" column (instead of an anonymous 0) when it
# is concatenated with the targets — the same name the predict() helper uses.
preds = pd.Series(preds, index=test.index, name="Predictions")

In [None]:
preds

In [None]:
# Precision on the held-out rows: of the rows predicted 1, the share whose
# Target is actually 1.
precision_score(y_true=test["Target"], y_pred=preds)

In [None]:
# Actual and predicted labels side by side, one column each, then plotted
# against the shared index.
combined = pd.concat([test["Target"], preds], axis="columns")
combined.plot()

# Building a Backtesting System

In [None]:
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and label every row of ``test``.

    Args:
        train: DataFrame containing the ``predictors`` columns and a "Target"
            column.
        test: DataFrame containing the same columns; its index is reused for
            the output.
        predictors: list of feature column names.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.

    Returns:
        DataFrame indexed like ``test`` with two columns: "Target" (copied
        from ``test``) and "Predictions" (the model output).
    """
    features, labels = train[predictors], train["Target"]
    model.fit(features, labels)
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [None]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through ``data``, refitting and predicting one window at a time.

    For every cut point ``start, start + step, ...`` below the number of rows,
    the model is fitted (via ``predict``) on all rows before the cut and used
    to label the next ``step`` rows; the last window may be shorter.

    Args:
        data: DataFrame with the ``predictors`` columns and a "Target" column.
        model: estimator handed through to ``predict``.
        predictors: list of feature column names.
        start: number of leading rows used for the first fit.
        step: number of rows predicted per window.

    Returns:
        The per-window frames returned by ``predict``, concatenated in order.
        If ``data`` has no more than ``start`` rows there are no windows and
        ``pd.concat`` is called with an empty list.
    """
    folds = [
        predict(
            data.iloc[:cut].copy(),
            data.iloc[cut:cut + step].copy(),
            predictors,
            model,
        )
        for cut in range(start, len(data), step)
    ]
    return pd.concat(folds)

In [None]:
# Walk-forward predictions for the baseline feature list, using backtest's
# default start and step.
predictions = backtest(data=sp500, model=model, predictors=predictors)

In [None]:
# How many rows were predicted as each class.
predictions.loc[:, "Predictions"].value_counts()

In [None]:
# Precision over all backtested rows.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Share of backtested rows in each Target class (1 = next close was higher).
# This is the base rate the model's precision should be compared against.
# normalize=True makes value_counts return fractions directly.
predictions["Target"].value_counts(normalize=True)

# Adding more predictors to improve accuracy

In [None]:
# Window lengths, in rows, over which the rolling features are computed.
horizons = [2, 5, 60, 250, 1000]

new_predictors = []

for horizon in horizons:
    # Rolling mean of Close over the last `horizon` rows, current row included.
    rolling_averages = sp500["Close"].rolling(horizon).mean()

    # Ratio feature: the current close relative to its own rolling mean.
    # (Previously this column was assigned the trend formula, so the rolling
    # mean was never used and each ratio column duplicated its trend column.)
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_averages

    # Trend feature: how many of the previous `horizon` rows had Target == 1.
    # shift(1) excludes the current row, whose Target depends on the next close.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]
    

In [None]:
# Discard every row that has a NaN in any column, e.g. rows where a rolling
# window is not yet full.
sp500 = sp500.dropna(axis="index", how="any")


In [None]:
sp500

# Improving the model

In [None]:
# Second model: more trees and a lower min_samples_split than the baseline
# forest, same fixed random_state.
model = RandomForestClassifier(
    random_state=1,
    n_estimators=200,
    min_samples_split=50,
)

In [None]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and label ``test`` using a probability cut-off.

    This definition rebinds the name ``predict``, so ``backtest`` uses it for
    every call made after this cell runs.

    Args:
        train: DataFrame containing the ``predictors`` columns and a "Target"
            column.
        test: DataFrame containing the same columns; its index is reused for
            the output.
        predictors: list of feature column names.
        model: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
            fitted in place.
        threshold: minimum value in column 1 of ``predict_proba`` for a row to
            be labelled 1. Defaults to 0.6.

    Returns:
        DataFrame indexed like ``test`` with columns "Target" and "Predictions",
        where "Predictions" holds integer 0/1 labels.
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 of predict_proba; with 0/1 targets and both classes present in
    # `train`, this is presumably P(Target == 1) — verify via model.classes_.
    positive_proba = model.predict_proba(test[predictors])[:, 1]
    # One comparison replaces the two in-place masks, and integer labels match
    # the dtype of the "Target" column.
    preds = pd.Series(
        (positive_proba >= threshold).astype(int),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], preds], axis=1)

In [None]:
# Walk-forward predictions again, this time with the rolling feature columns.
predictions = backtest(data=sp500, model=model, predictors=new_predictors)

In [None]:
# How many rows were predicted as each class.
predictions.loc[:, "Predictions"].value_counts()

In [None]:
# Precision over all backtested rows.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])