In [None]:
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Data Preparation
# Download the full available price history for the S&P 500 index (^GSPC).
# The Ticker handle gets its own name so that `sp500` only ever refers to the
# price DataFrame (re-running later cells never sees a Ticker object).
sp500_ticker = yf.Ticker("^GSPC")
sp500 = sp500_ticker.history(period="max").drop(columns=["Dividends", "Stock Splits"])

# "Tomorrow" is the next row's close; "Target" is 1 when that close is higher
# than the current row's close, else 0.
sp500["Tomorrow"] = sp500["Close"].shift(-1)
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)

# The most recent row has no next close yet, so its "Tomorrow" is NaN and the
# comparison above evaluates to False, i.e. a fabricated Target of 0.
# Drop that unlabeled row, then keep data from 1990 onward as an independent copy.
sp500 = sp500.dropna(subset=["Tomorrow"]).loc["1990-01-01":].copy()

In [None]:
# Initial Model Training
# Hold out the last 100 rows as the test set; every earlier row is training data.
train = sp500.head(-100)
test = sp500.tail(100)

# Raw price/volume columns used as the baseline feature set.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestClassifier(
    random_state=1,
    min_samples_split=100,
    n_estimators=100,
)
model.fit(train.loc[:, predictors], train.loc[:, "Target"])

In [None]:
# Hyperparameter Tuning with Grid Search
# Every combination of the values below is evaluated (3 * 3 * 4 = 36 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],
    'min_samples_split': [50, 100, 150],
    'max_depth': [None, 10, 20, 30]
}
rf = RandomForestClassifier(random_state=1)

# The training rows are a date-ordered price history, so an ordinary K-fold
# split would fit on rows that come *after* the validation fold and leak future
# information into the score. TimeSeriesSplit only ever validates on rows that
# follow the rows used for fitting.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='precision',
    n_jobs=-1,  # use all available cores
)
grid_search.fit(train[predictors], train['Target'])
print("Best parameters found: ", grid_search.best_params_)
print("Best precision score: ", grid_search.best_score_)

In [None]:
# Retrain the model with the best parameters
# Build a fresh forest from the winning grid-search configuration and fit it on
# the training rows; this rebinds `model`, replacing the earlier baseline.
best_params = grid_search.best_params_
model = RandomForestClassifier(random_state=1, **best_params)
model.fit(train.loc[:, predictors], train.loc[:, "Target"])

In [None]:
# Additional Feature Engineering
# Rolling-window lengths, in rows.
horizons = [2, 5, 60, 250, 1000]
new_predictors = []

# Only these two columns feed the rolling features, so compute on the Series
# rather than rolling over the whole frame (which also grows by two columns on
# every iteration).
close = sp500["Close"]
# Shift by one row so that each row's trend window ends at the previous row's
# Target and never includes its own.
prior_targets = sp500["Target"].shift(1)

for horizon in horizons:
    # Ratio of the current close to its rolling mean over the last `horizon` rows.
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = close / close.rolling(horizon).mean()

    # Sum of Target over the `horizon` rows preceding the current one.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = prior_targets.rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

# Drop every row that has a NaN in any column; this includes the leading rows
# that do not yet have a full window for the largest horizon.
# NOTE: this rebinds `sp500`, so re-running this cell trims the frame again.
sp500 = sp500.dropna()

In [None]:
# Prediction and Backtesting
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and return thresholded predictions for ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ``predictors`` columns and a ``"Target"`` column.
    predictors : list of str
        Column names used as model features.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is refitted
        in place on every call.
    threshold : float, default 0.6
        Minimum probability in the second ``predict_proba`` column required to
        emit a prediction of 1.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``test`` with two columns: ``"Target"`` (copied from
        ``test``) and ``"Predictions"`` (1.0 where the probability is at or
        above ``threshold``, otherwise 0.0).
    """
    model.fit(train[predictors], train["Target"])
    # Second column of predict_proba; for the 0/1 Target built in this notebook
    # that is the probability of class 1.
    positive_proba = model.predict_proba(test[predictors])[:, 1]
    # One comparison replaces the two in-place mask assignments; the float
    # dtype matches the 1.0 / 0.0 values the masks used to produce.
    labels = (positive_proba >= threshold).astype(float)
    preds = pd.Series(labels, index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)

def backtest(data, model, predictors, start=2500, step=250):
    """Walk-forward evaluation of ``model`` over ``data``.

    Starting at row ``start``, the model is refitted (via ``predict``) on every
    row before the current position and used to predict the next ``step`` rows;
    the position then advances by ``step`` until the data is exhausted.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-ordered frame containing ``predictors`` and a ``"Target"`` column.
    model : estimator
        Passed through to ``predict``, which refits it for every fold.
    predictors : list of str
        Feature column names.
    start : int, default 2500
        Number of initial rows reserved for the first training window.
    step : int, default 250
        Number of rows predicted per fold.

    Returns
    -------
    pandas.DataFrame
        Concatenated ``predict`` outputs for all folds.

    Raises
    ------
    ValueError
        If ``data`` has no rows beyond ``start``, so no fold can be evaluated.
    """
    fold_predictions = []
    for split in range(start, data.shape[0], step):
        history = data.iloc[:split].copy()
        upcoming = data.iloc[split:split + step].copy()
        fold_predictions.append(predict(history, upcoming, predictors, model))
    if not fold_predictions:
        raise ValueError(
            f"backtest needs more than start={start} rows, got {data.shape[0]}"
        )
    return pd.concat(fold_predictions)


predictions = backtest(sp500, model, new_predictors)
# Share of rows predicted as each class, followed by precision on the 1 class.
print(predictions["Predictions"].value_counts() / predictions.shape[0])
print(precision_score(predictions["Target"], predictions["Predictions"]))