In [None]:
# Run once if yfinance is not installed: %pip install yfinance

In [None]:
#IMPORTS
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

## Cleaning and visualizing data 

In [None]:
# Download the complete price history of the S&P 500 index (ticker ^GSPC).
# Each row is one trading day:
#   Open / Close - prices at the start and at the end of the session
#   High / Low   - highest and lowest prices reached during the session
#   Volume       - total volume traded that day
sp500 = yf.Ticker("^GSPC").history(period="max")
sp500

In [None]:
# Line chart of the closing price over the whole history, using the frame's index as the x-axis.
sp500.plot(kind="line", y="Close", use_index=True)

In [None]:
# Remove the two columns this analysis does not need. Building a new frame with
# errors="ignore" (instead of `del`) lets the cell be re-run without a KeyError
# once the columns are already gone.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

## Setting up target

In [None]:
# "Tomorrow" is the closing price of the following row: shifting Close up by one
# position pairs every day with the next day's close (the last row gets NaN).
sp500["Tomorrow"] = sp500["Close"].shift(periods=-1)
sp500

In [None]:
# The most recent row has no next-day close yet (Tomorrow is NaN). NaN > Close
# evaluates to False, which would silently label that day 0 ("did not rise"),
# so rows whose Tomorrow is unknown are dropped before labelling.
sp500 = sp500.dropna(subset=["Tomorrow"]).copy()
# Target is 1 when the next close is higher than this row's close, otherwise 0.
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)
sp500


In [None]:
# Keep only rows from 1990-01-01 onwards. .copy() yields an independent frame,
# so later column assignments do not operate on a slice of the full history.
START_DATE = "1990-01-01"
sp500 = sp500.loc[START_DATE:].copy()
sp500

## Training the model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature columns fed to the classifier.
predictors = ["Close","Volume","Open","High","Low"]

# Positional split: the last 3000 rows are held out for testing and every
# earlier row is used for training (rows are assumed to be in date order).
train = sp500.iloc[:-3000]
test = sp500.iloc[-3000:]

# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=40,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

In [None]:
# Measure how precise the model's positive predictions are on the held-out rows.
from sklearn.metrics import precision_score

# Wrap the raw prediction array in a Series aligned to the test index.
preds = pd.Series(model.predict(test[predictors]), index=test.index)
print(preds)
precision_score(test["Target"], preds)

In [None]:
# Place the actual target and the predictions side by side and plot both series.
combined = pd.concat([test["Target"], preds], axis="columns")
combined.plot()

In [None]:
#Back testing

def predict(train,test,predict,model):
    """Fit ``model`` on ``train`` and return its predictions for ``test``.

    Parameters
    ----------
    train : DataFrame with the predictor columns and a "Target" column,
        used for fitting.
    test : DataFrame with the same columns; its rows are predicted.
    predict : iterable of predictor column names. The parameter keeps its
        original name so existing callers are unaffected; inside this body
        it shadows the function's own name.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    DataFrame indexed like ``test`` with two columns: "Target" (the actual
    labels) and "Predictions" (the model output).
    """
    # Use the columns passed by the caller instead of a notebook-level
    # `predictors` variable, so the function works with any feature list.
    feature_columns = list(predict)
    model.fit(train[feature_columns], train["Target"])
    preds = pd.Series(
        model.predict(test[feature_columns]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], preds], axis=1)
def backtest(data,model,predictors,start=2500,step=250):
    """Walk-forward backtest: repeatedly train on the past, predict the next window.

    The first ``start`` rows form the initial training set and the following
    ``step`` rows are predicted; the training set then grows by ``step`` rows
    and the process repeats until ``data`` is exhausted. The defaults are
    meant as "take 10 years of data and predict the 11th, take 11 years and
    predict the 12th, ...".

    Parameters
    ----------
    data : DataFrame containing the predictor columns and "Target".
    model : estimator handed to ``predict`` for every window.
    predictors : list of predictor column names.
    start : number of leading rows in the first training window.
    step : number of rows predicted (and then added to training) per round.

    Returns
    -------
    The per-window frames returned by ``predict``, concatenated in order.

    Raises
    ------
    ValueError
        If no window can be formed (``data`` has at most ``start`` rows, or
        ``step`` is not positive).
    """
    all_predictions = []
    # Advance by `step` rows per round.
    for i in range(start, data.shape[0], step):
        train_window = data.iloc[0:i].copy()
        # .copy() must be called; without the parentheses the bound method
        # itself would be passed on instead of a DataFrame.
        test_window = data.iloc[i:(i+step)].copy()
        all_predictions.append(predict(train_window, test_window, predictors, model))
    if not all_predictions:
        raise ValueError(
            f"backtest produced no windows: need more than start={start} rows "
            f"and a positive step (got {data.shape[0]} rows, step={step})"
        )
    return pd.concat(all_predictions)

predictions = backtest(sp500, model, predictors)
# Only a cell's last expression is displayed, so print the prediction counts
# explicitly; as a bare expression they would be computed and discarded.
print(predictions["Predictions"].value_counts())
# Precision across all backtested rows.
precision_score(predictions["Target"], predictions["Predictions"])