In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Predict

In [3]:
symbol = str(input("Enter stock symbol"))
period1 = int(time.mktime(datetime.datetime(1971, 2, 5, 23, 59).timetuple())) # Just let this be the starting point for the dataset
period2 = int(time.mktime(datetime.datetime.today().timetuple()))
interval = '1d'

# Scraping the stock data 
data = f'https://query1.finance.yahoo.com/v7/finance/download/{symbol}?period1={period1}&period2={period2}&interval={interval}&events=history&includeAdjustedClose=true'

Enter stock symbolTSLA


In [5]:
def cleaning_data(symbol, dataset): # Preprocessing the dataset
    df = pd.read_csv(dataset)
    df.to_csv(f'{symbol}.csv')
    df = pd.read_csv(f"{symbol}.csv")
    df = df.drop(["Unnamed: 0", "Adj Close"], axis = 1)
    df["Target"] = ((df["Close"] - df["Open"]) > 0).astype(int)
    df.set_index('Date', inplace = True)
    df = df.dropna()
    horizons = [2, 5, 60, 250, 1000]
    predictors = []

    for horizon in horizons:
        rolling_averages = df.rolling(horizon).mean()

        ratio = f"Close_Ratio_{horizon}"
        df[ratio] = df["Close"].shift(1) / rolling_averages["Close"]

        trend = f"Trend_Ratio"
        df[trend] = df["Target"].shift(1).rolling(horizon).sum()

        predictors += [ratio, trend]
        
    df = df.dropna()
    
    return df, predictors

In [6]:
df, predictors = cleaning_data(symbol, data)

In [7]:
def predict(train, test, predictors, model): # Predicting the data
  model.fit(train[predictors], train["Target"])
  preds = model.predict_proba(test[predictors])[:,1] 
  preds[preds >= .6] = 1
  preds[preds < .6] = 0
  preds = pd.Series(preds, index = test.index, name = "Predictions")
  combined = pd.concat([test["Target"], preds], axis = 1)
  return combined

In [8]:
def backtesting(data, model, predictors, start = 1250, step = 250): # Testing the machine learning model
  all_preds = [] 

  for i in range(start, data.shape[0], step):
    train = data.iloc[0:i].copy()
    test = data.iloc[i:(i+step)].copy()
    all = predict(train, test, predictors, model)
    all_preds.append(all)

  return pd.concat(all_preds)

In [9]:
RFC_model = RandomForestClassifier(n_estimators = 200, min_samples_split = 50, random_state = 1) # Machine learning model

In [11]:
predictions = backtesting(df, RFC_model, predictors)

In [13]:
predictions["Predictions"].value_counts()

Predictions
0.0    795
1.0    463
Name: count, dtype: int64

In [14]:
predictions

Unnamed: 0_level_0,Target,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-06-07,0,0.0
2019-06-10,1,1.0
2019-06-11,0,1.0
2019-06-12,0,0.0
2019-06-13,1,1.0
...,...,...
2024-05-30,1,1.0
2024-05-31,0,0.0
2024-06-03,0,0.0
2024-06-04,0,0.0


In [15]:
print(classification_report(predictions["Target"], predictions["Predictions"]))

              precision    recall  f1-score   support

           0       0.69      0.89      0.77       617
           1       0.85      0.61      0.71       641

    accuracy                           0.75      1258
   macro avg       0.77      0.75      0.74      1258
weighted avg       0.77      0.75      0.74      1258



In [18]:
tomorrow_data = df.iloc[[-1]]
tomorrow_prediction = RFC_model.predict(tomorrow_data[predictors])
# Result of the prediction
if tomorrow_prediction == 1:
    print("Tomorrow's price is predicted to go up.")
else:
    print("Tomorrow's price is predicted to go down.")

Tomorrow's price is predicted to go up.
