In [2]:
#Imports
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from datetime import datetime
import numpy as np
import warnings
# NOTE(review): called with only the "ignore" action (no category/module/message filter),
# this suppresses every warning for the rest of the kernel session, not just one noisy
# source. Consider narrowing the filter so genuine pandas / scikit-learn warnings stay visible.
warnings.filterwarnings("ignore")


In [3]:
# This notebook does not create `prices_df` in any of its own cells; it expects the
# historical price DataFrame to already be present in the kernel namespace.
# A module-level `global` statement neither creates nor imports a variable, so check
# for the name explicitly and stop here with an actionable message if it is missing,
# instead of failing later with a bare NameError.
if "prices_df" not in globals():
    raise NameError(
        "prices_df is not defined. Load the historical price DataFrame "
        "(it must contain a 'Close' column) into `prices_df` before running this notebook."
    )

In [None]:
# Work on a copy of prices_df so that the feature columns added for the random forest
# do not modify the original price data.
# NOTE(review): prices_df is not assigned in any cell visible here, so it must already
# exist in the kernel when this cell runs. An earlier note on this cell reported a
# NameError while also saying the copy succeeded; a NameError on this line would mean
# prices_df was missing at that moment (for example after a kernel restart) rather than
# a Python environment fault - confirm where prices_df is loaded.
rf_prices_df = prices_df.copy()

In [None]:
# Build the lagged-price feature table and split it into training / backtest sets

# Number of previous closing prices used as features
num_lags = 7

# Number of most recent rows held out for backtesting
test_days = 90

# Feature column names, in the order the model is trained on:
# "Close_Lag1" is the close from the previous row, "Close_Lag2" from two rows back, etc.
lag_columns = [f'Close_Lag{i}' for i in range(1, num_lags + 1)]

# Rebuild rf_prices_df from prices_df on every run so this cell is idempotent.
# Shifting and dropping NaN on an already-trimmed frame would discard another
# num_lags rows each time the cell is re-run. assign() returns a new DataFrame,
# so prices_df itself is left untouched.
rf_prices_df = prices_df.assign(
    **{column: prices_df['Close'].shift(i) for i, column in enumerate(lag_columns, start=1)}
)

# Drop rows with NaN values (this includes the first num_lags rows, whose lags are incomplete)
rf_prices_df = rf_prices_df.dropna()

# The split below only makes sense if rows remain for training
assert len(rf_prices_df) > test_days, (
    f'Need more than {test_days} complete rows to split, got {len(rf_prices_df)}'
)

# Split data into train and test sets - use all but the last test_days rows for training,
# then use the last test_days rows for backtesting
train_df = rf_prices_df.iloc[:-test_days]
test_df = rf_prices_df.iloc[-test_days:]

# Separate lagged features and target variable
X_train = train_df[lag_columns]
y_train = train_df['Close']



## Create and Train the Model

In [None]:
# Random forest configuration: an ensemble of 100 trees with a fixed random_state
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**forest_settings)

# Fit the forest on the training rows: lagged closes as inputs, same-row close as target
rf_model.fit(X_train, y_train)

## Predict using Historic Data (Backtest)

In [None]:
# Backtest inputs: reuse exactly the feature columns (and column order) the model was
# trained on instead of repeating the list by hand, so the training and test features
# cannot drift apart if the lag set changes
X_test = test_df[list(X_train.columns)]

# Actual closing prices for the held-out rows
y_true = test_df['Close']


In [None]:
# Backtest: predict the closing price for every held-out row in X_test.
# X_test and y_true are both taken from test_df, so predictions[k] corresponds
# to the k-th value of y_true.
predictions = rf_model.predict(X_test)

In [None]:
# Score the backtest: mean squared error between the actual and predicted closes
mse = mean_squared_error(y_true=y_true, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

## Predict the Future

In [None]:
# Define the number of days to predict into the future
days_to_predict = 30

# Generate future dates starting the day after the last known date in rf_prices_df.
# NOTE(review): freq='D' produces every calendar day, weekends included. If the price
# history only contains trading days, a business-day frequency (freq='B') may be what
# is intended - confirm against the data.
last_known_date = rf_prices_df.index[-1]
rf_future_dates = pd.date_range(
    start=last_known_date + pd.Timedelta(days=1),
    periods=days_to_predict,
    freq='D',
)

# Seed values: the most recent num_lags closing prices. The forecast loop needs them
# to build the lag features for the first future dates. Tied to num_lags (not a
# literal 7) so the seed always matches the number of lags the model expects.
last7_df = rf_prices_df['Close'].tail(num_lags)

# Single-column frame holding the seed closes followed by one NaN placeholder row per
# future date. Building it with an explicit column name and reindex avoids relying on
# how concat names a Series column and on positional column renaming afterwards.
rf_future_df = last7_df.to_frame(name='Close').reindex(
    last7_df.index.append(rf_future_dates)
)



In [None]:
# Forecast recursively, one future date at a time.
# Lags and predictions have to be computed together: each prediction needs the
# previous num_lags closes, and each predicted close becomes a lag for later dates.

# Column position of 'Close', looked up once for positional access below
close_position = rf_future_df.columns.get_loc('Close')

for date in rf_future_dates:
    # Row position of the date being predicted (same for every lag, so computed once)
    index_position = rf_future_df.index.get_loc(date)

    # A negative position would silently wrap around to the end of the frame,
    # so insist on enough preceding rows before reading the lags
    assert index_position >= num_lags, (
        'rf_future_df must contain num_lags seed rows before the first future date'
    )

    # Closes from 1, 2, ..., num_lags rows back, i.e. in Close_Lag1..Close_LagN order
    lagged_features = [
        rf_future_df.iloc[index_position - i, close_position]
        for i in range(1, num_lags + 1)
    ]

    # Present the single feature row with the same column names the model was fitted
    # on. predict returns an array; take its first (and only) element.
    prediction = rf_model.predict(
        pd.DataFrame([lagged_features], columns=X_train.columns)
    )[0]

    # Store the predicted price so it can serve as a lag for the following dates
    rf_future_df.loc[date, 'Close'] = prediction

In [None]:
# Clean up rf_future_df:
# keep only the forecast rows and rename Close to Predicted_Close.
# Selecting by rf_future_dates (instead of dropping a fixed number of leading rows)
# removes the historical seed rows however many there are, and makes this cell safe
# to re-run: a second run selects the same rows and the rename has nothing left to change.
rf_future_df = (
    rf_future_df
    .loc[rf_future_dates]
    .rename(columns={'Close': 'Predicted_Close'})
)

In [None]:
# Show the first and last few forecast rows
display(rf_future_df.head(), rf_future_df.tail())

In [None]:
# One figure with a single set of axes
fig, ax = plt.subplots(figsize=(12, 6))

# Historical closes followed by the forecast closes
ax.plot(prices_df['Close'], label='Historical Prices')
ax.plot(rf_future_df.index, rf_future_df['Predicted_Close'], label='Predicted Prices')

# Title, axis labels and legend
ax.set_title('Historical and Predicted Stock Prices using Random Forest')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
ax.legend()

# Write the figure to disk, then render it in the notebook
fig.savefig('rf_predict.png')
plt.show()

In [None]:
# Print the feature importances learned from the training data

# Feature names, in training column order
feature_list = list(X_train.columns)

# Training features as a numpy array (not used further in this cell)
features = np.array(X_train)

# Numerical feature importances, aligned with feature_list
importances = list(rf_model.feature_importances_)

# Rank on the unrounded importances so that display rounding cannot change the order,
# then round to two decimals for presentation only
ranked_pairs = sorted(zip(feature_list, importances), key=lambda pair: pair[1], reverse=True)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_pairs]

# Print each feature with its importance, most important first
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))