# random_forest_model.ipynb

## Notebook Purpose
This notebook develops and trains a Random Forest model on the preprocessed cryptocurrency data. The trained model will be used for making predictions in subsequent notebooks. A Random Forest is an ensemble method: it averages the predictions of many decision trees, which typically yields more stable and accurate predictions than a single decision tree.

## Instructions
1. **Import Necessary Libraries**:
   - Import `pandas` for data manipulation.
   - Import `train_test_split` from `sklearn.model_selection` for splitting the data.
   - Import `RandomForestRegressor` from `sklearn.ensemble` for the model.
   - Import `joblib` for saving the trained model.

2. **Load Preprocessed Data**:
   - Load the preprocessed CSV file created in the previous notebooks.

3. **Train the Random Forest Model**:
   - Split the data into training and testing sets.
   - Train the Random Forest model on the historical data.

4. **Save the Trained Model**:
   - Save the trained model to a file for later use in making predictions.

5. **Evaluate Model Performance**:
   - Evaluate the model's performance on held-out data using appropriate metrics (e.g., the $R^2$ score or the mean squared error).

## Example Code
```python
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

# Load preprocessed data
data_path = 'data/historical_data/btc_usd_preprocessed.csv'  # Update this path based on the selected cryptocurrency
data = pd.read_csv(data_path, parse_dates=['Date'], index_col='Date')

# Build the features and the next-day target.
# The target for each row is the following row's Close, so the final row has
# no target and is dropped from both X and y to keep them aligned.
# NOTE(review): assumes rows are sorted by ascending Date — confirm in preprocessing.
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
X = data[feature_columns].iloc[:-1]
y = data['Close'].shift(-1).iloc[:-1]

# Split the data into training and testing sets.
# shuffle=False keeps the split chronological: the model is trained on the
# first 80% of the history and evaluated on the most recent 20%. A shuffled
# split would mix future rows into the training set and inflate the score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model (the 'models/' directory must already exist)
joblib.dump(model, 'models/random_forest_model.pkl')

# Display model performance
print(f"Model trained. R^2 score on test data: {model.score(X_test, y_test)}")


In [None]:
# Cell 1: Imports
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from datetime import datetime
import numpy as np
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed from
# this point on.
# NOTE(review): this also hides deprecation and input-validation warnings from
# the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Declare prices_df as a global name.
# NOTE: at module/cell level a `global` statement has no effect in Python; it
# does not create or assign `prices_df`. This statement only signals that the
# notebook expects `prices_df` to exist — presumably a DataFrame with a 'Close'
# column provided from outside this cell; TODO confirm where it is defined.
global prices_df


In [None]:
# Cell 2: Copy prices DataFrame
# `prices_df` is never assigned in this notebook, so on a fresh kernel it does
# not exist. If it has not been provided from outside, load the preprocessed,
# Date-indexed CSV so that Restart Kernel -> Run All works. An already-defined
# `prices_df` is used unchanged.
if 'prices_df' not in globals():
    prices_df = pd.read_csv(
        'data/historical_data/btc_usd_preprocessed.csv',  # Update this path based on the selected cryptocurrency
        parse_dates=['Date'],
        index_col='Date',
    )

# Work on a copy so the lag features added for the Random Forest never modify prices_df
rf_prices_df = prices_df.copy()


In [None]:
# Cell 3: Create lagging features
# How many previous closes each row receives as predictors
num_lags = 7

# Build all lag columns in one pass: Close_Lag{k} is the Close from k rows earlier
lagged_closes = {
    f'Close_Lag{k}': rf_prices_df['Close'].shift(k)
    for k in range(1, num_lags + 1)
}

# Attach the lag columns and discard every row that contains a NaN
# (shifting leaves the first num_lags rows without a complete set of lags)
rf_prices_df = rf_prices_df.assign(**lagged_closes).dropna()


In [None]:
# Cell 4: Split data into train and test sets
# Hold out the final 90 rows for backtesting; everything before them is training data
train_df, test_df = rf_prices_df.iloc[:-90], rf_prices_df.iloc[-90:]

# Predictors are the lagged closes; the target is the close of the same row
train_lag_names = [f'Close_Lag{lag}' for lag in range(1, num_lags + 1)]
X_train = train_df.loc[:, train_lag_names]
y_train = train_df.loc[:, 'Close']


In [None]:
# Cell 5: Create and train the Random Forest model
# Hyperparameters: a forest of 100 trees with a fixed seed for reproducible results
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_settings)

# Fit the forest on the lagged-close training features and their targets
rf_model.fit(X_train, y_train)


In [None]:
# Cell 6: Backtest with historical data
# Pull the lagged-close predictors and the actual closes out of the hold-out frame
backtest_lag_names = [f'Close_Lag{lag}' for lag in range(1, num_lags + 1)]
X_test = test_df.loc[:, backtest_lag_names]
y_true = test_df.loc[:, 'Close']

# Predict the close for every row of the hold-out period
predictions = rf_model.predict(X_test)

# Score the backtest: mean of the squared differences between actual and predicted closes
mse = mean_squared_error(y_true, predictions)
print(f'Mean Squared Error: {mse}')


In [None]:
# Cell 7: Predict the future
# Recursive multi-step forecast: each predicted close is fed back in as a lag
# feature for the following day's prediction.

# Define the number of days to predict into the future
days_to_predict = 30

# Generate daily future dates starting the day after the last date in rf_prices_df
last_known_date = rf_prices_df.index[-1]
rf_future_dates = pd.date_range(start=last_known_date + pd.Timedelta(days=1), periods=days_to_predict, freq='D')

# Feature names in the order used to build the lag columns (Close_Lag1 = most recent close).
# Passing them to predict() keeps the input consistent with a model fitted on named columns.
lag_feature_names = [f'Close_Lag{i}' for i in range(1, num_lags + 1)]

# Seed the rolling window with the last `num_lags` known closes (oldest first).
# The window size follows num_lags instead of a hard-coded 7 so the two cannot drift apart.
close_history = rf_prices_df['Close'].tail(num_lags).tolist()
if len(close_history) < num_lags:
    raise ValueError(
        f'Need at least {num_lags} historical rows to build lag features, got {len(close_history)}'
    )

# Predict one day at a time, extending the window with each prediction
future_closes = []
for _ in rf_future_dates:
    # Most recent value first: element 0 is lag 1, element 1 is lag 2, ...
    lagged_features = close_history[-1:-num_lags - 1:-1]
    features_df = pd.DataFrame([lagged_features], columns=lag_feature_names)

    # Predict the price for the current future date using the lagged features
    prediction = rf_model.predict(features_df)[0]

    # The prediction becomes the newest "known" close for the next iteration
    close_history.append(prediction)
    future_closes.append(prediction)

# Build the result frame once: one predicted close per future date
rf_future_df = pd.DataFrame({'Predicted_Close': future_closes}, index=rf_future_dates)

# Display the first and last few rows of the predictions
display(rf_future_df.head())
display(rf_future_df.tail())


In [None]:
# Cell 8: Plot historical and predicted stock prices
# Build the figure through the explicit Axes interface
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Historical and Predicted Stock Prices using Random Forest')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')

# Draw the known closes first, then the forecast that continues after them
ax.plot(prices_df['Close'], label='Historical Prices')
ax.plot(rf_future_df.index, rf_future_df['Predicted_Close'], label='Predicted Prices')

# Label the two series
ax.legend()

# Write the figure to disk, then render it in the notebook
fig.savefig('rf_predict.png')
plt.show()


In [None]:
# Cell 9: Print feature importances
# Saving feature names for later use
feature_list = list(X_train.columns)

# Get numerical feature importances (one value per training column)
importances = rf_model.feature_importances_

# Rank on the raw importances, most important first. Sorting before rounding
# keeps features that differ by less than 0.01 in their true order instead of
# leaving them tied after rounding.
ranked_importances = sorted(
    zip(feature_list, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# List of (variable, importance) tuples, rounded to two decimals for display only
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked_importances]

# Print out the feature importances
for feature, importance in feature_importances:
    print(f'Variable: {feature:20} Importance: {importance}')
