# random_forest_model-checkpoint.ipynb

## Notebook Purpose
This notebook is designed to implement, train, and evaluate a Random Forest model on historical cryptocurrency data. It will also use the trained model to make predictions and visualize the results.

## Instructions
1. **Import Necessary Libraries**:
   - Import `pandas` for data manipulation.
   - Import `RandomForestRegressor` from `sklearn.ensemble` for the Random Forest model.
   - Import `matplotlib` for data visualization.
   - Import other required libraries as needed.

2. **Prepare Data**:
   - Load the preprocessed data from the CSV file.
   - Create lagged features (the previous 7 closing prices) to use as input for the model.
   - Split the data chronologically into training and testing sets, holding out the last 90 rows for testing.

3. **Train the Model**:
   - Initialize the Random Forest model.
   - Train the model using the training data.
   - Evaluate the model using the testing data.

4. **Predict Using Historic Data**:
   - Use the trained model to make predictions on the test set.
   - Calculate and print the Mean Squared Error (MSE) of the predictions.

5. **Predict Future Prices**:
   - Generate future dates for prediction.
   - Use the trained model to predict future prices one day at a time, feeding each prediction back in as a lagged input for the following days.
   - Visualize the predicted future prices alongside the historical prices.

6. **Feature Importance**:
   - Display the feature importances from the Random Forest model.

## Example Code
```python
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the preprocessed data
data_path = 'data/historical_data/btc_usd_preprocessed.csv'  # Update this path based on the selected cryptocurrency
prices_df = pd.read_csv(data_path, parse_dates=['Date'], index_col='Date')

# Prepare data
# Create lagged features: Close_Lag{i} holds the 'Close' value from i rows earlier.
# The column names are built once from num_lags so every later step stays in sync
# when the number of lags is changed.
num_lags = 7
lag_columns = [f'Close_Lag{i}' for i in range(1, num_lags + 1)]
rf_prices_df = prices_df.copy()
for i, lag_column in enumerate(lag_columns, start=1):
    rf_prices_df[lag_column] = rf_prices_df['Close'].shift(i)
# dropna() removes every row containing a NaN, which includes the first
# num_lags rows (they have no complete lag history).
rf_prices_df = rf_prices_df.dropna()

# Split data into training and testing sets (chronological: the last
# test_size rows are held out, everything before them is used for training)
test_size = 90
if len(rf_prices_df) <= test_size:
    raise ValueError(
        f'Need more than {test_size} usable rows to split the data, got {len(rf_prices_df)}'
    )
train_df = rf_prices_df.iloc[:-test_size]
test_df = rf_prices_df.iloc[-test_size:]
X_train = train_df[lag_columns]
y_train = train_df['Close']
X_test = test_df[lag_columns]
y_true = test_df['Close']

# Train the model (100 trees, fixed random_state)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict using historic data
predictions = rf_model.predict(X_test)
mse = mean_squared_error(y_true, predictions)
print(f'Mean Squared Error: {mse}')

# Predict future prices
days_to_predict = 30
last_known_date = rf_prices_df.index[-1]
rf_future_dates = pd.date_range(start=last_known_date + pd.Timedelta(days=1), periods=days_to_predict, freq='D')

# Rolling window of the most recent closes, newest first, so that position k
# is the value for Close_Lag{k+1}. Lags are taken by row position, exactly as
# shift() built them for training, so a gap in the date index cannot break the
# lookup.
recent_closes = rf_prices_df['Close'].iloc[-num_lags:].tolist()[::-1]
predicted_closes = []
for _ in range(days_to_predict):
    # Use the same column names the model was fitted with
    lagged_features = pd.DataFrame([recent_closes], columns=lag_columns)
    # predict returns an array; take its single element
    prediction = rf_model.predict(lagged_features)[0]
    predicted_closes.append(prediction)
    # The new prediction becomes lag 1 for the next day; the oldest lag drops off
    recent_closes = [prediction] + recent_closes[:-1]
rf_future_df = pd.DataFrame({'Predicted_Close': predicted_closes}, index=rf_future_dates)

# Plot historical and predicted prices
plt.figure(figsize=(12, 6))
plt.title('Historical and Predicted Stock Prices using Random Forest')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.plot(prices_df['Close'], label='Historical Prices')
plt.plot(rf_future_df.index, rf_future_df['Predicted_Close'], label='Predicted Prices')
plt.legend()
plt.savefig('rf_predict.png')
plt.show()

# Display feature importances (rounded to 2 decimals, most important first)
feature_list = list(X_train.columns)
importances = list(rf_model.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)
for feature, importance in feature_importances:
    print(f'Variable: {feature:20} Importance: {importance}')


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Global variable for prices DataFrame
# NOTE(review): a `global` statement at module (cell) scope has no effect --
# names assigned at this level are already module-level. `prices_df` is
# actually created by the pd.read_csv call in the data-loading cell below.
global prices_df


In [None]:
# Read the preprocessed price history; 'Date' is parsed and used as the index
data_path = 'data/historical_data/btc_usd_preprocessed.csv'  # Update this path based on the selected cryptocurrency
prices_df = pd.read_csv(data_path, parse_dates=['Date'], index_col='Date')

# Number of previous 'Close' values attached to each row as model inputs
num_lags = 7

# Build every Close_Lag{n} column up front: lag n is 'Close' shifted down n rows
lagged_closes = {
    f'Close_Lag{lag}': prices_df['Close'].shift(lag)
    for lag in range(1, num_lags + 1)
}

# assign() returns a new frame, leaving prices_df untouched. dropna() then
# discards every row that still contains a NaN, which includes the first
# num_lags rows (they have no complete lag history).
rf_prices_df = prices_df.assign(**lagged_closes).dropna()


In [None]:
# Chronological split: the last `test_size` rows are held out for backtesting
# and every earlier row is used for training.
test_size = 90

# Fail early with a clear message instead of handing the model an empty
# training set when the history is too short.
if len(rf_prices_df) <= test_size:
    raise ValueError(
        f'Need more than {test_size} usable rows to split the data, got {len(rf_prices_df)}'
    )

train_df = rf_prices_df.iloc[:-test_size]
test_df = rf_prices_df.iloc[-test_size:]

# Derive the lag column names from num_lags (set in the data-preparation cell)
# so the feature list cannot drift from the columns that were actually built.
feature_columns = [f'Close_Lag{i}' for i in range(1, num_lags + 1)]

# Separate lagged features (inputs) and target variable (same-row 'Close')
X_train = train_df[feature_columns]
y_train = train_df['Close']

X_test = test_df[feature_columns]
y_true = test_df['Close']


In [None]:
# Forest hyperparameters: 100 trees, with random_state pinned to 42
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_params)

# Fit the forest on the training rows (lagged closes -> same-row close)
rf_model.fit(X_train, y_train)


In [None]:
# Backtest: score the held-out rows (X_test) with the fitted forest
predictions = rf_model.predict(X_test)

# Mean squared error between the actual closes and the predicted closes
mse = mean_squared_error(y_true=y_true, y_pred=predictions)
print(f'Mean Squared Error: {mse}')


In [None]:
# Define the number of days to predict into the future
days_to_predict = 30

# The recursive forecast is seeded with one full window of known closes
if len(rf_prices_df) < num_lags:
    raise ValueError(
        f'Need at least {num_lags} rows of history to forecast, got {len(rf_prices_df)}'
    )

# Generate future dates starting the day after the last row of rf_prices_df
last_known_date = rf_prices_df.index[-1]
rf_future_dates = pd.date_range(start=last_known_date + pd.Timedelta(days=1), periods=days_to_predict, freq='D')

# Column names matching the Close_Lag{i} features the model was trained on.
# Passing a named frame to predict() keeps the input consistent with the
# fitted feature names instead of relying on list position alone.
lag_feature_names = [f'Close_Lag{i}' for i in range(1, num_lags + 1)]

# Rolling window of the most recent closes, newest first, so that position k
# holds the value for Close_Lag{k+1}. Lags are taken by row position, exactly
# as shift() built them for training, so a gap in the date index cannot cause
# a failed lookup.
recent_closes = rf_prices_df['Close'].iloc[-num_lags:].tolist()[::-1]

# Predict one day at a time, feeding each prediction back in as the newest lag
predicted_closes = []
for _ in range(days_to_predict):
    lagged_features = pd.DataFrame([recent_closes], columns=lag_feature_names)

    # predict returns an array, so get the first (only) element
    prediction = rf_model.predict(lagged_features)[0]
    predicted_closes.append(prediction)

    # Slide the window: the new prediction becomes lag 1, the oldest lag drops off
    recent_closes = [prediction] + recent_closes[:-1]

# One row per future date, holding only the predicted values
rf_future_df = pd.DataFrame({'Predicted_Close': predicted_closes}, index=rf_future_dates)

display(rf_future_df.head())
display(rf_future_df.tail())


In [None]:
# One 12x6-inch figure with a single axes, titled and labelled up front
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Historical and Predicted Stock Prices using Random Forest')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')

# Historical closes (x taken from the frame's index), then the forecast
ax.plot(prices_df['Close'], label='Historical Prices')
ax.plot(rf_future_df.index, rf_future_df['Predicted_Close'], label='Predicted Prices')

# Legend built from the two line labels
ax.legend()

# Write the figure to disk, then render it
fig.savefig('rf_predict.png')
plt.show()


In [None]:
# Report how much each lag feature contributes to the fitted forest

# Feature names, in the column order of the training inputs
feature_list = list(X_train.columns)

# Importance scores from the fitted model
importances = list(rf_model.feature_importances_)

# Pair each name with its score rounded to 2 decimals, ranked highest first.
# The sort is stable, so features that tie after rounding keep their original
# column order.
feature_importances = sorted(
    zip(feature_list, (round(score, 2) for score in importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# One line per feature, name padded to 20 characters
for feature, importance in feature_importances:
    print(f'Variable: {feature:20} Importance: {importance}')
