# In [12]:
import pandas as pd
import numpy as np


def load_and_preprocess_data(file_path, crypto_symbol, features):
    """
    Load price data for one cryptocurrency and add technical-indicator columns.

    Parameters
    ----------
    file_path : str, path object or file-like
        Passed straight to ``pandas.read_csv``. The data must contain a
        ``Date`` column and a ``Symbol`` column.
    crypto_symbol : str
        Value of the ``Symbol`` column whose rows are kept.
    features : list of str
        Columns to retain from the raw data. Must include ``'Adj'``, because
        every engineered indicator below is computed from that column.

    Returns
    -------
    pandas.DataFrame
        One row per record of ``crypto_symbol`` with ``Date`` as a regular
        column, the requested ``features``, and the engineered columns
        ``Moving_Avg_{7,14,30}``, ``Lag_{1,2,3}``, ``Volatility_{7,14}``,
        ``EMA_{7,14}``, ``RSI``, ``MACD`` and ``Signal_Line``.

    Raises
    ------
    ValueError
        If any entry of ``features`` is not a column of the loaded data.
    """
    # Load the data and index it by date
    crypto_data = pd.read_csv(file_path)
    crypto_data['Date'] = pd.to_datetime(crypto_data['Date'])
    crypto_data.set_index('Date', inplace=True)

    # Ensure all required features exist in the dataset
    missing_features = [f for f in features if f not in crypto_data.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the dataset: {missing_features}")

    # Filter for the specific cryptocurrency and selected features.
    # .copy() gives us an independent frame, so the column assignments below
    # never write into (or warn about) a slice of crypto_data.
    crypto_series = crypto_data[crypto_data['Symbol'] == crypto_symbol][features].copy()
    adj = crypto_series['Adj']

    # Feature engineering (column insertion order is part of the output)
    # Moving Averages
    for window in (7, 14, 30):
        crypto_series[f'Moving_Avg_{window}'] = adj.rolling(window=window).mean()

    # Lagged Features
    for lag in (1, 2, 3):
        crypto_series[f'Lag_{lag}'] = adj.shift(lag)

    # Volatility
    for window in (7, 14):
        crypto_series[f'Volatility_{window}'] = adj.rolling(window=window).std()

    # Exponential Moving Average
    for span in (7, 14):
        crypto_series[f'EMA_{span}'] = adj.ewm(span=span, adjust=False).mean()

    # Momentum Indicator: Relative Strength Index (RSI)
    # gain/loss are kept as Series on the same Date index as crypto_series.
    # Wrapping them in a fresh pd.Series (RangeIndex) would make the final
    # assignment align on nothing and produce an all-NaN RSI column.
    delta = adj.diff()
    gain = delta.where(delta > 0, 0.0)      # NaN (first row) compares False -> 0
    loss = (-delta).where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    crypto_series['RSI'] = 100 - (100 / (1 + rs))

    # Moving Average Convergence Divergence
    crypto_series['MACD'] = (adj.ewm(span=12, adjust=False).mean()
                             - adj.ewm(span=26, adjust=False).mean())
    crypto_series['Signal_Line'] = crypto_series['MACD'].ewm(span=9, adjust=False).mean()

    # Fill NaN values with backward fill first (covers the warm-up rows of the
    # rolling/shifted columns), then forward fill whatever is left.
    # NOTE: back-filling copies later values into earlier rows.
    # DataFrame.bfill()/ffill() replace the deprecated fillna(method=...).
    crypto_series = crypto_series.bfill().ffill()

    # Expose Date as a regular column again
    crypto_series.reset_index(inplace=True)

    return crypto_series


    

# In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go


def mean_absolute_percentage_error(y_true, y_pred):
    """
    Return the Mean Absolute Percentage Error (MAPE) in percent.

    Both inputs are converted to NumPy arrays; the result is the mean of
    ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return relative_errors.mean() * 100


def _build_price_figure(crypto_series, crypto_symbol, test_dates, predictions,
                        buy_signals, sell_signals):
    """Assemble the candlestick chart with prediction, moving-average and signal overlays."""
    fig = go.Figure()

    # candlestick chart ('Adj' is used as the closing price)
    fig.add_trace(go.Candlestick(
        x=crypto_series['Date'],
        open=crypto_series['Open'],
        high=crypto_series['High'],
        low=crypto_series['Low'],
        close=crypto_series['Adj'],
        name="OHLC"
    ))

    # predicted values (test period only)
    fig.add_trace(go.Scatter(
        x=test_dates,
        y=predictions,
        mode='lines',
        name='Predicted Data',
        line=dict(color='orange', dash='dot')
    ))

    # moving averages
    for column, label, color in (
        ('Moving_Avg_7', '7-Day MA', 'blue'),
        ('Moving_Avg_14', '14-Day MA', 'purple'),
        ('Moving_Avg_30', '30-Day MA', 'green'),
    ):
        fig.add_trace(go.Scatter(
            x=crypto_series['Date'],
            y=crypto_series[column],
            mode='lines',
            name=label,
            line=dict(color=color)
        ))

    # buy / sell signals
    for signals, label, color, symbol in (
        (buy_signals, 'Buy Signals', 'green', 'triangle-up'),
        (sell_signals, 'Sell Signals', 'red', 'triangle-down'),
    ):
        fig.add_trace(go.Scatter(
            x=signals['Date'],
            y=signals['Adj'],
            mode='markers',
            name=label,
            marker=dict(color=color, size=10, symbol=symbol)
        ))

    fig.update_layout(
        title=f"Interactive Candlestick Chart with Buy & Sell Signals and Moving Averages for {crypto_symbol}",
        xaxis_title="Date",
        yaxis_title="Price (USD)",
        xaxis_rangeslider_visible=True,
        template="plotly_white",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )
    return fig


def run_random_forest_pipeline(crypto_series, crypto_symbol, test_size=0.2,
                               n_estimators=100, random_state=42):
    """
    Fit a random-forest regressor on the engineered features, score it and chart it.

    Parameters
    ----------
    crypto_series : pandas.DataFrame
        Must contain ``Date``, ``Adj``, ``Open``, ``High``, ``Low``, ``Volume``
        and the engineered columns ``Moving_Avg_{7,14,30}``, ``Lag_{1,2,3}``,
        ``Volatility_{7,14}`` and ``EMA_{7,14}``.
    crypto_symbol : str
        Only used in the chart title.
    test_size : float, default 0.2
        Passed to ``train_test_split``. The split is not shuffled, so the
        test set is the final part of the frame.
    n_estimators : int, default 100
        Number of trees in the ``RandomForestRegressor``.
    random_state : int, default 42
        Seed passed to both the split and the model.

    Returns
    -------
    accuracy : float
        ``100 - MAPE`` of the predictions on the test set.
    fig : plotly.graph_objects.Figure
        Candlestick chart with predictions, moving averages and buy/sell markers.
    market_state_df : pandas.DataFrame
        Per test row: ``Date``, ``Actual Price``, ``Predicted Price`` and
        ``Market State`` ("Uptrend" when the prediction is above the actual
        price, "Downtrend" when below, otherwise "Stable").

    Raises
    ------
    ValueError
        If any required column is missing from ``crypto_series``.
    """
    # Prepare features and target
    features = ['Open', 'High', 'Low', 'Volume',
                'Moving_Avg_7', 'Moving_Avg_14', 'Moving_Avg_30',
                'Lag_1', 'Lag_2', 'Lag_3',
                'Volatility_7', 'Volatility_14',
                'EMA_7', 'EMA_14']
    target = 'Adj'

    # 'Adj' and 'Date' are read further down (target, signals, chart), so they
    # are validated up front together with the model features instead of
    # surfacing later as a KeyError after the model has already been trained.
    required_columns = features + [target, 'Date']
    missing_features = [f for f in required_columns if f not in crypto_series.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the dataset: {missing_features}")

    X = crypto_series[features]
    y = crypto_series[target]

    # Chronological train-test split (shuffle=False keeps row order)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=False)

    # Random Forest model
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_model.fit(X_train, y_train)

    # Predictions
    rf_predictions = rf_model.predict(X_test)

    # Calculate MAPE and accuracy
    mape = mean_absolute_percentage_error(y_test, rf_predictions)
    accuracy = 100 - mape

    # Generate buy and sell signals: the 7-day MA crosses the 30-day MA on this
    # row (it was on the other side, or equal, on the previous row) and the
    # 14-day MA is on the same side of the 30-day MA.
    ma7 = crypto_series['Moving_Avg_7']
    ma14 = crypto_series['Moving_Avg_14']
    ma30 = crypto_series['Moving_Avg_30']
    prev_ma7 = ma7.shift(1)
    prev_ma30 = ma30.shift(1)
    buy_signals = crypto_series[(ma7 > ma30) & (ma14 > ma30) & (prev_ma7 <= prev_ma30)]
    sell_signals = crypto_series[(ma7 < ma30) & (ma14 < ma30) & (prev_ma7 >= prev_ma30)]

    # Label each test row by comparing the prediction with the actual price
    actual_prices = y_test.to_numpy()
    market_states = np.where(
        rf_predictions > actual_prices, "Uptrend",
        np.where(rf_predictions < actual_prices, "Downtrend", "Stable"))

    # The unshuffled test set is the tail of the frame, so its dates are the
    # last len(y_test) entries of the Date column.
    test_dates = crypto_series['Date'].iloc[-len(y_test):]

    market_state_df = pd.DataFrame({
        'Date': test_dates,
        'Actual Price': actual_prices,
        'Predicted Price': rf_predictions,
        'Market State': market_states
    })

    # Plot
    fig = _build_price_figure(crypto_series, crypto_symbol, test_dates,
                              rf_predictions, buy_signals, sell_signals)

    return accuracy, fig, market_state_df