In [2]:
import pandas as pd
from fetch_crypto_data import fetch_crypto_data, calculate_metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder


In [None]:
# --- Configuration -----------------------------------------------------------
crypto_pair = "ETHUSDT"    # trading pair to analyse; swap in any other pair
start_date = "2000-10-01"  # starting date passed to the data fetcher

# Window sizes forwarded to calculate_metrics; use whatever integers suit your
# analysis (train_model documents variable1 as the look-back period and
# variable2 as the look-forward period).
variable1, variable2 = 5, 7

# --- Pipeline ----------------------------------------------------------------
# 1) pull the historical data for the pair, 2) derive the trading metrics.
data = fetch_crypto_data(crypto_pair, start_date)
data_with_metrics = calculate_metrics(data, variable1, variable2)


In [10]:
# Make sure every row is tagged with its trading pair: when the metrics frame
# has no 'Pair' column yet, fill one in with the pair configured above.
pair_column_present = 'Pair' in data_with_metrics.columns
if not pair_column_present:
    data_with_metrics['Pair'] = crypto_pair

# Learn an integer code for each distinct pair name. The fitted encoder is kept
# so the same mapping can be reused later when building prediction inputs.
label_encoder = LabelEncoder()
label_encoder.fit(data_with_metrics['Pair'])

# Store the integer codes alongside the original text labels.
data_with_metrics['Pair_encoded'] = label_encoder.transform(data_with_metrics['Pair'])

In [5]:
def train_model(data, variable1, variable2, test_size=0.2, random_state=42, shuffle=True):
    """
    Trains a RandomForestRegressor to predict the future high/low percentage
    differences from the look-back metrics plus the encoded cryptocurrency pair.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the feature and target columns
      named below (including 'Pair_encoded').
    - variable1 (int): Look-back period; selects the '..._Last_{variable1}_Days'
      feature columns.
    - variable2 (int): Look-forward period; selects the '..._Next_{variable2}_Days'
      target columns.
    - test_size (float or int): Forwarded to train_test_split. Defaults to 0.2.
    - random_state (int): Seed used for both the split and the forest. Defaults to 42.
    - shuffle (bool): Forwarded to train_test_split. Defaults to True (the
      original behaviour). NOTE(review): if the rows are in chronological order,
      a shuffled split mixes neighbouring days into both sets; pass shuffle=False
      to hold out the last rows instead — confirm which is appropriate here.

    Returns:
    - model: Trained RandomForestRegressor model.
    - mse: Mean Squared Error of the model on the test set (scikit-learn's
      default averaging over the two targets).
    - mae: Mean Absolute Error of the model on the test set (same averaging).

    Raises:
    - KeyError: If any required feature/target column is absent from `data`.
    - ValueError: If fewer than two rows remain after dropping NaN rows.
    """
    # Dynamically define feature and target column names based on variable1 and variable2
    feature_columns = [
        f'Days_Since_High_Last_{variable1}_Days',
        f'%_Diff_From_High_Last_{variable1}_Days',
        f'Days_Since_Low_Last_{variable1}_Days',
        f'%_Diff_From_Low_Last_{variable1}_Days',
        'Pair_encoded'
    ]
    target_columns = [
        f'%_Diff_From_High_Next_{variable2}_Days',
        f'%_Diff_From_Low_Next_{variable2}_Days'
    ]
    required_columns = feature_columns + target_columns

    # Fail early with a message that names the missing columns, instead of the
    # generic pandas lookup error (same exception type as before: KeyError).
    missing_columns = [column for column in required_columns if column not in data.columns]
    if missing_columns:
        raise KeyError(
            f"Input data is missing required columns: {missing_columns}. "
            f"Check that the metrics were calculated with variable1={variable1} "
            f"and variable2={variable2}."
        )

    # Drop rows with NaN values in either features or targets
    combined_data = data[required_columns].dropna()

    # A split needs at least one training row and one test row.
    if len(combined_data) < 2:
        raise ValueError(
            f"Need at least 2 complete rows to train and evaluate, but only "
            f"{len(combined_data)} remain after dropping rows with NaN values."
        )

    features = combined_data[feature_columns]
    targets = combined_data[target_columns]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )

    # Initialize and train the model
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)

    # Predict on the test set and evaluate model performance
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    print(f"Model Mean Squared Error (MSE): {mse}")
    print(f"Model Mean Absolute Error (MAE): {mae}")

    return model, mse, mae




In [8]:
# Fit the regressor on the prepared metrics and keep its test-set error scores
model, mse, mae = train_model(
    data=data_with_metrics,
    variable1=variable1,
    variable2=variable2,
)

Model Mean Squared Error (MSE): 2.7771032681755816
Model Mean Absolute Error (MAE): 0.8187734291394608


In [11]:
def predict_outcomes(model, days_since_high, diff_from_high, days_since_low, diff_from_low, pair_encoded):
    """
    Predicts future price differences using the trained model.

    Parameters:
    - model: Fitted estimator exposing `predict`; expected to return one row
      holding two values (high difference, low difference).
    - days_since_high: Value for the 'Days_Since_High_Last_*' feature.
    - diff_from_high: Value for the '%_Diff_From_High_Last_*' feature.
    - days_since_low: Value for the 'Days_Since_Low_Last_*' feature.
    - diff_from_low: Value for the '%_Diff_From_Low_Last_*' feature.
    - pair_encoded: Encoded cryptocurrency pair ('Pair_encoded' feature).

    Returns:
    - (predicted_high_diff, predicted_low_diff): The two predicted values.

    Raises:
    - ValueError: If the model reports a number of training features different
      from the five values supplied here.
    """
    feature_values = [days_since_high, diff_from_high, days_since_low, diff_from_low, pair_encoded]

    # Prefer the column names the model recorded at fit time so the input always
    # lines up with the training data, even if the notebook-level `variable1`
    # was changed (or is undefined) after training.
    trained_feature_names = getattr(model, 'feature_names_in_', None)
    if trained_feature_names is not None:
        feature_names = list(trained_feature_names)
        if len(feature_names) != len(feature_values):
            raise ValueError(
                f"Model was trained on {len(feature_names)} features "
                f"({feature_names}), but {len(feature_values)} values were supplied."
            )
    else:
        # Fallback for models without recorded feature names: rebuild the names
        # from the module-level `variable1`, as the original implementation did.
        feature_names = [
            f'Days_Since_High_Last_{variable1}_Days',
            f'%_Diff_From_High_Last_{variable1}_Days',
            f'Days_Since_Low_Last_{variable1}_Days',
            f'%_Diff_From_Low_Last_{variable1}_Days',
            'Pair_encoded',
        ]

    # Create a single-row DataFrame with the input features for prediction
    input_data = pd.DataFrame([feature_values], columns=feature_names)

    # Use the trained model to make a prediction based on the input data
    prediction = model.predict(input_data)

    # Extract the predicted differences from the model's output (first row)
    predicted_high_diff, predicted_low_diff = prediction[0]

    # Print the predicted percentage differences from future high and low
    print(f"Predicted % Diff from Future High: {predicted_high_diff}")
    print(f"Predicted % Diff from Future Low: {predicted_low_diff}")

    # Return the predicted differences for further use
    return predicted_high_diff, predicted_low_diff

# Sample inputs for a quick smoke test of predict_outcomes
days_since_high, diff_from_high = 3, 4.2   # days since the last high / % difference from it
days_since_low, diff_from_low = 2, -1.5    # days since the last low / % difference from it

# Translate the pair name into the integer code learned by the LabelEncoder
pair_encoded = label_encoder.transform([crypto_pair])[0]

# Run the sample inputs through the trained model
predicted_high, predicted_low = predict_outcomes(
    model=model,
    days_since_high=days_since_high,
    diff_from_high=diff_from_high,
    days_since_low=days_since_low,
    diff_from_low=diff_from_low,
    pair_encoded=pair_encoded,
)

Predicted % Diff from Future High: -32.908452076072734
Predicted % Diff from Future Low: 52.569752759960835
