# NO PROFIT/LOSS TARGET
# LOG(X1) - LOG(X0)
# SHOULD CLOSE AT THE END OF THE DAY

In [None]:
import sys
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import *
from sklearn.mixture import GaussianMixture
from sktime.forecasting.model_selection import SlidingWindowSplitter
from numba import jit
import joblib
import os
import shutil
import json
from sktime.forecasting.model_selection import SlidingWindowSplitter

# Make the shared project helpers importable, then pull them into this namespace.
sys.path.append(
    os.path.abspath("/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans")
)
from utils import *

# Every warning category is silenced for the rest of the run.
warnings.filterwarnings("ignore")

# Constants
INITIAL_CAPITAL = 100
RISK_FREE_RATE = 0.01

# Load trading parameters from CSV; only the row selected by `param_row` is used.
trading_params = pd.read_csv("params.csv")
param_row = 0
param_dict = dict(trading_params.iloc[param_row, :])


def _int_param(name):
    """Return the trading parameter *name* from ``param_dict`` coerced to ``int``."""
    return int(param_dict[name])


# Extract trading parameters (string-valued ones are used as read from the CSV row).
INSTRUMENT = param_dict["instrument"]
MAX_CLUSTER_LABELS = _int_param("max_cluster_labels")
PRICE_HISTORY_LENGTH = _int_param("price_history_length")
NUM_PERCEPTUALLY_IMPORTANT_POINTS = _int_param("num_perceptually_important_points")
DISTANCE_MEASURE = _int_param("distance_measure")
NUM_CLUSTERS = _int_param("num_clusters")
ATR_MULTIPLIER = _int_param("atr_multiplier")
CLUSTERING_ALGORITHM = param_dict["clustering_algorithm"]
RANDOM_SEED = _int_param("random_seed")
TRAIN_PERIOD = _int_param("train_period")
TEST_PERIOD = _int_param("test_period")


# Candidate clustering estimators, keyed by the value expected in
# CLUSTERING_ALGORITHM. Both are instantiated up front with the same
# cluster count and seed.
clustering_estimator_dict = dict(
    kmeans=KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_SEED),
    gaussian_mixture=GaussianMixture(
        n_components=NUM_CLUSTERS,
        covariance_type="tied",
        random_state=RANDOM_SEED,
    ),
)


# @jit(nopython=True)
def calculate_sharpe_ratio(returns):
    """Return a Sharpe-style ratio for an array of period returns.

    The numerator is the mean of ``returns`` after subtracting the
    module-level ``RISK_FREE_RATE``; the denominator is the standard
    deviation of the raw ``returns`` plus ``1e-6`` so a zero-variance
    series does not divide by zero.
    """
    mean_excess = np.mean(returns - RISK_FREE_RATE)
    volatility = np.std(returns) + 1e-6
    return mean_excess / volatility


@jit(nopython=True)
def find_perceptually_important_points(price_data, num_points):
    """Pick ``num_points`` perceptually important points (PIPs) from a price window.

    The first and last bars are always selected. Each further point is the
    not-yet-selected interior bar with the largest distance (as computed by
    ``calculate_point_distance``) from the segment joining its two
    neighbouring, already-selected points.

    Args:
        price_data: 1-D array of prices.
        num_points: Number of points to select; must be at least 2.

    Returns:
        ``(point_indices, point_prices)``: positions into ``price_data``
        (int64, ascending) and the prices at those positions (float64).

    Raises:
        ValueError: If ``num_points < 2``, or if no eligible bar is left to
            select (window too short for ``num_points``, or the distances
            are NaN).
    """
    if num_points < 2:
        # Without this guard the two endpoint writes below would run past the
        # end of the arrays, which nopython mode does not bounds-check.
        raise ValueError("num_points must be at least 2")

    point_indices = np.zeros(num_points, dtype=np.int64)
    point_prices = np.zeros(num_points, dtype=np.float64)
    point_indices[0], point_indices[1] = 0, len(price_data) - 1
    point_prices[0], point_prices[1] = price_data[0], price_data[-1]

    for current_point in range(2, num_points):
        # Start below any real distance so that a perfectly flat window
        # (every distance == 0.0) still yields a candidate.
        max_distance, max_distance_index, insert_index = -1.0, -1, -1
        for i in range(1, len(price_data) - 1):
            left_adj = (
                np.searchsorted(point_indices[:current_point], i, side="right") - 1
            )
            if point_indices[left_adj] == i:
                # Bar i is already a selected point; scoring it again could
                # re-select it and produce a duplicate.
                continue
            right_adj = left_adj + 1
            distance = calculate_point_distance(
                price_data,
                point_indices[:current_point],
                point_prices[:current_point],
                i,
                left_adj,
                right_adj,
            )
            if distance > max_distance:
                max_distance, max_distance_index, insert_index = distance, i, right_adj

        if insert_index < 0:
            raise ValueError(
                "no eligible bar left to select as a perceptually important point"
            )

        # Shift the tail one slot to the right to open a gap at insert_index,
        # keeping the selected points ordered by position.
        point_indices[insert_index + 1 : current_point + 1] = point_indices[
            insert_index:current_point
        ]
        point_prices[insert_index + 1 : current_point + 1] = point_prices[
            insert_index:current_point
        ]
        point_indices[insert_index], point_prices[insert_index] = (
            max_distance_index,
            price_data[max_distance_index],
        )

    return point_indices, point_prices


@jit(nopython=True)
def calculate_point_distance(
    data, point_indices, point_prices, index, left_adj, right_adj
):
    """Distance of the bar at ``index`` from the segment between two selected points.

    ``left_adj`` and ``right_adj`` are positions into ``point_indices`` /
    ``point_prices`` identifying the segment's endpoints. The metric is
    chosen by the module-level ``DISTANCE_MEASURE``:

    * ``1``: sum of the Euclidean distances from the bar to both endpoints;
    * ``2``: perpendicular distance from the bar to the line through the endpoints;
    * anything else: vertical (price-axis) distance from the bar to that line.
    """
    left_x, left_y = point_indices[left_adj], point_prices[left_adj]
    right_x, right_y = point_indices[right_adj], point_prices[right_adj]
    candidate_x, candidate_y = index, data[index]

    # Line through the two endpoints (computed for every measure).
    slope = (right_y - left_y) / (right_x - left_x)
    intercept = left_y - left_x * slope

    if DISTANCE_MEASURE == 1:
        to_left = ((left_x - candidate_x) ** 2 + (left_y - candidate_y) ** 2) ** 0.5
        to_right = (
            (right_x - candidate_x) ** 2 + (right_y - candidate_y) ** 2
        ) ** 0.5
        return to_left + to_right

    vertical_gap = abs((slope * candidate_x + intercept) - candidate_y)
    if DISTANCE_MEASURE == 2:
        return vertical_gap / (slope**2 + 1) ** 0.5
    return vertical_gap


# @jit(nopython=True)
def determine_trade_outcome(future_highs, future_lows, take_profit, stop_loss):
    """Classify a trade as win (1), loss (-1) or undecided (0).

    ``future_highs`` / ``future_lows`` are the bars following the entry.
    On the first bar the take-profit check runs before the stop-loss check.
    On later bars whichever level is reached first decides the outcome, and
    a bar that reaches both levels yields 0. If neither level is ever
    reached the result is 0.
    """
    # First bar: take-profit is tested before stop-loss.
    if future_highs[0] >= take_profit:
        return 1
    if future_lows[0] <= stop_loss:
        return -1

    # argmax gives the first True position, or 0 when there is none; bar 0
    # was ruled out above, so 0 here means "never reached".
    first_tp_bar = np.argmax(future_highs >= take_profit)
    first_sl_bar = np.argmax(future_lows <= stop_loss)
    tp_reached = first_tp_bar != 0
    sl_reached = first_sl_bar != 0

    if tp_reached and sl_reached:
        if first_tp_bar < first_sl_bar:
            return 1
        if first_sl_bar < first_tp_bar:
            return -1
        return 0
    if tp_reached:
        return 1
    if sl_reached:
        return -1
    return 0


def prepare_data(price_subset, full_price_data=None, last_test_index=None):
    """Build one feature row per bar from a rolling window of close prices.

    For every position ``index`` from ``PRICE_HISTORY_LENGTH`` to the end of
    ``price_subset``, the preceding ``PRICE_HISTORY_LENGTH`` closes are
    reduced to ``NUM_PERCEPTUALLY_IMPORTANT_POINTS`` perceptually important
    points, which are standardized per window. The calendar/time columns of
    the window's last bar are attached to the row.

    Args:
        price_subset: DataFrame with a ``close`` column and the columns
            ``year``, ``month``, ``day_of_week``, ``hour`` and ``minute``.
        full_price_data: Currently unused; kept so existing call sites work.
        last_test_index: Currently unused; kept so existing call sites work.

    Returns:
        DataFrame with columns ``price_point_0..N-1``, the five time columns
        and ``trade_outcome``. It has no rows (and no columns) when
        ``price_subset`` has at most ``PRICE_HISTORY_LENGTH`` rows.
    """
    time_feature_columns = ["year", "month", "day_of_week", "hour", "minute"]
    # Pull the closes out once instead of re-slicing the Series every iteration.
    close_prices = price_subset["close"].values
    scaler = StandardScaler()
    data_list = []

    for index in range(PRICE_HISTORY_LENGTH, len(price_subset)):
        # index >= PRICE_HISTORY_LENGTH, so the window is always full length.
        price_history = close_prices[index - PRICE_HISTORY_LENGTH : index]

        _, important_points = find_perceptually_important_points(
            price_history, NUM_PERCEPTUALLY_IMPORTANT_POINTS
        )
        # The scaler is refit on each window, so every row is standardized
        # against its own points only.
        scaled_points = scaler.fit_transform(important_points.reshape(-1, 1)).flatten()
        data_point = {
            f"price_point_{i}": scaled_points[i]
            for i in range(NUM_PERCEPTUALLY_IMPORTANT_POINTS)
        }

        # Time features come from the last bar inside the window.
        j = index - 1
        data_point.update(price_subset.iloc[j][time_feature_columns].to_dict())

        # TODO: trade_outcome is a constant placeholder. The notes at the top
        # of the file suggest it is meant to become a log-price difference
        # closed at end of day; that computation is not implemented here.
        data_point["trade_outcome"] = 0

        data_list.append(data_point)

    return pd.DataFrame(data_list)


def evaluate_cluster_performance_df(price_data_df, train_best_clusters_df, clustering_model):
    """Score the clusters chosen during training on another set of prepared rows.

    Rows of ``price_data_df`` are assigned to clusters with
    ``clustering_model.predict``. For every ``(cluster_label, signal)`` pair
    in ``train_best_clusters_df`` the trade outcomes of the rows in that
    cluster are accumulated (sign-flipped when ``signal`` is 0) and passed
    to ``calculate_evaluation_metrics``.

    Returns:
        DataFrame with one row per pair and the columns ``signal``,
        ``cluster_label``, ``calmar_ratio``, ``annualized_return``,
        ``max_drawdown``, ``actual_return`` and ``num_trades``.
    """
    metric_names = (
        "calmar_ratio",
        "annualized_return",
        "max_drawdown",
        "actual_return",
        "num_trades",
    )
    feature_names = [
        f"price_point_{i}" for i in range(NUM_PERCEPTUALLY_IMPORTANT_POINTS)
    ]
    feature_names += ["day_of_week", "hour", "minute"]

    # Features and outcome share one matrix; the outcome is the last column.
    sample_matrix = price_data_df[feature_names + ["trade_outcome"]].values
    outcomes = sample_matrix[:, -1]
    assigned_clusters = clustering_model.predict(sample_matrix[:, :-1])
    selected_clusters = train_best_clusters_df[["cluster_label", "signal"]].values

    records = []
    for cluster_label, signal in selected_clusters:
        cluster_outcomes = outcomes[assigned_clusters == cluster_label]
        running_total = np.cumsum(cluster_outcomes)
        if signal == 0:
            # Signal 0 trades the opposite direction, so returns flip sign.
            running_total = -running_total

        metrics = calculate_evaluation_metrics(running_total, cluster_outcomes)
        record = {"signal": signal, "cluster_label": cluster_label}
        record.update(zip(metric_names, metrics))
        records.append(record)

    return pd.DataFrame(records)


@jit(nopython=True)
def calculate_max_drawdown(portfolio_values):
    """Return the largest relative drop from a running peak.

    Walks ``portfolio_values`` in order, tracking the highest value seen so
    far, and returns the maximum of ``(peak - value) / peak``. The result is
    0.0 when the series never falls below its running peak.
    """
    worst_drawdown = 0.0
    high_water_mark = portfolio_values[0]

    for position in range(1, len(portfolio_values)):
        current_value = portfolio_values[position]
        if current_value > high_water_mark:
            high_water_mark = current_value
        decline = (high_water_mark - current_value) / high_water_mark
        if decline > worst_drawdown:
            worst_drawdown = decline

    return worst_drawdown


@jit(nopython=True)
def calculate_trading_metrics(trade_outcomes):
    """Derive a trade direction and performance figures from per-trade outcomes.

    The direction ``signal`` is 1 when the summed outcomes are positive and
    0 otherwise; for signal 0 the cumulative series is negated before the
    metrics are computed. The equity curve starts at ``INITIAL_CAPITAL``.

    Returns:
        ``(signal, calmar_ratio, annualized_return, max_drawdown,
        actual_return, num_trades)``. ``annualized_return`` is the
        end-over-start ratio minus one; no time scaling is applied here.
        All zeros when ``trade_outcomes`` is empty.
    """
    num_trades = len(trade_outcomes)
    if num_trades == 0:
        return 0, 0.0, 0.0, 0.0, 0.0, 0

    running_total = np.cumsum(trade_outcomes)
    if running_total[-1] > 0:
        signal = 1
    else:
        signal = 0
        running_total = -running_total

    # Equity curve: starting capital followed by capital plus running total.
    equity_curve = np.zeros(num_trades + 1)
    equity_curve[0] = INITIAL_CAPITAL
    equity_curve[1:] = running_total + INITIAL_CAPITAL

    opening_value = equity_curve[0]
    closing_value = equity_curve[-1]
    annualized_return = (closing_value / opening_value) - 1
    max_drawdown = calculate_max_drawdown(equity_curve)
    # The small epsilon keeps the ratio finite when there is no drawdown.
    calmar_ratio = annualized_return / (max_drawdown + 1e-6)
    actual_return = closing_value - opening_value

    return (
        signal,
        calmar_ratio,
        annualized_return,
        max_drawdown,
        actual_return,
        num_trades,
    )


@jit(nopython=True)
def calculate_evaluation_metrics(cumulative_return, trade_outcomes):
    """Compute performance figures from an already-signed cumulative return series.

    ``cumulative_return`` is added to ``INITIAL_CAPITAL`` to form the equity
    curve; ``trade_outcomes`` is only used for the empty check and the trade
    count.

    Returns:
        ``(calmar_ratio, annualized_return, max_drawdown, actual_return,
        num_trades)``. ``annualized_return`` is the end-over-start ratio
        minus one; no time scaling is applied here. All zeros when
        ``trade_outcomes`` is empty.
    """
    num_trades = len(trade_outcomes)
    if num_trades == 0:
        return 0.0, 0.0, 0.0, 0.0, 0

    # Equity curve: starting capital followed by capital plus running total.
    equity_curve = np.zeros(len(cumulative_return) + 1)
    equity_curve[0] = INITIAL_CAPITAL
    equity_curve[1:] = cumulative_return + INITIAL_CAPITAL

    opening_value = equity_curve[0]
    closing_value = equity_curve[-1]
    annualized_return = (closing_value / opening_value) - 1
    max_drawdown = calculate_max_drawdown(equity_curve)
    # The small epsilon keeps the ratio finite when there is no drawdown.
    calmar_ratio = annualized_return / (max_drawdown + 1e-6)
    actual_return = closing_value - opening_value

    return (
        calmar_ratio,
        annualized_return,
        max_drawdown,
        actual_return,
        num_trades,
    )


def cluster_and_evaluate_price_data(price_data_df):
    """Fit the configured clustering model and rank clusters by total outcome.

    The estimator registered under ``CLUSTERING_ALGORITHM`` is fit on the
    price-point and time features, and its labels are written into
    ``price_data_df`` as a new ``cluster_label`` column (the input frame is
    modified). The ``MAX_CLUSTER_LABELS`` clusters with the largest absolute
    summed ``trade_outcome`` are then scored with
    ``calculate_trading_metrics``.

    Returns:
        ``(best_clusters_df, clustering_model)``: one row per retained
        cluster, and the fitted estimator.
    """
    feature_columns = [
        f"price_point_{i}" for i in range(NUM_PERCEPTUALLY_IMPORTANT_POINTS)
    ] + ["day_of_week", "hour", "minute"]
    price_features = price_data_df[feature_columns].values

    clustering_model = clustering_estimator_dict[CLUSTERING_ALGORITHM]
    clustering_model.fit(price_features)
    price_data_df["cluster_label"] = clustering_model.predict(price_features)

    # Rank clusters by the magnitude of their summed outcome, either sign.
    outcome_totals = price_data_df.groupby("cluster_label")["trade_outcome"].sum()
    top_clusters_df = outcome_totals.abs().nlargest(MAX_CLUSTER_LABELS).reset_index()

    def summarize_cluster(cluster_label):
        """Return the metrics record for a single cluster label."""
        in_cluster = price_data_df["cluster_label"] == cluster_label
        cluster_outcomes = price_data_df.loc[in_cluster, "trade_outcome"].values
        (
            signal,
            calmar_ratio,
            annualized_return,
            max_drawdown,
            actual_return,
            num_trades,
        ) = calculate_trading_metrics(cluster_outcomes)
        return {
            "signal": signal,
            "cluster_label": cluster_label,
            "calmar_ratio": calmar_ratio,
            "annualized_return": annualized_return,
            "max_drawdown": max_drawdown,
            "actual_return": actual_return,
            "num_trades": num_trades,
        }

    best_clusters_list = [
        summarize_cluster(label) for label in top_clusters_df["cluster_label"]
    ]
    return pd.DataFrame(best_clusters_list), clustering_model


# PROJECT_DIR = "/projects/genomic-ml/da2343/ml_project_2"
PROJECT_DIR = "/Users/newuser/Projects/robust_algo_trader"
# Load the config file
config_path = f"{PROJECT_DIR}/settings/config.json"
with open(config_path) as f:
    config = json.load(f)

# Instrument settings are keyed by the instrument name without its "_M15" suffix.
instrument_dict = config["traded_instruments"][INSTRUMENT.split("_M15")[0]]
# Pre-fitted scaler for the time features, loaded from disk.
time_scaler = joblib.load(f"{PROJECT_DIR}/unsupervised/kmeans/ts_scaler_2018.joblib")
price_data = pd.read_csv(
    f"{PROJECT_DIR}/data/gen_oanda_data/{INSTRUMENT}_raw_data.csv",
    parse_dates=["time"],
    index_col="time",
)

# Restrict to the backtest date range
price_data = price_data.loc["2019-01-01":"2024-06-01"]
# prepare_data reads a `log_close` column, so it has to exist before the
# backtest runs; the companion data-loading cell builds the same two columns.
price_data["log_close"] = np.log(price_data["close"])
price_data["log_close_diff"] = price_data["log_close"].diff().fillna(0)
# Calendar/time features derived from the timestamp index
price_data["year"] = price_data.index.year
price_data["month"] = price_data.index.month
price_data["day_of_week"] = price_data.index.dayofweek
price_data["hour"] = price_data.index.hour
price_data["minute"] = price_data.index.minute
# "atr" here is the single-bar range abs(high - low), not a rolling average
price_data["atr"] = (price_data["high"] - price_data["low"]).abs()
# Clip the range to the per-instrument bounds from the config
price_data["atr_clipped"] = np.clip(price_data["atr"], instrument_dict['atr_min'], instrument_dict['atr_max'])
# Transform the time features with the pre-fitted scaler
time_columns = ["day_of_week", "hour", "minute"]
price_data[time_columns] = time_scaler.transform(price_data[time_columns])
# Round price and scaled time columns to 6 decimals
columns_to_round = ['open', 'high', 'low', 'close', 'log_close', 'atr', 'atr_clipped', "day_of_week", "hour", "minute"]
price_data[columns_to_round] = price_data[columns_to_round].round(6)


# print the head of the price data
print("price_data.head()")
print(price_data.head())

# Initialize the sliding window splitter for backtesting.
# OrderedSlidingWindowSplitter is not defined in this notebook; presumably it
# comes from the star import of the project's `utils` module (verify there).
window_splitter = OrderedSlidingWindowSplitter(
    train_weeks=TRAIN_PERIOD, test_weeks=TEST_PERIOD, step_size=1
)

backtest_results = []
for window, (train_indices, test_indices) in enumerate(window_splitter.split(price_data), 1):
    print(f"Processing window {window}...")
    train_data = price_data.iloc[train_indices, :]
    test_data = price_data.iloc[test_indices, :]
    last_test_index = test_indices[-1]

    # Prepare training data and perform clustering
    print("Preparing training data and clustering...")
    train_price_data = prepare_data(train_data)
    if train_price_data.empty:
        # A window with no more rows than the price history yields no samples;
        # clustering it would fail on the missing feature columns.
        print(f"Window {window}: no training samples, skipping.")
        continue
    train_best_clusters, clustering_model = cluster_and_evaluate_price_data(
        train_price_data
    )
    if train_best_clusters.empty:
        continue

    # Prepare test data and evaluate cluster performance
    print("Preparing test data and evaluating cluster performance...")
    test_price_data = prepare_data(test_data, price_data, last_test_index)
    if test_price_data.empty:
        print(f"Window {window}: no test samples, skipping.")
        continue
    test_cluster_performance = evaluate_cluster_performance_df(
        test_price_data, train_best_clusters, clustering_model
    )
    if test_cluster_performance.empty:
        continue

    # Compile results for this window
    print("Compiling results...")
    window_result = {
        "window": window,
        "train_total_annualized_return": train_best_clusters[
            "annualized_return"
        ].sum(),
        "train_total_actual_return": train_best_clusters["actual_return"].sum(),
        "train_total_trades": train_best_clusters["num_trades"].sum(),
        "test_total_annualized_return": test_cluster_performance[
            "annualized_return"
        ].sum(),
        "test_total_actual_return": test_cluster_performance["actual_return"].sum(),
        "test_total_trades": test_cluster_performance["num_trades"].sum(),
    }
    backtest_results.append(window_result)

if not backtest_results:
    # Without any rows the column lookups below would fail with an opaque
    # KeyError; fail with an explicit message instead.
    raise RuntimeError("Backtest produced no results: every window was skipped.")

# Compile final results
results_df = pd.DataFrame(backtest_results)
results_df["train_cumulative_actual_return"] = results_df[
    "train_total_actual_return"
].cumsum()
# Each ratio is a single number computed over all windows and repeated on
# every row of the results table.
results_df["train_sharpe_ratio"] = calculate_sharpe_ratio(
    results_df["train_total_annualized_return"].values
)
results_df["test_cumulative_actual_return"] = results_df[
    "test_total_actual_return"
].cumsum()
results_df["test_sharpe_ratio"] = calculate_sharpe_ratio(
    results_df["test_total_annualized_return"].values
)
# Same ratio computed on the sign-flipped test returns.
results_df["test_inverse_sharpe_ratio"] = calculate_sharpe_ratio(
    -1 * results_df["test_total_annualized_return"].values
)

# Add constant parameters to the results
results_df["max_cluster_labels"] = MAX_CLUSTER_LABELS
results_df["num_clusters"] = NUM_CLUSTERS
results_df["clustering_algorithm"] = CLUSTERING_ALGORITHM
results_df["train_period"] = TRAIN_PERIOD
results_df["test_period"] = TEST_PERIOD
results_df["random_seed"] = RANDOM_SEED
results_df["instrument"] = INSTRUMENT
results_df["num_perceptually_important_points"] = NUM_PERCEPTUALLY_IMPORTANT_POINTS

# save results to csv (create the output directory first; to_csv does not)
out_file = f"results/{param_row}.csv"
os.makedirs(os.path.dirname(out_file), exist_ok=True)
results_df.to_csv(out_file, encoding="utf-8", index=False)
print("Backtesting completed.")

In [1]:
import sys
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import *
from sklearn.mixture import GaussianMixture
from sktime.forecasting.model_selection import SlidingWindowSplitter
from numba import jit
import joblib
import os
import shutil
import json
from sktime.forecasting.model_selection import SlidingWindowSplitter


INSTRUMENT = "EUR_USD_M15"
PROJECT_DIR = "/projects/genomic-ml/da2343/ml_project_2"
# PROJECT_DIR = "/Users/newuser/Projects/robust_algo_trader"

# Read the project configuration.
config_path = f"{PROJECT_DIR}/settings/config.json"
with open(config_path) as f:
    config = json.load(f)

# Instrument settings are keyed by the instrument name without its "_M15" suffix.
instrument_dict = config["traded_instruments"][INSTRUMENT.split("_M15")[0]]
# Pre-fitted scaler for the time features, loaded from disk.
time_scaler = joblib.load(f"{PROJECT_DIR}/unsupervised/kmeans/ts_scaler_2018.joblib")
price_data = pd.read_csv(
    f"{PROJECT_DIR}/data/gen_oanda_data/{INSTRUMENT}_raw_data.csv",
    parse_dates=["time"],
    index_col="time",
)

# Keep the chosen date range, then derive the log-price and calendar columns.
# `assign` evaluates the callables in order, so `log_close_diff` can read the
# freshly added `log_close`.
price_data = price_data.loc["2019-01-01":"2020-01-01"].assign(
    log_close=lambda frame: np.log(frame["close"]),
    log_close_diff=lambda frame: frame["log_close"].diff().fillna(0),
    year=lambda frame: frame.index.year,
    month=lambda frame: frame.index.month,
    day_of_week=lambda frame: frame.index.dayofweek,
    hour=lambda frame: frame.index.hour,
    minute=lambda frame: frame.index.minute,
)

# Transform the time features with the pre-fitted scaler.
time_columns = ["day_of_week", "hour", "minute"]
price_data[time_columns] = time_scaler.transform(price_data[time_columns])

# Round price and scaled time columns to 6 decimals.
columns_to_round = ['open', 'high', 'low', 'close', 'log_close', "day_of_week", "hour", "minute"]
price_data[columns_to_round] = price_data[columns_to_round].round(6)


In [2]:
price_data

Unnamed: 0_level_0,open,high,low,close,volume,log_close,log_close_diff,year,month,day_of_week,hour,minute
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01 22:00:00+00:00,1.14649,1.14658,1.14632,1.14632,10,0.136557,0.000000,2019,1,-0.696396,1.516820,-1.341716
2019-01-01 22:15:00+00:00,1.14632,1.14653,1.14630,1.14648,36,0.136696,0.000140,2019,1,-0.696396,1.516820,-0.447275
2019-01-01 22:30:00+00:00,1.14653,1.14653,1.14616,1.14637,107,0.136600,-0.000096,2019,1,-0.696396,1.516820,0.447167
2019-01-01 22:45:00+00:00,1.14638,1.14650,1.14618,1.14641,484,0.136635,0.000035,2019,1,-0.696396,1.516820,1.341608
2019-01-01 23:00:00+00:00,1.14640,1.14672,1.14579,1.14660,768,0.136801,0.000166,2019,1,-0.696396,1.661283,-1.341716
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-01 22:45:00+00:00,1.12155,1.12162,1.12149,1.12152,31,0.114685,0.000000,2020,1,-0.032217,1.516820,1.341608
2020-01-01 23:00:00+00:00,1.12149,1.12224,1.12149,1.12201,195,0.115122,0.000437,2020,1,-0.032217,1.661283,-1.341716
2020-01-01 23:15:00+00:00,1.12198,1.12210,1.12198,1.12210,16,0.115202,0.000080,2020,1,-0.032217,1.661283,-0.447275
2020-01-01 23:30:00+00:00,1.12212,1.12218,1.12210,1.12210,11,0.115202,0.000000,2020,1,-0.032217,1.661283,0.447167
