In [None]:
import sys
import pandas as pd
import numpy as np
import talib
import warnings
import json
import matplotlib.pyplot as plt
import bisect
from sklearn.preprocessing import *
import mpl_toolkits.mplot3d
from sklearn.cluster import *
from sklearn.mixture import *
import ffn as ffn
import empyrical as ep
import pickle
import joblib
# from sklearn.externals.joblib import dump, load
from sktime.forecasting.model_selection import SlidingWindowSplitter

In [None]:
# Silence every warning for the rest of the session (this also hides
# pandas / scikit-learn warnings that may point at real problems).
warnings.filterwarnings("ignore")
# Parameter grid: one row per experiment configuration.
params_df = pd.read_csv("params.csv")


# Positional index of the configuration row used for this run.
param_row = 50

# Get the parameters for this task
param_dict = dict(params_df.iloc[param_row, :])

# Rows per day. 4 * 24 corresponds to 15-minute candles; the price file loaded
# further down is named "..._M15_..." -- presumably the same granularity.
ONE_DAY = 4 * 24
# Starting value of the synthetic portfolio built from the +1/-1 trade labels.
INIT_CAPITAL = 100
# Length of the rolling log-close window handed to find_pips.
N_CLOSE_PTS = int(param_dict["n_close_pts"])
# Number of points find_pips extracts from each window.
# NOTE(review): the clustering functions below hard-code pip_0..pip_4,
# so they only match this setting when it equals 5.
N_PERC_PTS = int(param_dict["n_perc_pts"])
# Selects the formula used by distance(): 1, 2 or 3 (see the comment above find_pips).
DIST_MEASURE = int(param_dict["dist_measure"])
N_CLUSTERS = int(param_dict["n_clusters"])
# Take-profit / stop-loss offset in multiples of log_atr (truncated to an int).
ATR_MULTIPLIER = int(param_dict["atr_multiplier"])
# Key into the `estimators` dict below.
ALGORITHM = param_dict["algorithm"]
# How many clusters (ranked by |sum of future_return|) are kept after training.
MAX_K_LABELS = 2 

random_state = int(param_dict["random_state"])
# Window lengths are given in days in params.csv and converted to rows here.
train_size = int(param_dict["train_size"] * ONE_DAY)
test_size = int(param_dict["test_size"] * ONE_DAY)

# Candidate clustering models, all built eagerly; only estimators[ALGORITHM] is used.
# The same instance is re-fitted on every training window.
estimators = {
    "kmeans" : KMeans(n_clusters=N_CLUSTERS, random_state=random_state),
    "mini_batch_kmeans" : MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=random_state),
    "birch" : Birch(n_clusters=N_CLUSTERS),
    "gaussian_mixture" : GaussianMixture(n_components=N_CLUSTERS, covariance_type='tied', random_state=random_state)
}

def m_ulcer_index(series):
    """Return the Ulcer Index of a value series.

    The index is the root-mean-square of the fractional drawdowns of
    ``series`` measured against its running maximum.

    Args:
        series (pd.Series): Portfolio values in chronological order.

    Returns:
        float: sqrt(mean(drawdown ** 2)); 0.0 when the series never falls
        below a previous peak.
    """
    running_peak = series.cummax()
    # Drawdown is <= 0 everywhere: distance below the peak, relative to the peak.
    relative_drawdown = (series - running_peak) / running_peak
    mean_squared_drawdown = (relative_drawdown**2).mean()
    return mean_squared_drawdown**0.5

# DIST_MEASURE
# 1 = Euclidean Distance
# 2 = Perpendicular Distance
# 3 = Vertical Distance
def find_pips(data, n_pips):
    """Extract ``n_pips`` perceptually important points from ``data``.

    The first and last samples are always selected. Each further pass scans
    every interior sample, measures it with ``distance`` against the segment
    joining its two neighbouring selected points, and inserts the sample with
    the largest distance.

    Args:
        data: Indexable sequence of prices (must support ``data[-1]``).
        n_pips (int): Total number of points wanted, endpoints included.

    Returns:
        tuple: ``(pips_x, pips_y)`` -- positions in ``data`` and the matching
        prices, both ordered by position.
    """
    n_samples = len(data)
    pips_x = [0, n_samples - 1]  # positions of the selected points
    pips_y = [data[0], data[-1]]  # prices of the selected points

    # Two points exist already, so n_pips - 2 insertions remain.
    for _ in range(n_pips - 2):
        best_distance, best_position, best_slot = 0.0, -1, -1
        for position in range(1, n_samples - 1):
            # Slot of the first selected point strictly to the right.
            right_slot = bisect.bisect_right(pips_x, position)
            left_slot = right_slot - 1
            candidate = distance(
                data, pips_x, pips_y, position, left_slot, right_slot
            )
            # Strict comparison: on ties the earliest position wins.
            if candidate > best_distance:
                best_distance = candidate
                best_position = position
                best_slot = right_slot
        # If no sample had a positive distance the -1 defaults are inserted as-is.
        pips_x.insert(best_slot, best_position)
        pips_y.insert(best_slot, data[best_position])
    return pips_x, pips_y

# Define a helper function to calculate the distance
def distance(data, pips_x, pips_y, i, left_adj, right_adj, dist_measure=None):
    """Distance of sample ``i`` from the segment between two selected points.

    Args:
        data: Indexable sequence of prices.
        pips_x: Positions of the points selected so far (sorted).
        pips_y: Prices of the points selected so far.
        i (int): Position in ``data`` of the candidate sample.
        left_adj (int): Slot in ``pips_x`` / ``pips_y`` of the left neighbour.
        right_adj (int): Slot in ``pips_x`` / ``pips_y`` of the right neighbour.
        dist_measure (int, optional): 1 = sum of the Euclidean distances to
            both neighbours, 2 = perpendicular distance to the connecting
            line, 3 = vertical distance to the connecting line. Defaults to
            the module-level ``DIST_MEASURE``.

    Returns:
        float: The distance under the selected measure.

    Raises:
        KeyError: If the measure is not 1, 2 or 3 (same exception type the
            previous dict lookup produced).
    """
    if dist_measure is None:
        dist_measure = DIST_MEASURE

    x_left, y_left = pips_x[left_adj], pips_y[left_adj]
    x_right, y_right = pips_x[right_adj], pips_y[right_adj]
    # Line through the two neighbours: y = slope * x + intercept.
    slope = (y_right - y_left) / (x_right - x_left)
    intercept = y_left - x_left * slope
    x, y = i, data[i]

    if dist_measure == 1:
        to_left = ((x_left - x) ** 2 + (y_left - y) ** 2) ** 0.5
        to_right = ((x_right - x) ** 2 + (y_right - y) ** 2) ** 0.5
        return to_left + to_right

    vertical_gap = abs((slope * x + intercept) - y)
    if dist_measure == 2:
        return vertical_gap / (slope**2 + 1) ** 0.5
    if dist_measure == 3:
        return vertical_gap
    raise KeyError(dist_measure)

def get_pips_df(sub_df, is_train=True):
    """Build one feature/label row per rolling window of ``sub_df``.

    For every position ``index`` from N_CLOSE_PTS up to ``len(sub_df) - 1`` the
    previous N_CLOSE_PTS ``log_close`` values are reduced to N_PERC_PTS points
    by ``find_pips``, standardised, and stored as ``pip_0`` .. ``pip_{n-1}``
    together with the calendar fields of the last candle of the window.
    The label ``future_return`` comes from scanning the later ``log_close``
    values of ``sub_df`` for the first touch of a symmetric band of
    ``ATR_MULTIPLIER * log_atr`` around the last close of the window.

    Args:
        sub_df (pd.DataFrame): Must provide the columns read below
            (log_close, log_atr, year, month, day_of_week, hour, minute).
        is_train (bool): When False the sign of the +1/-1 label is inverted.
            NOTE(review): the reason for inverting test labels is not visible
            in this file -- confirm it is intentional.

    Returns:
        pd.DataFrame: One row per processed window; empty if none succeeded.
    """
    pips_y_list = []
    # loop through the data
    for index in range(N_CLOSE_PTS, len(sub_df)):
        try:
            # Window covers rows [index - N_CLOSE_PTS, index); row `index` is the first "future" row.
            x_close = sub_df["log_close"].iloc[index - N_CLOSE_PTS : index].to_numpy()
            pips_x, pips_y = find_pips(x_close, N_PERC_PTS)
            # Standardise the selected prices per window so only the shape is kept.
            scaled_pips_y = (
                StandardScaler()
                .fit_transform(np.array(pips_y).reshape(-1, 1))
                .reshape(-1)
            )
            pips_y_dict = {f"pip_{i}": scaled_pips_y[i] for i in range(N_PERC_PTS)}
            # j is the last candle inside the window (the decision candle).
            j = index - 1
            pips_y_dict["year"] = sub_df["year"].iloc[j]
            pips_y_dict["month"] = sub_df["month"].iloc[j]
            pips_y_dict["day_of_week"] = sub_df["day_of_week"].iloc[j]
            pips_y_dict["hour"] = sub_df["hour"].iloc[j]
            pips_y_dict["minute"] = sub_df["minute"].iloc[j]
            # future features
            # Take-profit and stop-loss levels, symmetric around the last close.
            tp = sub_df["log_close"].iloc[j] + (
                ATR_MULTIPLIER * sub_df["log_atr"].iloc[j]
            )
            sl = sub_df["log_close"].iloc[j] - (
                ATR_MULTIPLIER * sub_df["log_atr"].iloc[j]
            )
            # Walk forward to the end of sub_df; the first level touched decides the label.
            # The label stays 0 when neither level is reached before the data ends.
            for k in range(index, len(sub_df)):
                if sub_df["log_close"].iloc[k] >= tp:
                    pips_y_dict["future_return"] = 1 if is_train else -1
                    break
                elif sub_df["log_close"].iloc[k] <= sl:
                    pips_y_dict["future_return"] = -1 if is_train else 1
                    break
                else:
                    pips_y_dict["future_return"] = 0
            pips_y_list.append(pips_y_dict)
        except Exception as e:
            # NOTE(review): any error (missing column, too few pips, ...) silently
            # stops the whole loop and the rows collected so far are returned.
            break
    pips_y_df = pd.DataFrame(pips_y_list)
    return pips_y_df

def cluster_and_filter_pips_df(pips_train_df):
    """Fit the configured clustering model and rank clusters by label imbalance.

    The model ``estimators[ALGORITHM]`` is fitted on the pip and time features,
    every row is assigned a ``k_label`` (written into ``pips_train_df``), and
    the MAX_K_LABELS clusters with the largest absolute sum of
    ``future_return`` are evaluated as +1/-1 unit bets on top of INIT_CAPITAL.

    Args:
        pips_train_df (pd.DataFrame): Output of ``get_pips_df`` with the
            columns pip_0..pip_4, day_of_week, hour, minute and future_return.
            A ``k_label`` column is added in place.

    Returns:
        tuple: ``(best_k_labels_df, estimator)`` -- one metrics row per kept
        cluster, and the fitted clustering model to reuse on the test window.
    """
    feature_columns = [
        "pip_0",
        "pip_1",
        "pip_2",
        "pip_3",
        "pip_4",
        "day_of_week",
        "hour",
        "minute",
    ]
    pips_train_np = pips_train_df[feature_columns].to_numpy()
    estimator = estimators[ALGORITHM]
    estimator.fit(pips_train_np)
    pips_train_df["k_label"] = estimator.predict(pips_train_np)

    # Rank clusters by |net outcome|: a strongly negative cluster is as useful
    # as a strongly positive one because its signal can be inverted.
    filter_k_labels_df = (
        pips_train_df.groupby("k_label")["future_return"]
        .sum()
        .reset_index()
        .abs()
        .sort_values(by="future_return", ascending=False)
        .head(MAX_K_LABELS)
    )

    best_k_labels_list = []
    for k_label in filter_k_labels_df["k_label"]:
        pips_y_sub_df = pips_train_df[(pips_train_df["k_label"] == k_label)]
        k_label_cumsum = pips_y_sub_df["future_return"].cumsum().reset_index(drop=True)
        if k_label_cumsum.iloc[-1] > 0:
            signal = 1
        else:
            # Net-losing cluster: trade it in the opposite direction.
            signal = 0
            k_label_cumsum = -k_label_cumsum

        # Equity curve starting at INIT_CAPITAL. It always holds at least the
        # initial value, so no emptiness check is needed.
        portfolio = pd.concat(
            [pd.Series([INIT_CAPITAL]), (k_label_cumsum + INIT_CAPITAL)]
        ).reset_index(drop=True)
        start_k_label_cumsum = portfolio.iloc[0]
        end_k_label_cumsum = portfolio.iloc[-1]

        # Total return over the window (the key name says "annualized", but no
        # annualisation is applied).
        annualized_return = (end_k_label_cumsum / start_k_label_cumsum) - 1
        ulcer_index = m_ulcer_index(portfolio)
        # Small offset keeps the ratio finite when there is no drawdown.
        max_drawdown = abs(ffn.calc_max_drawdown(portfolio)) + 0.001
        calmar_ratio = annualized_return / max_drawdown

        best_k_labels_list.append(
            {
                "signal": signal,
                "k_label": k_label,
                "calmar_ratio": calmar_ratio,
                "ulcer_index": ulcer_index,
                "annualized_return": annualized_return,
                "max_drawdown": max_drawdown,
                "actual_return": end_k_label_cumsum - start_k_label_cumsum,
                "n_trades": len(k_label_cumsum),
            }
        )
    best_k_labels_df = pd.DataFrame(best_k_labels_list)
    # Return the model that was actually fitted above; the previous code
    # returned the undefined name `kmeans` and raised NameError.
    return best_k_labels_df, estimator

def filter_pips_df(pips_y_df, train_best_k_labels_df, estimator):
    """Score the clusters selected in training on another set of windows.

    Every row of ``pips_y_df`` is assigned a ``k_label`` with the fitted
    ``estimator`` (the column is written in place). For each cluster listed in
    ``train_best_k_labels_df`` the ``future_return`` labels of the matching
    rows are accumulated into an equity curve starting at INIT_CAPITAL; the
    curve is inverted when the training signal is 0.

    Args:
        pips_y_df (pd.DataFrame): Rows with pip_0..pip_4, day_of_week, hour,
            minute and future_return.
        train_best_k_labels_df (pd.DataFrame): Needs ``k_label`` and ``signal``.
        estimator: Fitted model exposing ``predict``.

    Returns:
        pd.DataFrame: One metrics row per cluster of ``train_best_k_labels_df``.
    """
    feature_columns = [
        "pip_0",
        "pip_1",
        "pip_2",
        "pip_3",
        "pip_4",
        "day_of_week",
        "hour",
        "minute",
    ]
    feature_matrix = pips_y_df[feature_columns].to_numpy()
    pips_y_df["k_label"] = estimator.predict(feature_matrix)

    test_k_labels_list = []
    for row_pos in range(len(train_best_k_labels_df)):
        selected = train_best_k_labels_df.iloc[row_pos]
        k_label = selected["k_label"]
        signal = selected["signal"]

        in_cluster = pips_y_df["k_label"] == k_label
        k_label_cumsum = (
            pips_y_df.loc[in_cluster, "future_return"].cumsum().reset_index(drop=True)
        )
        if signal == 0:
            # Cluster was net-losing in training, so trade against it.
            k_label_cumsum = -k_label_cumsum

        # Equity curve; it always contains at least the initial capital.
        portfolio = pd.concat(
            [pd.Series([INIT_CAPITAL]), k_label_cumsum + INIT_CAPITAL]
        ).reset_index(drop=True)
        start_k_label_cumsum = portfolio.iloc[0]
        end_k_label_cumsum = portfolio.iloc[-1]

        annualized_return = (end_k_label_cumsum / start_k_label_cumsum) - 1
        max_drawdown = abs(ffn.calc_max_drawdown(portfolio)) + 0.001

        test_k_labels_list.append(
            {
                "signal": signal,
                "k_label": k_label,
                "calmar_ratio": annualized_return / max_drawdown,
                "ulcer_index": m_ulcer_index(portfolio),
                "annualized_return": annualized_return,
                "max_drawdown": max_drawdown,
                "actual_return": end_k_label_cumsum - start_k_label_cumsum,
                "n_trades": len(k_label_cumsum),
            }
        )
    return pd.DataFrame(test_k_labels_list)


# ---- Load the raw candles and derive calendar / log-price features ----
ohlcv_data = pd.read_csv(
    # "/Users/newuser/Projects/robust_algo_trader/data/gen_oanda_data/GBP_USD_M15_raw_data.csv",
    "/projects/genomic-ml/da2343/ml_project_2/data/gen_oanda_data/GBP_USD_M15_raw_data.csv",
    parse_dates=["time"],
)
ohlcv_data = ohlcv_data.set_index("time")
ohlcv_data["year"] = ohlcv_data.index.year
ohlcv_data["month"] = ohlcv_data.index.month
ohlcv_data["day_of_week"] = ohlcv_data.index.dayofweek
ohlcv_data["hour"] = ohlcv_data.index.hour
ohlcv_data["minute"] = ohlcv_data.index.minute
ohlcv_data["log_close"] = np.log(ohlcv_data["close"])
ohlcv_data["log_high"] = np.log(ohlcv_data["high"])
ohlcv_data["log_low"] = np.log(ohlcv_data["low"])
# ATR with timeperiod=1 is computed on log prices and used to size the TP/SL band.
ohlcv_data["log_atr"] = talib.ATR(ohlcv_data["log_high"], ohlcv_data["log_low"], ohlcv_data["log_close"], timeperiod=1)
start_date = "2007-01-01"
end_date = "2015-01-01"
ohlcv_data = ohlcv_data[start_date:end_date]
df = ohlcv_data.copy()

# ---- Walk-forward split: train on `train_size` rows, test on the next `test_size` rows ----
splitter = SlidingWindowSplitter(
    window_length=train_size,
    fh=np.arange(1, test_size + 1),
    step_length=test_size,
)

# Pre-fitted scaler for the time features. It was originally produced with
#   StandardScaler().fit(pips_train_df[["day_of_week", "hour", "minute"]])
# and saved via joblib.dump(ts_scaler, 'ts_scaler.joblib').
# Loaded once here instead of on every window.
ts_scaler = joblib.load('ts_scaler.joblib')
time_feature_columns = ["day_of_week", "hour", "minute"]

# Only this inclusive range of windows is evaluated.
FIRST_WINDOW = 30
LAST_WINDOW = 35

return_df_list = []
for i, (train_idx, test_idx) in enumerate(splitter.split(df)):
    if i < FIRST_WINDOW:
        continue
    # Checked up front so that a rejected window (the `continue`s below)
    # cannot let the loop run past LAST_WINDOW.
    if i > LAST_WINDOW:
        break

    df_train = df.iloc[train_idx, :]
    df_test = df.iloc[test_idx, :]

    # TRAINING
    pips_train_df = get_pips_df(df_train)
    pips_train_df[time_feature_columns] = ts_scaler.transform(
        pips_train_df[time_feature_columns]
    )
    train_best_k_labels_df, estimator = cluster_and_filter_pips_df(pips_train_df)
    # ACCEPT OR REJECT THE TRAIN MODEL
    if train_best_k_labels_df.empty:
        continue

    # TESTING
    pips_test_df = get_pips_df(df_test, is_train=False)
    pips_test_df[time_feature_columns] = ts_scaler.transform(
        pips_test_df[time_feature_columns]
    )
    test_k_labels_df = filter_pips_df(pips_test_df, train_best_k_labels_df, estimator)
    if test_k_labels_df.empty:
        continue

    return_df_list.append(
        {
            "window": i,
            "train_sum_annualized_return": train_best_k_labels_df["annualized_return"].sum(),
            "train_sum_actual_return": train_best_k_labels_df["actual_return"].sum(),
            "train_n_trades": train_best_k_labels_df["n_trades"].sum(),

            "test_sum_annualized_return": test_k_labels_df["annualized_return"].sum(),
            "test_sum_actual_return": test_k_labels_df["actual_return"].sum(),
            "test_n_trades": test_k_labels_df["n_trades"].sum(),
        }
    )

# One row per accepted window; the following cell reads this frame as `return_df`.
return_df = pd.DataFrame(return_df_list)


In [None]:

# Summary columns for the per-window results in `return_df`.
# NOTE(review): this cell uses `return_df`, which is not assigned anywhere above
# it in this notebook, and calc_sharpe_ratio / calc_sortino_ratio / calc_calmar_ratio,
# which are defined in a LATER cell. On a fresh kernel that later cell must be run first.
# Each ratio is a single scalar computed over all windows and broadcast to every row.
return_df["train_cumsum_annualized_return"] = return_df["train_sum_annualized_return"].cumsum()
return_df["train_cumsum_actual_return"] = return_df["train_sum_actual_return"].cumsum()
return_df["train_sharpe_ratio"] = calc_sharpe_ratio(return_df["train_sum_annualized_return"].to_numpy())
return_df["train_sortino_ratio"] = calc_sortino_ratio(return_df["train_sum_annualized_return"].to_numpy())
return_df["train_calmar_ratio"] = calc_calmar_ratio(return_df["train_sum_annualized_return"].to_numpy())

return_df["test_cumsum_annualized_return"] = return_df["test_sum_annualized_return"].cumsum()
return_df["test_cumsum_actual_return"] = return_df["test_sum_actual_return"].cumsum()

# The "negative" variants score the sign-flipped test returns,
# i.e. the result of trading every signal in the opposite direction.
return_df["test_sharpe_ratio"] = calc_sharpe_ratio(return_df["test_sum_annualized_return"].to_numpy())
return_df["test_negative_sharpe_ratio"] = calc_sharpe_ratio(-1* return_df["test_sum_annualized_return"].to_numpy())

return_df["test_sortino_ratio"] = calc_sortino_ratio(return_df["test_sum_annualized_return"].to_numpy())
return_df["test_negative_sortino_ratio"] = calc_sortino_ratio(-1* return_df["test_sum_annualized_return"].to_numpy())
return_df["test_calmar_ratio"] = calc_calmar_ratio(return_df["test_sum_annualized_return"].to_numpy())
return_df["test_negative_calmar_ratio"] = calc_calmar_ratio(-1* return_df["test_sum_annualized_return"].to_numpy())


# Run parameters recorded alongside the results (the first four are currently disabled).
# return_df["n_close_pts"] = N_CLOSE_PTS
# return_df["n_perc_pts"] = N_PERC_PTS
# return_df["dist_measure"] = DIST_MEASURE
# return_df["n_clusters"] = N_CLUSTERS
return_df["train_size"] = train_size
return_df["test_size"] = test_size
return_df["random_state"] = random_state
return_df

In [None]:
import numpy as np

def calc_sharpe_ratio(portfolio_returns):
    """Sharpe ratio of a sequence of returns against a fixed 1% risk-free rate.

    Args:
        portfolio_returns: Sequence or array of period returns.

    Returns:
        float: mean(returns - 0.01) / std(returns), using the population
        standard deviation (numpy default, ddof=0).
    """
    risk_free_rate = 0.01
    mean_excess_return = np.mean(np.array(portfolio_returns) - risk_free_rate)
    volatility = np.std(portfolio_returns)
    return mean_excess_return / volatility


def treynor_ratio(portfolio_returns, benchmark_returns, beta):
    """Mean return in excess of a benchmark, divided by ``beta``.

    Args:
        portfolio_returns: Sequence of portfolio returns.
        benchmark_returns: Sequence of benchmark returns of the same length.
        beta (float): Divisor applied to the mean excess return.

    Returns:
        float: mean(portfolio_returns - benchmark_returns) / beta.
    """
    active_returns = np.array(portfolio_returns) - np.array(benchmark_returns)
    return np.mean(active_returns) / beta

def calc_sortino_ratio(portfolio_returns):
    risk_free_rate = 0.01
    excess_returns = np.array(portfolio_returns) - risk_free_rate
    downside_returns = excess_returns[excess_returns < 0]
    downside_std = np.std(downside_returns)
    sortino_ratio = np.mean(excess_returns) / downside_std
    return sortino_ratio

def calc_calmar_ratio(portfolio_returns):
    """Mean return divided by the magnitude of the maximum drawdown.

    Args:
        portfolio_returns: Array of period (non-cumulative) returns, as
            expected by ``empyrical.max_drawdown``.

    Returns:
        float: mean(returns) / (|max drawdown| + 0.001).
    """
    # empyrical reports drawdown as a non-positive number. Take its magnitude
    # so the ratio keeps the sign of the mean return and the 0.001 offset can
    # never cancel the drawdown to zero. This matches the
    # abs(ffn.calc_max_drawdown(...)) + 0.001 convention used elsewhere in
    # this notebook.
    max_drawdown = abs(ep.max_drawdown(portfolio_returns)) + 0.001
    calmar_ratio = np.mean(portfolio_returns) / max_drawdown
    return calmar_ratio

# Example portfolio returns used to smoke-test the ratio helpers above.
# (The 1% risk-free rate is hard-coded inside the helpers, not set here.)
portfolio_returns = [0.13, -0.09, -0.04, 0.14]  # Example returns
# Sum of the example returns; not used by the code below.
portfolio_returns_sum = sum(portfolio_returns)

# Calculate and print the Sharpe Ratio
sharpe_ratio = calc_sharpe_ratio(portfolio_returns)
print(f"The Sharpe Ratio is: {sharpe_ratio:.2f}")

# Calculate and print the Sortino Ratio
sortino_ratio = calc_sortino_ratio(portfolio_returns)
print(f"The Sortino Ratio is: {sortino_ratio:.2f}")


In [None]:
import pandas as pd
import numpy as np
import talib
import joblib

# Load the saved time scaler
# (fitted elsewhere; expected in the current working directory).
ts_scaler = joblib.load("ts_scaler_2018.joblib")

# Read the CSV file
df = pd.read_csv(
    "/projects/genomic-ml/da2343/ml_project_2/data/gen_oanda_data/GBP_USD_M15_raw_data.csv",
    parse_dates=["time"],
    index_col="time"
)

# Extract date components efficiently
df["year"] = df.index.year
df["month"] = df.index.month
df["day_of_week"] = df.index.dayofweek
df["hour"] = df.index.hour
df["minute"] = df.index.minute

# Calculate ATR
# timeperiod=1 is computed on raw prices (no log transform in this cell).
df["atr"] = talib.ATR(df["high"].values, df["low"].values, df["close"].values, timeperiod=1)
# Clamp the ATR into a fixed band.
# NOTE(review): the origin of the bounds 0.00068 / 0.00176 is not documented in this file.
df['atr_clipped'] = np.clip(df['atr'], 0.00068, 0.00176)

# Filter date range
# The ATR above is computed on the full history first, then the frame is cut down.
df = df.loc["2019-01-01":"2024-01-01"]

# Apply time scaling and rounding in one step
# This overwrites the integer calendar columns with their scaled values.
time_columns = ["day_of_week", "hour", "minute"]
df[time_columns] = np.round(ts_scaler.transform(df[time_columns]), 6)

# Round ATR columns
df[["atr", "atr_clipped"]] = df[["atr", "atr_clipped"]].round(6)

# Prints the frame as plain text (pandas truncates long frames in its repr).
print(df)

In [None]:
import sys
import numpy as np
import pandas as pd
import talib
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import *
from sklearn.mixture import GaussianMixture
from sktime.forecasting.model_selection import SlidingWindowSplitter
from numba import jit
import joblib

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Constants
CANDLES_PER_DAY = 4 * 24  # 15-minute candles
# Starting value of the synthetic portfolio built from the +1/-1 trade outcomes.
INITIAL_CAPITAL = 100
# Subtracted from every return in calculate_sharpe_ratio.
RISK_FREE_RATE = 0.01

# Load trading parameters from CSV
trading_params = pd.read_csv("params.csv")
# Positional index of the configuration row used for this run.
param_row = 2942
param_dict = dict(
    trading_params.iloc[param_row, :]
)  # Parameters of row `param_row` of params.csv

# Extract trading parameters
# Number of clusters kept after training (ranked by |sum of trade_outcome|).
MAX_CLUSTER_LABELS = int(param_dict["max_cluster_labels"])
# Number of closes in each rolling window.
PRICE_HISTORY_LENGTH = int(param_dict["price_history_length"])
# NOTE(review): the clustering functions hard-code price_point_0..price_point_4,
# so they only match this setting when it equals 5.
NUM_PERCEPTUALLY_IMPORTANT_POINTS = int(param_dict["num_perceptually_important_points"])
# 1, 2 or 3; selects the formula in calculate_point_distance.
DISTANCE_MEASURE = int(param_dict["distance_measure"])
NUM_CLUSTERS = int(param_dict["num_clusters"])
# Take-profit / stop-loss offset in multiples of atr_clipped (truncated to an int).
ATR_MULTIPLIER = int(param_dict["atr_multiplier"])
# ATR_MULTIPLIER = 30
# Key into `clustering_models` below.
CLUSTERING_ALGORITHM = param_dict["clustering_algorithm"]
RANDOM_SEED = int(param_dict["random_seed"])
# Periods are given in days in params.csv and converted to candle counts here.
TRAIN_PERIOD = int(param_dict["train_period"] * CANDLES_PER_DAY)
TEST_PERIOD = int(param_dict["test_period"] * CANDLES_PER_DAY)

# Define clustering algorithms
# Both are built eagerly; the selected instance is re-fitted on every training window.
clustering_models = {
    "kmeans": KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_SEED),
    "gaussian_mixture": GaussianMixture(
        n_components=NUM_CLUSTERS, covariance_type="tied", random_state=RANDOM_SEED
    ),
}


@jit(nopython=True)
def calculate_sharpe_ratio(returns):
    """
    Sharpe ratio of an array of returns against RISK_FREE_RATE.

    Args:
    returns (np.array): Array of period returns.

    Returns:
    float: mean(returns - RISK_FREE_RATE) / std(returns).
    """
    mean_excess_return = np.mean(returns - RISK_FREE_RATE)
    volatility = np.std(returns)
    return mean_excess_return / volatility


@jit(nopython=True)
def find_perceptually_important_points(price_data, num_points):
    """
    Select ``num_points`` perceptually important points from a price window.

    The first and last samples are always selected. Each further pass scans
    every interior sample, measures it with ``calculate_point_distance``
    against the segment joining its two neighbouring selected points, and
    inserts the sample with the largest distance.

    Args:
    price_data (np.array): Window of prices (indexed by position).
    num_points (int): Total number of points to return, endpoints included.

    Returns:
    tuple: (point_indices, point_prices) -- positions in ``price_data`` and
    the matching prices, kept ordered by position.
    """
    # Fixed-size output buffers; only the first `current_point` entries are valid at any time.
    point_indices = np.zeros(num_points, dtype=np.int64)
    point_prices = np.zeros(num_points, dtype=np.float64)
    point_indices[0], point_indices[1] = 0, len(price_data) - 1
    point_prices[0], point_prices[1] = price_data[0], price_data[-1]

    for current_point in range(2, num_points):
        max_distance, max_distance_index, insert_index = 0.0, -1, -1
        for i in range(1, len(price_data) - 1):
            # Slot of the last selected point at or before position i.
            left_adj = (
                np.searchsorted(point_indices[:current_point], i, side="right") - 1
            )
            right_adj = left_adj + 1
            distance = calculate_point_distance(
                price_data,
                point_indices[:current_point],
                point_prices[:current_point],
                i,
                left_adj,
                right_adj,
            )
            # Strict comparison: on ties the earliest position wins.
            if distance > max_distance:
                max_distance, max_distance_index, insert_index = distance, i, right_adj

        # Shift the tail one slot to the right to make room at insert_index.
        # NOTE(review): if no sample had a positive distance, insert_index and
        # max_distance_index are still -1 here and are used as-is.
        point_indices[insert_index + 1 : current_point + 1] = point_indices[
            insert_index:current_point
        ]
        point_prices[insert_index + 1 : current_point + 1] = point_prices[
            insert_index:current_point
        ]
        point_indices[insert_index], point_prices[insert_index] = (
            max_distance_index,
            price_data[max_distance_index],
        )

    return point_indices, point_prices


@jit(nopython=True)
def calculate_point_distance(
    data, point_indices, point_prices, index, left_adj, right_adj
):
    """
    Distance of sample ``index`` from the segment between two selected points.

    Args:
    data (np.array): Window of prices.
    point_indices (np.array): Positions of the points selected so far.
    point_prices (np.array): Prices of the points selected so far.
    index (int): Position in ``data`` of the candidate sample.
    left_adj (int): Slot of the left neighbour in the point arrays.
    right_adj (int): Slot of the right neighbour in the point arrays.

    Returns:
    float: With DISTANCE_MEASURE == 1 the sum of the Euclidean distances to
    both neighbours, with 2 the perpendicular distance to the connecting
    line, otherwise the vertical distance to that line.
    """
    left_x, left_y = point_indices[left_adj], point_prices[left_adj]
    right_x, right_y = point_indices[right_adj], point_prices[right_adj]
    # Line through the two neighbours: y = slope * x + intercept.
    slope = (right_y - left_y) / (right_x - left_x)
    intercept = left_y - left_x * slope
    x, y = index, data[index]

    if DISTANCE_MEASURE == 1:
        to_left = ((left_x - x) ** 2 + (left_y - y) ** 2) ** 0.5
        to_right = ((right_x - x) ** 2 + (right_y - y) ** 2) ** 0.5
        return to_left + to_right

    vertical_gap = abs((slope * x + intercept) - y)
    if DISTANCE_MEASURE == 2:
        return vertical_gap / (slope**2 + 1) ** 0.5
    # Any other value is treated as measure 3 (vertical distance).
    return vertical_gap



@jit(nopython=True)
def determine_trade_outcome(future_highs, future_lows, take_profit, stop_loss):
    """
    Decide whether the take-profit or the stop-loss level is reached first.

    Args:
    future_highs (np.array): Array of future high prices.
    future_lows (np.array): Array of future low prices.
    take_profit (float): Take profit level.
    stop_loss (float): Stop loss level.

    Returns:
    int: 1 if the take profit is hit first, -1 if the stop loss is hit first,
    0 if neither level is reached or both are first reached on the same
    later candle.
    """
    # The first candle is handled separately (take profit is checked first),
    # which lets index 0 from argmax below mean "never reached".
    if future_highs[0] >= take_profit:
        return 1
    if future_lows[0] <= stop_loss:
        return -1

    first_tp = np.argmax(future_highs >= take_profit)
    first_sl = np.argmax(future_lows <= stop_loss)
    tp_reached = first_tp != 0
    sl_reached = first_sl != 0

    if tp_reached and (not sl_reached or first_tp < first_sl):
        return 1
    if sl_reached and (not tp_reached or first_sl < first_tp):
        return -1
    return 0




def prepare_test_data(price_subset, full_price_data, last_test_index):
    """
    Build one feature/outcome row per rolling window of the test period.

    Args:
    price_subset (pd.DataFrame): Test-period candles with close, high, low,
        atr_clipped and the calendar columns.
    full_price_data (pd.DataFrame): Complete price frame, used to resolve
        trades still open at the end of ``price_subset``.
    last_test_index (int): Position in ``full_price_data`` from which the
        fallback search for the outcome starts.

    Returns:
    pd.DataFrame: price_point_* features, calendar columns and trade_outcome.
    """
    calendar_columns = ["year", "month", "day_of_week", "hour", "minute"]
    window_scaler = StandardScaler()
    records = []

    for window_end in range(PRICE_HISTORY_LENGTH, len(price_subset)):
        window_start = window_end - PRICE_HISTORY_LENGTH
        closes = price_subset["close"].iloc[window_start:window_end].values
        if len(closes) < PRICE_HISTORY_LENGTH:
            continue

        _, pip_prices = find_perceptually_important_points(
            closes, NUM_PERCEPTUALLY_IMPORTANT_POINTS
        )
        # Standardise per window so only the shape of the move is kept.
        normalized = window_scaler.fit_transform(pip_prices.reshape(-1, 1)).flatten()

        record = {
            f"price_point_{k}": normalized[k]
            for k in range(NUM_PERCEPTUALLY_IMPORTANT_POINTS)
        }
        # The decision candle is the last one inside the window.
        decision_pos = window_end - 1
        record.update(price_subset.iloc[decision_pos][calendar_columns].to_dict())

        entry_price = price_subset["close"].iloc[decision_pos]
        entry_atr = price_subset["atr_clipped"].iloc[decision_pos]
        take_profit = entry_price + (ATR_MULTIPLIER * entry_atr)
        stop_loss = entry_price - (ATR_MULTIPLIER * entry_atr)

        outcome = determine_trade_outcome(
            price_subset["high"].iloc[window_end:].values,
            price_subset["low"].iloc[window_end:].values,
            take_profit,
            stop_loss,
        )
        if outcome == 0:
            # Unresolved inside the test window: keep looking in the full
            # data from the end of the test window onwards.
            outcome = determine_trade_outcome(
                full_price_data["high"].iloc[last_test_index:].values,
                full_price_data["low"].iloc[last_test_index:].values,
                take_profit,
                stop_loss,
            )
        record["trade_outcome"] = outcome

        records.append(record)

    return pd.DataFrame(records)


# Define the dtype for our structured array
# One record per evaluated cluster; the field names match the columns of the
# DataFrame returned by evaluate_cluster_performance_df.
performance_dtype = np.dtype(
    [
        ("signal", np.int64),
        ("cluster_label", np.int64),
        ("calmar_ratio", np.float64),
        ("annualized_return", np.float64),
        ("max_drawdown", np.float64),
        ("actual_return", np.float64),
        ("num_trades", np.int64),
    ]
)


def evaluate_cluster_performance_df(
    price_data_df, train_best_clusters_df, clustering_model
):
    """
    Score the clusters selected in training on another set of windows.

    Args:
    price_data_df (pd.DataFrame): Rows with price_point_0..4, day_of_week,
        hour, minute and trade_outcome.
    train_best_clusters_df (pd.DataFrame): Needs cluster_label and signal.
    clustering_model (object): Fitted model exposing ``predict``.

    Returns:
    pd.DataFrame: One row of performance metrics per selected cluster, with
    the fields of ``performance_dtype``.
    """
    feature_columns = [
        "price_point_0",
        "price_point_1",
        "price_point_2",
        "price_point_3",
        "price_point_4",
        "day_of_week",
        "hour",
        "minute",
    ]
    # Features and outcome are pulled out as one matrix; the outcome is the last column.
    matrix = price_data_df[feature_columns + ["trade_outcome"]].values
    outcomes = matrix[:, -1]
    predicted_labels = clustering_model.predict(matrix[:, :-1])

    selected_clusters = train_best_clusters_df[["cluster_label", "signal"]].values
    cluster_performance = np.zeros(len(selected_clusters), dtype=performance_dtype)

    for slot, (cluster_label, signal) in enumerate(selected_clusters):
        cluster_outcomes = outcomes[predicted_labels == cluster_label]
        cumulative_return = np.cumsum(cluster_outcomes)
        if signal == 0:
            # Cluster was net-losing in training, so trade against it.
            cumulative_return = -cumulative_return

        metrics = calculate_evaluation_metrics(cumulative_return, cluster_outcomes)

        entry = cluster_performance[slot]  # structured scalar, writes through
        entry["signal"] = signal
        entry["cluster_label"] = cluster_label
        (
            entry["calmar_ratio"],
            entry["annualized_return"],
            entry["max_drawdown"],
            entry["actual_return"],
            entry["num_trades"],
        ) = metrics

    return pd.DataFrame(cluster_performance)


@jit(nopython=True)
def calculate_max_drawdown(portfolio_values):
    """
    Largest fractional decline of a portfolio from a previous peak.

    Args:
    portfolio_values (np.array): Portfolio values in chronological order.

    Returns:
    float: max over time of (running peak - value) / running peak, as a
    fraction; 0.0 if the portfolio never falls below a previous peak.
    """
    running_peak = portfolio_values[0]
    worst_drawdown = 0.0

    for position in range(1, len(portfolio_values)):
        current_value = portfolio_values[position]
        if current_value > running_peak:
            running_peak = current_value
        current_drawdown = (running_peak - current_value) / running_peak
        if current_drawdown > worst_drawdown:
            worst_drawdown = current_drawdown

    return worst_drawdown


@jit(nopython=True)
def calculate_trading_metrics(trade_outcomes):
    """
    Derive a trade direction and performance metrics from trade outcomes.

    Args:
    trade_outcomes (np.array): Per-trade outcomes in chronological order.

    Returns:
    tuple: (signal, calmar_ratio, annualized_return, max_drawdown,
    actual_return, num_trades). ``signal`` is 1 when the outcomes sum to a
    positive value; otherwise it is 0 and the metrics describe the inverted
    equity curve. All metrics are zero for empty input.
    """
    num_trades = len(trade_outcomes)
    if num_trades == 0:
        return 0, 0.0, 0.0, 0.0, 0.0, 0

    running_total = np.cumsum(trade_outcomes)
    if running_total[-1] > 0:
        signal = 1
    else:
        # Net-losing (or flat) cluster: evaluate it traded in reverse.
        signal = 0
        running_total = -running_total

    # Equity curve: initial capital followed by capital plus running total.
    portfolio_values = np.zeros(len(running_total) + 1)
    portfolio_values[0] = INITIAL_CAPITAL
    portfolio_values[1:] = running_total + INITIAL_CAPITAL

    start_value = portfolio_values[0]
    end_value = portfolio_values[-1]
    # Total return over the window; no annualisation is applied.
    annualized_return = (end_value / start_value) - 1
    max_drawdown = calculate_max_drawdown(portfolio_values)
    # Small offset keeps the ratio finite when there is no drawdown.
    calmar_ratio = annualized_return / (max_drawdown + 1e-6)
    actual_return = end_value - start_value
    return (
        signal,
        calmar_ratio,
        annualized_return,
        max_drawdown,
        actual_return,
        num_trades,
    )


@jit(nopython=True)
def calculate_evaluation_metrics(cumulative_return, trade_outcomes):
    """
    Performance metrics for an already-oriented cumulative return curve.

    Args:
    cumulative_return (np.array): Running total of the trade outcomes, with
        the sign already adjusted for the trade direction.
    trade_outcomes (np.array): Per-trade outcomes; only its length is used.

    Returns:
    tuple: (calmar_ratio, annualized_return, max_drawdown, actual_return,
    num_trades). All zero when ``trade_outcomes`` is empty.
    """
    num_trades = len(trade_outcomes)
    if num_trades == 0:
        return 0.0, 0.0, 0.0, 0.0, 0

    # Equity curve: initial capital followed by capital plus running total.
    equity = np.zeros(len(cumulative_return) + 1)
    equity[0] = INITIAL_CAPITAL
    equity[1:] = cumulative_return + INITIAL_CAPITAL

    first_value = equity[0]
    last_value = equity[-1]
    # Total return over the window; no annualisation is applied.
    annualized_return = (last_value / first_value) - 1
    max_drawdown = calculate_max_drawdown(equity)
    # Small offset keeps the ratio finite when there is no drawdown.
    calmar_ratio = annualized_return / (max_drawdown + 1e-6)
    actual_return = last_value - first_value

    return (
        calmar_ratio,
        annualized_return,
        max_drawdown,
        actual_return,
        num_trades,
    )


def cluster_and_evaluate_price_data(price_data_df):
    """
    Fit the configured clustering model and evaluate its strongest clusters.

    The model ``clustering_models[CLUSTERING_ALGORITHM]`` is fitted on the
    price-point and time features, each row receives a ``cluster_label``
    (written into ``price_data_df``), and the MAX_CLUSTER_LABELS clusters with
    the largest absolute sum of ``trade_outcome`` are scored with
    ``calculate_trading_metrics``.

    Args:
    price_data_df (pd.DataFrame): Rows with price_point_0..4, day_of_week,
        hour, minute and trade_outcome.

    Returns:
    tuple: (DataFrame with one metrics row per kept cluster, fitted model).
    """
    feature_columns = [
        "price_point_0",
        "price_point_1",
        "price_point_2",
        "price_point_3",
        "price_point_4",
        "day_of_week",
        "hour",
        "minute",
    ]
    feature_matrix = price_data_df[feature_columns].values
    clustering_model = clustering_models[CLUSTERING_ALGORITHM]
    clustering_model.fit(feature_matrix)
    price_data_df["cluster_label"] = clustering_model.predict(feature_matrix)

    # A strongly negative cluster is as useful as a strongly positive one,
    # so clusters are ranked by the magnitude of their net outcome.
    net_outcome_by_cluster = price_data_df.groupby("cluster_label")[
        "trade_outcome"
    ].sum()
    top_clusters_df = (
        net_outcome_by_cluster.abs().nlargest(MAX_CLUSTER_LABELS).reset_index()
    )

    best_clusters_list = []
    for cluster_label in top_clusters_df["cluster_label"]:
        in_cluster = price_data_df["cluster_label"] == cluster_label
        cluster_outcomes = price_data_df.loc[in_cluster, "trade_outcome"].values
        (
            signal,
            calmar_ratio,
            annualized_return,
            max_drawdown,
            actual_return,
            num_trades,
        ) = calculate_trading_metrics(cluster_outcomes)
        best_clusters_list.append(
            {
                "signal": signal,
                "cluster_label": cluster_label,
                "calmar_ratio": calmar_ratio,
                "annualized_return": annualized_return,
                "max_drawdown": max_drawdown,
                "actual_return": actual_return,
                "num_trades": num_trades,
            }
        )

    return pd.DataFrame(best_clusters_list), clustering_model


def prepare_training_data(price_subset):
    """
    Build one feature/outcome row per rolling window of the training period.

    Args:
    price_subset (pd.DataFrame): Training candles with close, high, low,
        atr_clipped and the calendar columns.

    Returns:
    pd.DataFrame: price_point_* features, calendar columns and trade_outcome
    (0 when the trade is not resolved inside ``price_subset``).
    """
    calendar_columns = ["year", "month", "day_of_week", "hour", "minute"]
    window_scaler = StandardScaler()
    records = []

    for window_end in range(PRICE_HISTORY_LENGTH, len(price_subset)):
        window_start = max(0, window_end - PRICE_HISTORY_LENGTH)
        closes = price_subset["close"].iloc[window_start:window_end].values
        if len(closes) < PRICE_HISTORY_LENGTH:
            break

        _, pip_prices = find_perceptually_important_points(
            closes, NUM_PERCEPTUALLY_IMPORTANT_POINTS
        )
        # Standardise per window so only the shape of the move is kept.
        normalized = window_scaler.fit_transform(pip_prices.reshape(-1, 1)).flatten()

        record = {
            f"price_point_{k}": normalized[k]
            for k in range(NUM_PERCEPTUALLY_IMPORTANT_POINTS)
        }
        # The decision candle is the last one inside the window.
        decision_pos = window_end - 1
        record.update(price_subset.iloc[decision_pos][calendar_columns].to_dict())

        entry_price = price_subset["close"].iloc[decision_pos]
        entry_atr = price_subset["atr_clipped"].iloc[decision_pos]
        take_profit = entry_price + (ATR_MULTIPLIER * entry_atr)
        stop_loss = entry_price - (ATR_MULTIPLIER * entry_atr)

        future_highs = price_subset["high"].iloc[window_end:].values
        future_lows = price_subset["low"].iloc[window_end:].values

        if len(future_highs) > 0:
            record["trade_outcome"] = determine_trade_outcome(
                future_highs, future_lows, take_profit, stop_loss
            )
        else:
            record["trade_outcome"] = 0

        records.append(record)

    return pd.DataFrame(records)



# ---- Load the pre-fitted time scaler and the raw candles ----
time_scaler = joblib.load("/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/ts_scaler_2018.joblib")
price_data = pd.read_csv(
    "/projects/genomic-ml/da2343/ml_project_2/data/gen_oanda_data/GBP_USD_M15_raw_data.csv",
    parse_dates=["time"],
    index_col="time",
)

# Calendar features taken from the timestamp index.
price_data["year"] = price_data.index.year
price_data["month"] = price_data.index.month
price_data["day_of_week"] = price_data.index.dayofweek
price_data["hour"] = price_data.index.hour
price_data["minute"] = price_data.index.minute
# ATR with timeperiod=1, computed on the full history before the date filter below.
price_data["atr"] = talib.ATR(
    price_data["high"].values,
    price_data["low"].values,
    price_data["close"].values,
    timeperiod=1,
)
# Clamp the ATR into a fixed band.
# NOTE(review): the origin of the bounds 0.00068 / 0.00176 is not documented in this file.
price_data["atr_clipped"] = np.clip(price_data["atr"], 0.00068, 0.00176)

# Filter date range and apply time scaling
# The scaled values overwrite the integer calendar columns.
price_data = price_data.loc["2019-01-01":"2019-10-01"]
time_columns = ["day_of_week", "hour", "minute"]
price_data[time_columns] = np.round(
    time_scaler.transform(price_data[time_columns]), 6
)
price_data[["atr", "atr_clipped"]] = price_data[["atr", "atr_clipped"]].round(6)

# Initialize the sliding window splitter for backtesting
# Train on TRAIN_PERIOD candles, test on the next TEST_PERIOD candles,
# then slide forward by one test period.
window_splitter = SlidingWindowSplitter(
    window_length=TRAIN_PERIOD,
    fh=np.arange(1, TEST_PERIOD + 1),
    step_length=TEST_PERIOD,
)

backtest_results = []
for window, (train_indices, test_indices) in enumerate(
    window_splitter.split(price_data)
):
    print(f"Processing window {window}...")
    train_data = price_data.iloc[train_indices, :]
    test_data = price_data.iloc[test_indices, :]
    # Position (in price_data) of the last test candle; prepare_test_data uses it
    # as the start of its fallback search for unresolved trades.
    last_test_index = test_indices[-1]

    # Prepare training data and perform clustering
    print("Preparing training data and clustering...")
    train_price_data = prepare_training_data(train_data)
    train_best_clusters, clustering_model = cluster_and_evaluate_price_data(
        train_price_data
    )
    # Skip the window when training produced no usable cluster.
    if train_best_clusters.empty:
        continue

    # Prepare test data and evaluate cluster performance
    print("Preparing test data and evaluating cluster performance...")
    test_price_data = prepare_test_data(test_data, price_data, last_test_index)
    test_cluster_performance = evaluate_cluster_performance_df(
        test_price_data, train_best_clusters, clustering_model
    )
    if test_cluster_performance.empty:
        continue

    # Compile results for this window
    # Metrics are summed over the selected clusters.
    print("Compiling results...")
    window_result = {
        "window": window,
        "train_total_annualized_return": train_best_clusters[
            "annualized_return"
        ].sum(),
        "train_total_actual_return": train_best_clusters["actual_return"].sum(),
        "train_total_trades": train_best_clusters["num_trades"].sum(),
        "test_total_annualized_return": test_cluster_performance[
            "annualized_return"
        ].sum(),
        "test_total_actual_return": test_cluster_performance["actual_return"].sum(),
        "test_total_trades": test_cluster_performance["num_trades"].sum(),
    }
    backtest_results.append(window_result)

# Compile final results
# NOTE(review): if every window was skipped, results_df has no columns and the
# lookups below raise KeyError.
results_df = pd.DataFrame(backtest_results)
results_df["train_cumulative_annualized_return"] = results_df[
    "train_total_annualized_return"
].cumsum()
results_df["train_cumulative_actual_return"] = results_df[
    "train_total_actual_return"
].cumsum()
# Each Sharpe ratio is a single scalar over all windows, broadcast to every row.
results_df["train_sharpe_ratio"] = calculate_sharpe_ratio(
    results_df["train_total_annualized_return"].values
)

results_df["test_cumulative_annualized_return"] = results_df[
    "test_total_annualized_return"
].cumsum()
results_df["test_cumulative_actual_return"] = results_df[
    "test_total_actual_return"
].cumsum()
results_df["test_sharpe_ratio"] = calculate_sharpe_ratio(
    results_df["test_total_annualized_return"].values
)
# Sharpe ratio of the sign-flipped test returns (every signal traded in reverse).
results_df["test_inverse_sharpe_ratio"] = calculate_sharpe_ratio(
    -1 * results_df["test_total_annualized_return"].values
)

# Add constant parameters to the results
results_df["max_cluster_labels"] = MAX_CLUSTER_LABELS
results_df["num_clusters"] = NUM_CLUSTERS
results_df["clustering_algorithm"] = CLUSTERING_ALGORITHM
results_df["train_period"] = TRAIN_PERIOD
results_df["test_period"] = TEST_PERIOD
results_df["random_seed"] = RANDOM_SEED

print("Backtesting completed.")


In [None]:
# Scratch arithmetic: total of the three outcome counts (55, 871, 10) noted in the next cell.
55+871+10

In [None]:
# Hand-recorded counts per outcome value -- presumably the trade_outcome
# distribution of test_price_data from an earlier run; TODO confirm.
# 1 -> 55
# -1 -> 871
# 0 -> 10
# Displays the test frame of the LAST window processed by the back-test loop.
test_price_data

In [None]:
# Hand-recorded counts per outcome value -- presumably the trade_outcome
# distribution of train_price_data from an earlier run; TODO confirm.
# 1 -> 1572
# -1 -> 2835
# 0 -> 369
# Displays the training frame of the LAST window processed by the back-test loop.
train_price_data

In [None]:
# NOTE(review): `first_results_df` is not defined anywhere in this notebook;
# on a fresh kernel this line raises NameError.
first_results_df['test_cumulative_actual_return'].plot()

In [None]:
 # NOTE(review): `second_results_df` is not defined anywhere in this notebook;
 # on a fresh kernel this line raises NameError.
 second_results_df