In [81]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

"EUR_USD_M15", "GBP_USD_M15", "USD_JPY_M15", "USD_CHF_M15", 
"USD_CAD_M15", "AUD_USD_M15", "AUD_JPY_M15", "AUD_CAD_M15", "EUR_GBP_M15", 
        
"EUR_JPY_M15", "GBP_CHF_M15", "GBP_JPY_M15", 
"EUR_CHF_M15", "AUD_NZD_M15", "CAD_JPY_M15", "NZD_USD_M15", 
"EUR_CAD_M15"

In [84]:
def filter_csv_pandas(input_file, instrument_column, target_instrument,
                      algorithms=('kmeans', 'birch'), chunksize=10000):
    """
    Stream a CSV in chunks and keep the rows for one instrument and a set of
    clustering algorithms.

    Parameters
    ----------
    input_file : str or path-like or file-like
        Anything accepted by ``pd.read_csv``.
    instrument_column : str
        Name of the column holding the instrument identifier.
    target_instrument : str
        Value of ``instrument_column`` to keep.
    algorithms : iterable of str, optional
        Values of the ``clustering_algorithm`` column to keep. Defaults to
        ``('kmeans', 'birch')``, the allow-list that used to be hard-coded.
    chunksize : int, optional
        Number of rows read per chunk (default 10000).

    Returns
    -------
    pd.DataFrame
        Matching rows with a fresh ``0..n-1`` index. When nothing matches (or
        a required column is absent) the frame is empty but keeps the file's
        columns, so later column lookups do not raise ``KeyError``. If the
        reader yields no chunk at all, a column-less empty frame is returned.
    """
    required_columns = (instrument_column, 'clustering_algorithm')
    wanted_algorithms = list(algorithms)

    matches = []
    empty_template = None  # zero-row slice of the first chunk (keeps columns/dtypes)

    # The context manager closes the underlying file even if we stop early.
    with pd.read_csv(input_file, chunksize=chunksize) as reader:
        for chunk in reader:
            if empty_template is None:
                empty_template = chunk.iloc[:0]

            # Every chunk of one CSV shares the same header, so if a required
            # column is missing here it is missing everywhere: stop reading.
            if any(col not in chunk.columns for col in required_columns):
                break

            keep = (
                (chunk[instrument_column] == target_instrument) &
                chunk['clustering_algorithm'].isin(wanted_algorithms)
            )
            selected = chunk[keep]
            if not selected.empty:
                matches.append(selected)

    if matches:
        return pd.concat(matches, ignore_index=True)
    if empty_template is not None:
        return empty_template.copy()
    return pd.DataFrame()


# Timestamp prefix that selects which results file is loaded.
# Other timestamps that previously appeared here (commented out or overwritten),
# kept for reference:
#   "2024-12-26_13:35", "2024-12-27_17:41", "2025-01-02_17:34"
# NOTE(review): the name `time` would shadow the stdlib `time` module if that is
# ever imported; it is kept because other cells may reference it.
time = "2025-01-02_17:35"

# Load only the rows for one instrument from the results CSV.
raw_df = filter_csv_pandas(
    input_file=f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv",
    instrument_column="instrument",
    target_instrument="NZD_USD_M15",
)
raw_df

Unnamed: 0,window,train_actual_return,train_num_trades,train_direction,test_actual_return,test_num_trades,test_direction,train_average_return,train_sharpe_ratio,train_profit_factor,...,total_windows,reverse_test,num_clusters,clustering_algorithm,train_period,test_period,random_seed,instrument,num_perceptually_important_points,price_history_length
0,1,0.047422,87,short,0.000000,0,long,0.260405,0.734480,1000.0,...,273,True,6,birch,12,1,42,NZD_USD_M15,4,24
1,2,0.038858,87,short,0.000000,0,long,0.260405,0.734480,1000.0,...,273,True,6,birch,12,1,42,NZD_USD_M15,4,24
2,3,0.040127,87,short,0.000000,0,long,0.260405,0.734480,1000.0,...,273,True,6,birch,12,1,42,NZD_USD_M15,4,24
3,4,1.005402,553,long,0.212028,37,short,0.260405,0.734480,1000.0,...,273,True,6,birch,12,1,42,NZD_USD_M15,4,24
4,5,1.292699,436,long,-0.122414,43,short,0.260405,0.734480,1000.0,...,273,True,6,birch,12,1,42,NZD_USD_M15,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343975,297,0.355149,131,short,0.024534,24,long,0.176613,1.115016,1000.0,...,301,True,7,kmeans,4,1,623,NZD_USD_M15,4,24
343976,298,0.200110,104,short,0.194410,54,long,0.176613,1.115016,1000.0,...,301,True,7,kmeans,4,1,623,NZD_USD_M15,4,24
343977,299,0.189842,93,long,0.020338,37,short,0.176613,1.115016,1000.0,...,301,True,7,kmeans,4,1,623,NZD_USD_M15,4,24
343978,300,0.015653,24,short,0.000000,0,long,0.176613,1.115016,1000.0,...,301,True,7,kmeans,4,1,623,NZD_USD_M15,4,24


In [None]:

def optimize_training_period(raw_df):
    """
    Analyze performance across different training periods with seed stability consideration
    """
    # Filter for fixed parameters except training period
    params_df = raw_df[
        (raw_df['num_clusters'] == 5) &
        (raw_df['clustering_algorithm'] == 'kmeans') &
        (raw_df['test_period'] == 1) &
        (raw_df['reverse_test'] == True)
    ].copy()
    
    # First calculate performance metrics for each combination
    group_cols = ['instrument', 'train_period', 'random_seed']
    param_performance = params_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()
    
    # Calculate stability metrics across seeds for each instrument and training period
    stability_metrics = param_performance.groupby(['instrument', 'train_period']).agg({
        'test_profit_factor': 'std',
        'test_sharpe_ratio': 'std',
        'test_avg_trades_per_window': 'std'
    }).reset_index()
    
    stability_metrics.columns = ['instrument', 'train_period', 'pf_std', 'sharpe_std', 'trades_std']
    
    # Merge stability metrics back
    param_performance = param_performance.merge(
        stability_metrics, 
        on=['instrument', 'train_period']
    )
    
    # Calculate stability score (lower std = higher score)
    param_performance['stability_score'] = (
        (1 / (1 + param_performance['pf_std'])) +
        (1 / (1 + param_performance['sharpe_std'])) +
        (1 / (1 + param_performance['trades_std']))
    ) / 3
    
    # Normalize stability score
    param_performance['stability_score'] = (param_performance['stability_score'] - 
                                          param_performance['stability_score'].min()) / (
                                          param_performance['stability_score'].max() - 
                                          param_performance['stability_score'].min())
    
    # Calculate combined score with reduced stability weight (10%)
    param_performance['combined_score'] = (
        param_performance['test_profit_factor'] * 0.35 +
        param_performance['test_sharpe_ratio'] * 0.35 +
        (param_performance['test_avg_trades_per_window'] / 
         param_performance['test_avg_trades_per_window'].max()) * 0.2 +
        param_performance['stability_score'] * 0.1  # Reduced to 10%
    )
    
    
    # Sort by instrument and combined score
    best_params = param_performance.sort_values(['instrument', 'combined_score'], ascending=[True, False])
    
    # Create display DataFrame
    display_df = pd.DataFrame({
        'Instrument': best_params['instrument'],
        'Train Period': best_params['train_period'].astype(str) + 'w',
        'Seed': best_params['random_seed'],
        'PF': best_params['test_profit_factor'].round(3),
        'PF σ': best_params['pf_std'].round(3),
        'Sharpe': best_params['test_sharpe_ratio'].round(3),
        'Trades': best_params['test_avg_trades_per_window'].round(1),
        'Trades σ': best_params['trades_std'].round(1),
        'Stability': best_params['stability_score'].round(3),
        'Score': best_params['combined_score'].round(3)
    })
    
    # Apply styling
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'PF σ': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Trades': '{:.1f}',
            'Trades σ': '{:.1f}',
            'Stability': '{:.3f}',
            'Score': '{:.3f}',
            'Win%': '{:.1%}'
        })\
        .background_gradient(subset=['Score', 'Stability'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")
    
    return best_params, styled_df

# Example usage:
# Announce the fixed parameters and the progress banner *before* running the
# optimisation, so the "Optimizing..." message is not printed after the work
# has already finished. The printed values mirror the filter that
# optimize_training_period applies when called with only `raw_df`.
print("Fixed Parameters:")
print("- Clusters: 5")
print("- Algorithm: kmeans")
print("- Test Period: 1 week")
print("- Reverse Test: True")
print("\nOptimizing Training Period with Stability...\n")

performance_df, styled_df = optimize_training_period(raw_df)
styled_df  # Display the styled table

In [85]:
def find_best_combinations(raw_df, n_top_combo=120, min_profit_factor=0.9, min_sharpe=0, min_trades=5):
    # Group by strategy parameters
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period', 
                  'test_period', 'reverse_test', 'random_seed']
    
    # Calculate mean performance metrics for each parameter combination
    param_performance = raw_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()
    
    # Filter combinations that meet minimum performance criteria
    qualified_params = param_performance[
        (param_performance['test_profit_factor'] >= min_profit_factor) &
        # (param_performance['test_profit_factor'] < 1.8) &
        (param_performance['num_clusters'] == 5) &
        (param_performance['test_sharpe_ratio'] >= min_sharpe) &
        (param_performance['test_avg_trades_per_window'] >= min_trades) 
        # (param_performance['test_win_ratio'] > 0.5)  # Win rate must be > 50%
    ].copy()
    
    if len(qualified_params) == 0:
        print("No combinations meet the minimum criteria. Consider adjusting thresholds.")
        return None, None
    
    # Calculate combined score with cluster and training period penalties
    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.35 +
        qualified_params['test_sharpe_ratio'] * 0.35 +
        (qualified_params['test_avg_trades_per_window'] / 
         qualified_params['test_avg_trades_per_window'].max()) * 0.2 +
        # Reward lower clusters (5 is best)
        (1 - (qualified_params['num_clusters'] - 5) / 5) * 0.05 +
        # Reward shorter training periods
        (1 - qualified_params['train_period'] / qualified_params['train_period'].max()) * 0.05
    )
    
    # Sort by combined score and get top 10
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    actual_n_top = min(n_top_combo, len(best_params))
    top_combinations = best_params.head(actual_n_top)
    
    # Create display DataFrame with formatted columns
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Score': top_combinations['combined_score'].round(3)
    })
    
    # Apply styling to the display DataFrame
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Score': '{:.3f}'
        })\
        .background_gradient(subset=['Score'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Win%'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")
    
    return best_params, styled_df

# Example usage:
# Rank parameter combinations for the loaded results using the default
# thresholds. `best_params` receives every qualifying combination sorted by
# combined_score; `styled_df` receives the formatted table of the top rows.
# Both are None when no combination meets the thresholds.
# NOTE(review): this rebinds `styled_df`, which the training-period cell also assigns.
best_params, styled_df = find_best_combinations(raw_df)
styled_df  # Display the styled table

Rank,Clusters,Algorithm,Train Period,Rev.Test,Seed,PF,Sharpe,Win%,Trades,Score
1,5,kmeans,4w,True,370,1.341,0.088,48.3%,30.6,0.781
2,5,kmeans,4w,True,210,1.34,0.087,48.3%,30.7,0.781
3,5,kmeans,4w,True,22,1.34,0.087,48.3%,30.6,0.78
4,5,kmeans,4w,True,623,1.338,0.087,48.3%,30.6,0.779
5,5,kmeans,4w,True,594,1.335,0.087,48.0%,30.6,0.778
6,5,kmeans,4w,True,886,1.334,0.086,48.0%,30.6,0.777
7,5,kmeans,4w,True,673,1.328,0.085,48.3%,30.6,0.775
8,5,kmeans,4w,True,822,1.326,0.084,48.3%,30.5,0.774
9,5,kmeans,4w,True,66,1.309,0.08,48.0%,30.6,0.767
10,5,kmeans,4w,True,725,1.307,0.08,48.3%,30.4,0.765


In [None]:
# Plain-text dump of everything find_best_combinations returned as `best_params`
# (prints "None" if no combination qualified).
print(best_params)

In [86]:
# Show which instrument the loaded rows belong to (taken from the first row of raw_df).
print(raw_df.iloc[0]['instrument'])
# Show the highest-scoring combination: best_params is sorted by descending
# combined_score, so row 0 is the top one.
# NOTE(review): raises if best_params is None (nothing qualified) or raw_df is empty.
print(best_params.iloc[0])

NZD_USD_M15
num_clusters                          5
clustering_algorithm             kmeans
train_period                          4
test_period                           1
reverse_test                       True
random_seed                         370
test_profit_factor             1.341276
test_sharpe_ratio              0.087563
test_win_ratio                 0.483108
test_num_trades               30.584459
test_avg_trades_per_window    30.584459
combined_score                 0.780953
Name: 35, dtype: object
