In [5]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def filter_csv_pandas(input_file, instrument_column, target_instrument,
                      algorithms=('kmeans', 'birch'), chunksize=10000):
    """Stream a results CSV and keep only the rows for one instrument.

    The file is read chunk by chunk so the whole CSV is never loaded at once;
    only the matching rows are accumulated. A row is kept when its
    ``instrument_column`` value equals ``target_instrument`` and its
    ``clustering_algorithm`` value is one of ``algorithms``.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (anything ``pd.read_csv`` accepts).
    instrument_column : str
        Name of the column compared against ``target_instrument``.
    target_instrument : object
        Value to match in ``instrument_column``.
    algorithms : str or iterable of str, optional
        Accepted ``clustering_algorithm`` values. Defaults to
        ``('kmeans', 'birch')``, the selection that used to be hard-coded.
    chunksize : int, optional
        Number of rows per chunk handed to ``pd.read_csv``. Defaults to 10000.

    Returns
    -------
    pandas.DataFrame
        Matching rows, re-indexed from 0. When nothing matches, or when a
        required column is absent (a ``UserWarning`` is emitted in that case),
        an empty frame that still carries the file's column names.
    """
    import warnings  # local import keeps this cell independent of the import cell

    # A bare string would otherwise be split into single characters by list().
    allowed = [algorithms] if isinstance(algorithms, str) else list(algorithms)
    required = (instrument_column, 'clustering_algorithm')
    filtered_chunks = []
    columns = None

    # The context manager guarantees the file handle is closed on early return.
    with pd.read_csv(input_file, chunksize=chunksize) as reader:
        for chunk in reader:
            if columns is None:
                # Every chunk shares the same header, so validate it only once.
                columns = chunk.columns
                missing = [name for name in required if name not in columns]
                if missing:
                    warnings.warn(
                        f"{input_file}: missing column(s) {missing}; "
                        "returning an empty frame.",
                        stacklevel=2,
                    )
                    return pd.DataFrame(columns=columns)

            keep = (
                (chunk[instrument_column] == target_instrument)
                & chunk['clustering_algorithm'].isin(allowed)
            )
            if keep.any():
                filtered_chunks.append(chunk[keep])

    if filtered_chunks:
        return pd.concat(filtered_chunks, ignore_index=True)
    # Keep the schema so downstream column lookups do not fail on an empty result.
    return pd.DataFrame(columns=columns)


# Load the results CSV for one run timestamp, restricted to a single instrument.
# Alternative run timestamps kept for reference:
#   "2024-12-12_15:48"
#   "2024-12-14_04:20"
#   "2024-12-27_17:41"
time = "2024-12-26_13:35"

# The file name is derived from the run timestamp chosen above.
results_csv = f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv"

raw_df = filter_csv_pandas(results_csv, "instrument", "GBP_JPY_M15")
raw_df

Unnamed: 0,window,train_actual_return,train_num_trades,train_direction,test_actual_return,test_num_trades,test_direction,train_average_return,train_sharpe_ratio,train_profit_factor,...,total_windows,reverse_test,num_clusters,clustering_algorithm,train_period,test_period,random_seed,instrument,num_perceptually_important_points,price_history_length
0,1,0.181744,46,long,0.014362,6,short,0.10938,1.674426,1000.0,...,201,True,20,kmeans,6,1,762,GBP_JPY_M15,4,24
1,2,0.106633,49,long,-0.013604,5,short,0.10938,1.674426,1000.0,...,201,True,20,kmeans,6,1,762,GBP_JPY_M15,4,24
2,3,0.130271,49,long,0.006710,8,short,0.10938,1.674426,1000.0,...,201,True,20,kmeans,6,1,762,GBP_JPY_M15,4,24
3,4,0.111795,56,long,-0.185786,15,short,0.10938,1.674426,1000.0,...,201,True,20,kmeans,6,1,762,GBP_JPY_M15,4,24
4,5,0.295713,61,long,-0.002760,4,short,0.10938,1.674426,1000.0,...,201,True,20,kmeans,6,1,762,GBP_JPY_M15,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548677,197,0.028822,15,long,-0.001297,1,short,0.05175,1.377980,1000.0,...,201,True,40,kmeans,5,1,671,GBP_JPY_M15,4,24
548678,198,0.040093,24,long,-0.001299,1,short,0.05175,1.377980,1000.0,...,201,True,40,kmeans,5,1,671,GBP_JPY_M15,4,24
548679,199,0.051211,22,short,-0.010754,3,long,0.05175,1.377980,1000.0,...,201,True,40,kmeans,5,1,671,GBP_JPY_M15,4,24
548680,200,0.062734,16,short,-0.015303,4,long,0.05175,1.377980,1000.0,...,201,True,40,kmeans,5,1,671,GBP_JPY_M15,4,24


In [4]:
def find_best_combinations(raw_df, n_top_combo=20, min_profit_factor=1.4, min_sharpe=0.1, min_trades=5,
                           max_profit_factor=1.8, max_clusters=10):
    """Rank strategy parameter combinations by their mean test-window metrics.

    Rows are grouped by parameter combination, the test metrics are averaged
    per group, groups outside the thresholds are dropped, and the survivors
    are ordered by a combined score.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        One row per evaluated window. Must contain the grouping columns
        (``num_clusters``, ``clustering_algorithm``, ``train_period``,
        ``test_period``, ``reverse_test``, ``random_seed``) and the metric
        columns (``test_profit_factor``, ``test_sharpe_ratio``,
        ``test_win_ratio``, ``test_num_trades``,
        ``test_avg_trades_per_window``).
    n_top_combo : int, optional
        Maximum number of rows shown in the styled table.
    min_profit_factor : float, optional
        Inclusive lower bound on the mean ``test_profit_factor``.
    min_sharpe : float, optional
        Inclusive lower bound on the mean ``test_sharpe_ratio``.
    min_trades : float, optional
        Inclusive lower bound on the mean ``test_avg_trades_per_window``.
    max_profit_factor : float or None, optional
        Exclusive upper bound on the mean ``test_profit_factor``; ``None``
        disables it. Defaults to 1.8, the value that used to be hard-coded.
    max_clusters : int or None, optional
        Exclusive upper bound on ``num_clusters``; ``None`` disables it.
        Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(best_params, styled_df)``. ``best_params`` is a DataFrame holding
        *every* qualifying combination sorted by ``combined_score``
        (descending), not only the top ``n_top_combo``. ``styled_df`` is a
        pandas ``Styler`` for the top ``n_top_combo`` rows. Both are ``None``
        when the input is empty or no combination qualifies.
    """
    # An empty frame may not even carry the required columns; bail out early.
    if raw_df is None or raw_df.empty:
        print("No rows to evaluate; nothing to rank.")
        return None, None

    # Columns that together identify one strategy parameter combination
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period',
                  'test_period', 'reverse_test', 'random_seed']
    metric_cols = ['test_profit_factor', 'test_sharpe_ratio', 'test_win_ratio',
                   'test_num_trades', 'test_avg_trades_per_window']

    # Mean of each test metric per parameter combination
    param_performance = (
        raw_df.groupby(group_cols)
        .agg({col: 'mean' for col in metric_cols})
        .reset_index()
    )

    # Lower bounds always apply; upper bounds only when they are not None
    qualifies = (
        (param_performance['test_profit_factor'] >= min_profit_factor) &
        (param_performance['test_sharpe_ratio'] >= min_sharpe) &
        (param_performance['test_avg_trades_per_window'] >= min_trades)
    )
    if max_profit_factor is not None:
        qualifies &= param_performance['test_profit_factor'] < max_profit_factor
    if max_clusters is not None:
        qualifies &= param_performance['num_clusters'] < max_clusters

    qualified_params = param_performance[qualifies].copy()

    if len(qualified_params) == 0:
        print("No combinations meet the minimum criteria. Consider adjusting thresholds.")
        return None, None

    # Combined score: raw profit factor and raw Sharpe ratio weighted 0.4 each,
    # plus trade frequency scaled by the largest qualifying value, weighted 0.2.
    # Only the trade-frequency term is scaled to [0, 1].
    trades = qualified_params['test_avg_trades_per_window']
    max_trades = trades.max()
    # Guard the 0/0 case that can arise when min_trades <= 0.
    trade_component = trades / max_trades if max_trades > 0 else trades * 0.0
    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.4 +
        qualified_params['test_sharpe_ratio'] * 0.4 +
        trade_component * 0.2
    )

    # Sort by combined score; the table shows at most n_top_combo rows
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    actual_n_top = min(n_top_combo, len(best_params))
    top_combinations = best_params.head(actual_n_top)

    # Create display DataFrame with formatted columns
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Score': top_combinations['combined_score'].round(3)
    })

    # Apply number formats, one colour gradient per metric column, and hide the index
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Score': '{:.3f}'
        })\
        .background_gradient(subset=['Score'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Win%'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")

    return best_params, styled_df

# Rank the parameter combinations in raw_df using the default thresholds.
# best_params holds every qualifying combination sorted by combined_score;
# styled_df is the formatted table of the top rows. Both are None when no
# combination qualifies.
best_params, styled_df = find_best_combinations(raw_df)
styled_df  # Last expression of the cell, so the styled table is rendered

Rank,Clusters,Algorithm,Train Period,Rev.Test,Seed,PF,Sharpe,Win%,Trades,Score
1,6,birch,10w,True,42,1.543,0.136,50.3%,26.9,0.872
2,7,birch,10w,True,42,1.578,0.138,53.0%,21.9,0.849
3,8,birch,10w,True,42,1.584,0.142,50.5%,19.0,0.832
4,7,kmeans,10w,True,671,1.463,0.124,51.2%,25.3,0.823
5,7,kmeans,10w,True,780,1.447,0.121,52.0%,25.4,0.816
6,7,kmeans,10w,True,435,1.435,0.117,51.8%,25.5,0.811
7,7,kmeans,10w,True,311,1.427,0.116,51.5%,25.4,0.806
8,7,kmeans,10w,True,755,1.423,0.115,51.2%,25.2,0.803
9,7,kmeans,10w,True,762,1.416,0.115,51.8%,25.2,0.8
10,7,kmeans,10w,True,564,1.414,0.113,51.2%,25.3,0.799


In [43]:
# Plain-text dump of all qualifying combinations, including columns the styled
# table leaves out (e.g. test_period, test_num_trades). Prints "None" when
# find_best_combinations found nothing.
# NOTE(review): this cell's execution count is out of sequence and the recorded
# output does not start with the same rows as the ranking table, so it is
# presumably from an earlier run - re-run after Restart & Run All to confirm.
print(best_params)

     num_clusters clustering_algorithm  train_period  test_period  \
19              5                birch            14            1   
242             6               kmeans             4            1   
248             6               kmeans             4            1   
245             6               kmeans             4            1   
249             6               kmeans             4            1   
243             6               kmeans             4            1   
244             6               kmeans             4            1   
448             7                birch             8            1   
685             8               kmeans             4            1   
688             8               kmeans             4            1   
689             8               kmeans             4            1   

     reverse_test  random_seed  test_profit_factor  test_sharpe_ratio  \
19           True           42            1.447656           0.122521   
242         False        

In [85]:
# Spot-check which instrument is loaded and what the top-ranked combination is.
# NOTE(review): the output recorded under this cell (EUR_CAD_M15, gaussian_mixture)
# does not match the GBP_JPY_M15 / kmeans+birch load earlier in the notebook, so it
# presumably comes from an older kernel state - re-run top to bottom to refresh it.
if raw_df.empty:
    # .iloc[0] would raise IndexError on an empty frame.
    print("raw_df is empty; no instrument to report.")
else:
    print(raw_df.iloc[0]['instrument'])

# find_best_combinations returns (None, None) when nothing qualifies, and
# None has no .iloc, so guard before indexing.
if best_params is None:
    print("No qualifying parameter combinations.")
else:
    print(best_params.iloc[0])

EUR_CAD_M15
num_clusters                                 6
clustering_algorithm          gaussian_mixture
train_period                                 7
test_period                                  1
reverse_test                              True
random_seed                                435
test_profit_factor                    2.196387
test_sharpe_ratio                       0.2725
test_win_ratio                        0.568528
test_num_trades                      12.116751
test_avg_trades_per_window           12.116751
combined_score                         1.16189
Name: 511, dtype: object
