In [62]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

In [66]:
def filter_csv_pandas(input_file, instrument_column, target_instrument,
                      algorithms=('kmeans', 'birch'), chunksize=10000):
    """Stream a results CSV and keep only the rows for one instrument.

    The file is read chunk by chunk so it never has to be loaded in full;
    only the rows that pass the filter are retained.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (anything ``pandas.read_csv`` accepts).
    instrument_column : str
        Name of the column holding the instrument identifier.
    target_instrument : str
        Value of ``instrument_column`` to keep.
    algorithms : str or iterable of str, default ('kmeans', 'birch')
        Values of the ``clustering_algorithm`` column to keep. A single
        string is treated as a one-element list.
    chunksize : int, default 10000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index. If nothing matches, or
        if ``instrument_column`` / ``clustering_algorithm`` is not present in
        the file, the result is empty; it keeps the file's column names
        whenever at least one chunk was read.
    """
    # ``Series.isin`` rejects a bare string, so wrap it into a list.
    if isinstance(algorithms, str):
        algorithms = [algorithms]
    wanted_algorithms = list(algorithms)
    required_columns = (instrument_column, 'clustering_algorithm')

    matches = []
    empty_template = None  # zero-row frame carrying the file's columns

    # The context manager guarantees the underlying file handle is closed,
    # including on the early return below.
    with pd.read_csv(input_file, chunksize=chunksize) as reader:
        for chunk in reader:
            if empty_template is None:
                empty_template = chunk.iloc[0:0].copy()
                # Every chunk shares the same header, so a column missing
                # here is missing everywhere: stop instead of scanning on.
                if any(col not in chunk.columns for col in required_columns):
                    return empty_template

            keep = (
                (chunk[instrument_column] == target_instrument)
                & chunk['clustering_algorithm'].isin(wanted_algorithms)
            )
            selected = chunk[keep]
            if not selected.empty:
                matches.append(selected)

    if matches:
        return pd.concat(matches, ignore_index=True)
    if empty_template is not None:
        return empty_template
    return pd.DataFrame()


# Run identifier used as the filename prefix of the results CSV to load.
# Earlier runs, kept for reference:
#   "2024-12-12_15:48", "2024-12-14_04:20", "2024-12-26_13:35"
time = "2024-12-27_17:41"

results_csv = f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv"

# Load only the EUR_CAD_M15 rows of that file.
raw_df = filter_csv_pandas(
    input_file=results_csv,
    instrument_column="instrument",
    target_instrument="EUR_CAD_M15",
)
raw_df

Unnamed: 0,window,train_actual_return,train_num_trades,train_direction,test_actual_return,test_num_trades,test_direction,train_average_return,train_sharpe_ratio,train_profit_factor,...,total_windows,reverse_test,num_clusters,clustering_algorithm,train_period,test_period,random_seed,instrument,num_perceptually_important_points,price_history_length
0,1,0.023404,7,short,0.000000,0,long,0.01739,1.527671,1000.0,...,201,True,30,kmeans,4,1,311,EUR_CAD_M15,4,24
1,2,0.036419,11,short,0.000000,0,long,0.01739,1.527671,1000.0,...,201,True,30,kmeans,4,1,311,EUR_CAD_M15,4,24
2,3,0.027270,9,short,0.009847,4,long,0.01739,1.527671,1000.0,...,201,True,30,kmeans,4,1,311,EUR_CAD_M15,4,24
3,4,0.012482,8,short,0.000385,2,long,0.01739,1.527671,1000.0,...,201,True,30,kmeans,4,1,311,EUR_CAD_M15,4,24
4,5,0.015913,9,long,0.003142,4,short,0.01739,1.527671,1000.0,...,201,True,30,kmeans,4,1,311,EUR_CAD_M15,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563477,197,0.021668,15,long,0.000000,0,short,0.02746,2.357139,1000.0,...,201,True,25,kmeans,7,1,671,EUR_CAD_M15,4,24
563478,198,0.031182,20,long,-0.004582,1,short,0.02746,2.357139,1000.0,...,201,True,25,kmeans,7,1,671,EUR_CAD_M15,4,24
563479,199,0.033737,14,short,-0.003477,2,long,0.02746,2.357139,1000.0,...,201,True,25,kmeans,7,1,671,EUR_CAD_M15,4,24
563480,200,0.027666,18,long,0.025386,20,short,0.02746,2.357139,1000.0,...,201,True,25,kmeans,7,1,671,EUR_CAD_M15,4,24


In [67]:
def find_best_combinations(raw_df, n_top_combo=20, min_profit_factor=1.4, min_sharpe=0.1, min_trades=5):
    # Group by strategy parameters
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period', 
                  'test_period', 'reverse_test', 'random_seed']
    
    # Calculate mean performance metrics for each parameter combination
    param_performance = raw_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()
    
    # Filter combinations that meet minimum performance criteria
    qualified_params = param_performance[
        (param_performance['test_profit_factor'] >= min_profit_factor) &
        (param_performance['test_profit_factor'] < 1.8) &
        (param_performance['num_clusters'] < 10) &
        (param_performance['test_sharpe_ratio'] >= min_sharpe) &
        (param_performance['test_avg_trades_per_window'] >= min_trades) &
        (param_performance['test_win_ratio'] > 0.51)  # Win rate must be > 50%
    ].copy()
    
    if len(qualified_params) == 0:
        print("No combinations meet the minimum criteria. Consider adjusting thresholds.")
        return None, None
    
    # Calculate combined score with cluster and training period penalties
    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.35 +
        qualified_params['test_sharpe_ratio'] * 0.35 +
        (qualified_params['test_avg_trades_per_window'] / 
         qualified_params['test_avg_trades_per_window'].max()) * 0.2 +
        # Reward lower clusters (5 is best)
        (1 - (qualified_params['num_clusters'] - 5) / 5) * 0.05 +
        # Reward shorter training periods
        (1 - qualified_params['train_period'] / qualified_params['train_period'].max()) * 0.05
    )
    
    # Sort by combined score and get top 10
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    actual_n_top = min(n_top_combo, len(best_params))
    top_combinations = best_params.head(actual_n_top)
    
    # Create display DataFrame with formatted columns
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Score': top_combinations['combined_score'].round(3)
    })
    
    # Apply styling to the display DataFrame
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Score': '{:.3f}'
        })\
        .background_gradient(subset=['Score'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Win%'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")
    
    return best_params, styled_df

# Rank the parameter combinations found in raw_df using the default
# thresholds; the styled table is the cell's displayed output.
best_params, styled_df = find_best_combinations(raw_df=raw_df)
styled_df

Rank,Clusters,Algorithm,Train Period,Rev.Test,Seed,PF,Sharpe,Win%,Trades,Score
1,6,kmeans,5w,True,755,1.79,0.195,57.8%,13.2,0.93
2,6,kmeans,5w,True,564,1.783,0.194,57.8%,13.3,0.929
3,6,kmeans,5w,True,671,1.782,0.194,57.8%,13.1,0.926
4,6,kmeans,5w,True,795,1.773,0.191,58.2%,13.3,0.925
5,6,kmeans,5w,True,762,1.773,0.19,57.2%,13.2,0.923
6,6,kmeans,5w,True,716,1.765,0.19,57.8%,13.2,0.92
7,6,kmeans,5w,True,435,1.76,0.189,58.2%,13.0,0.916
8,6,kmeans,5w,True,780,1.731,0.184,57.2%,13.3,0.907
9,7,kmeans,12w,True,762,1.777,0.195,55.4%,11.3,0.867
10,7,kmeans,12w,True,435,1.708,0.182,55.7%,11.2,0.838


In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table instead of print()'s plain-text dump.
best_params

In [68]:
# Report the instrument that was analysed, followed by the top-ranked
# parameter combination (first row of best_params).
print(raw_df['instrument'].iloc[0], best_params.iloc[0], sep="\n")

EUR_CAD_M15
num_clusters                         6
clustering_algorithm            kmeans
train_period                         5
test_period                          1
reverse_test                      True
random_seed                        755
test_profit_factor            1.789563
test_sharpe_ratio             0.195156
test_win_ratio                  0.5775
test_num_trades                 13.195
test_avg_trades_per_window      13.195
combined_score                0.930333
Name: 275, dtype: object
