In [10]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

In [20]:
def filter_csv_pandas(input_file, instrument_column, target_instrument):
    """Return the rows of a CSV whose ``instrument_column`` equals ``target_instrument``.

    The file is streamed in chunks of 10,000 rows, so only the matching rows
    are accumulated in memory rather than the whole file.

    Parameters
    ----------
    input_file : str, path or file-like
        Anything ``pd.read_csv`` accepts as its first argument.
    instrument_column : str
        Name of the column to compare against ``target_instrument``.
    target_instrument : scalar
        Value a row must hold in ``instrument_column`` to be kept.

    Returns
    -------
    pd.DataFrame
        The matching rows with a fresh ``0..n-1`` index. If no row matches,
        or ``instrument_column`` is not in the file, the result is an empty
        frame that still carries the file's columns, so downstream
        column lookups keep working.

    Warns
    -----
    UserWarning
        If ``instrument_column`` is not a column of the file.
    """
    import warnings

    matching_chunks = []
    # Zero-row slice of the first chunk: remembers the file's columns so an
    # empty result is not a column-less DataFrame.
    empty_template = None

    # The context manager closes the file even when we return early.
    with pd.read_csv(input_file, chunksize=10000) as reader:
        for chunk in reader:
            if empty_template is None:
                empty_template = chunk.iloc[0:0].copy()
                # Every chunk of one CSV has the same columns, so a single
                # check on the first chunk is enough; no need to read on.
                if instrument_column not in chunk.columns:
                    warnings.warn(
                        f"Column {instrument_column!r} not found in {input_file!r}; "
                        "returning an empty DataFrame.",
                        stacklevel=2,
                    )
                    return empty_template

            matches = chunk[chunk[instrument_column] == target_instrument]
            if not matches.empty:
                matching_chunks.append(matches)

    if matching_chunks:
        return pd.concat(matching_chunks, ignore_index=True)
    if empty_template is not None:
        return empty_template
    return pd.DataFrame()  # Nothing at all could be read from the file


# Timestamp tag of the results run to analyse. Earlier runs are kept as
# comments so they can be switched back in by hand.
# time = "2024-12-12_15:48"
# time = "2024-12-14_04:20"
time = "2024-12-26_13:35"

# Load only the rows of that run's results file whose `instrument` column
# equals EUR_GBP_M15.
raw_df = filter_csv_pandas(
    f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv",
    "instrument",
    "EUR_GBP_M15",
)
raw_df

Unnamed: 0,window,train_actual_return,train_num_trades,train_direction,test_actual_return,test_num_trades,test_direction,train_average_return,train_sharpe_ratio,train_profit_factor,...,total_windows,reverse_test,num_clusters,clustering_algorithm,train_period,test_period,random_seed,instrument,num_perceptually_important_points,price_history_length
0,1,0.245768,86,short,-0.041038,14,long,0.115888,1.352896,1000.0,...,201,True,8,kmeans,4,1,407,EUR_GBP_M15,4,24
1,2,0.225004,101,short,0.061537,28,long,0.115888,1.352896,1000.0,...,201,True,8,kmeans,4,1,407,EUR_GBP_M15,4,24
2,3,0.131730,103,short,-0.099107,36,long,0.115888,1.352896,1000.0,...,201,True,8,kmeans,4,1,407,EUR_GBP_M15,4,24
3,4,0.089028,70,short,0.043410,27,long,0.115888,1.352896,1000.0,...,201,True,8,kmeans,4,1,407,EUR_GBP_M15,4,24
4,5,0.122904,123,short,0.003507,19,long,0.115888,1.352896,1000.0,...,201,True,8,kmeans,4,1,407,EUR_GBP_M15,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052283,197,0.051717,14,long,0.000854,1,short,0.034000,1.913603,1000.0,...,201,True,50,kmeans,6,1,868,EUR_GBP_M15,4,24
1052284,198,0.060997,16,long,0.000000,0,short,0.034000,1.913603,1000.0,...,201,True,50,kmeans,6,1,868,EUR_GBP_M15,4,24
1052285,199,0.014215,10,short,-0.000174,1,long,0.034000,1.913603,1000.0,...,201,True,50,kmeans,6,1,868,EUR_GBP_M15,4,24
1052286,200,0.050438,28,short,0.000425,1,long,0.034000,1.913603,1000.0,...,201,True,50,kmeans,6,1,868,EUR_GBP_M15,4,24


In [21]:
def find_best_combinations(raw_df, n_top_combo = 20, min_profit_factor=1.7, min_sharpe=0.1, min_trades=5):
    """Rank parameter combinations by their mean test-set performance.

    Rows of ``raw_df`` are grouped by ``num_clusters``, ``clustering_algorithm``,
    ``train_period``, ``test_period``, ``reverse_test`` and ``random_seed``; the
    test metrics are averaged within each group; groups below the thresholds
    are dropped; the rest are ordered by a combined score::

        0.4 * mean(test_profit_factor)
      + 0.4 * mean(test_sharpe_ratio)
      + 0.2 * mean(test_avg_trades_per_window) / (largest such mean among
                                                  the qualifying groups)

    Only the trade term is rescaled; profit factor and Sharpe ratio enter the
    score with their raw magnitudes.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Must contain the six grouping columns above plus
        ``test_profit_factor``, ``test_sharpe_ratio``, ``test_win_ratio``,
        ``test_num_trades`` and ``test_avg_trades_per_window``.
    n_top_combo : int, default 20
        Maximum number of rows shown in the styled table.
    min_profit_factor : float, default 1.7
        Minimum mean ``test_profit_factor`` for a group to qualify.
    min_sharpe : float, default 0.1
        Minimum mean ``test_sharpe_ratio`` for a group to qualify.
    min_trades : float, default 5
        Minimum mean ``test_avg_trades_per_window`` for a group to qualify.

    Returns
    -------
    tuple
        ``(best_params, styled_df)``. ``best_params`` holds *all* qualifying
        groups sorted by ``combined_score`` (descending), not just the top
        ``n_top_combo``. ``styled_df`` is a pandas ``Styler`` over the top
        ``n_top_combo`` rows. If no group qualifies, a message is printed and
        ``(None, None)`` is returned.
    """
    # Columns that together identify one strategy parameter combination
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period',
                  'test_period', 'reverse_test', 'random_seed']
    metric_cols = ['test_profit_factor', 'test_sharpe_ratio', 'test_win_ratio',
                   'test_num_trades', 'test_avg_trades_per_window']

    # Mean of every test metric per parameter combination
    param_performance = (
        raw_df.groupby(group_cols)
        .agg({col: 'mean' for col in metric_cols})
        .reset_index()
    )

    # Keep only the combinations that clear all three thresholds
    meets_criteria = (
        (param_performance['test_profit_factor'] >= min_profit_factor) &
        (param_performance['test_sharpe_ratio'] >= min_sharpe) &
        (param_performance['test_avg_trades_per_window'] >= min_trades)
    )
    qualified_params = param_performance[meets_criteria].copy()

    if len(qualified_params) == 0:
        print("No combinations meet the minimum criteria. Consider adjusting thresholds.")
        return None, None

    # Trade activity relative to the busiest qualifying combination. When that
    # maximum is 0 (possible with min_trades <= 0) the ratio would be 0/0 and
    # turn every score into NaN, so the term is dropped instead.
    avg_trades = qualified_params['test_avg_trades_per_window']
    max_avg_trades = avg_trades.max()
    trade_share = avg_trades / max_avg_trades if max_avg_trades > 0 else 0.0

    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.4 +
        qualified_params['test_sharpe_ratio'] * 0.4 +
        trade_share * 0.2
    )

    # Best score first; the display table is limited to n_top_combo rows
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    top_combinations = best_params.head(min(n_top_combo, len(best_params)))
    actual_n_top = len(top_combinations)

    # Human-readable table with short column names
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Score': top_combinations['combined_score'].round(3)
    })

    # background_gradient works column by column (axis=0), so a single call
    # over all metric columns shades each one on its own scale.
    gradient_cols = ['Score', 'PF', 'Sharpe', 'Win%', 'Trades']
    styled_df = (
        display_df.style
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Score': '{:.3f}'
        })
        .background_gradient(subset=gradient_cols, cmap='YlOrRd')
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })
        .hide(axis="index")
    )

    return best_params, styled_df

# Example usage: rank the parameter combinations in raw_df with the default
# thresholds. Both results are None when no combination qualifies.
best_params, styled_df = find_best_combinations(raw_df)
styled_df  # Display the styled table

Rank,Clusters,Algorithm,Train Period,Rev.Test,Seed,PF,Sharpe,Win%,Trades,Score
1,8,gaussian_mixture,12w,True,936,1.964,0.227,57.5%,20.2,1.006
2,6,gaussian_mixture,12w,True,92,1.902,0.191,58.7%,25.7,1.002
3,5,gaussian_mixture,12w,True,92,1.785,0.185,58.5%,31.2,0.988
4,7,gaussian_mixture,10w,True,257,1.849,0.207,56.3%,22.2,0.965
5,7,gaussian_mixture,11w,True,257,1.832,0.197,53.9%,21.7,0.951
6,7,gaussian_mixture,11w,True,83,1.828,0.176,53.7%,22.1,0.944
7,7,gaussian_mixture,11w,True,315,1.814,0.18,56.8%,22.1,0.939
8,10,gaussian_mixture,11w,True,83,1.881,0.189,56.1%,16.3,0.932
9,8,gaussian_mixture,11w,True,936,1.826,0.19,56.3%,18.6,0.926
10,7,gaussian_mixture,12w,True,315,1.76,0.187,53.2%,22.9,0.926
