In [203]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

In [209]:
def filter_csv_pandas(input_file, instrument_column, target_instrument,
                      algorithms=('kmeans', 'birch'), chunksize=10000):
    """Stream a results CSV and keep only the rows for one instrument.

    The file is read in chunks so it never has to be held in memory in
    full. A row is kept when its ``instrument_column`` value equals
    ``target_instrument`` and its ``clustering_algorithm`` value is one of
    ``algorithms``.

    Parameters
    ----------
    input_file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    instrument_column : str
        Name of the column holding the instrument identifier.
    target_instrument : object
        Value of ``instrument_column`` to keep.
    algorithms : str or iterable of str, optional
        Accepted ``clustering_algorithm`` values. Defaults to
        ``('kmeans', 'birch')``.
    chunksize : int, optional
        Number of CSV rows read per chunk. Defaults to 10000.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index. If nothing matches, or
        if ``instrument_column`` / ``clustering_algorithm`` is not a column
        of the file, the frame is empty but still carries the CSV's column
        names so that downstream column lookups do not raise ``KeyError``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(algorithms, str):
        allowed_algorithms = [algorithms]
    else:
        allowed_algorithms = list(algorithms)

    required_columns = (instrument_column, 'clustering_algorithm')
    filtered_chunks = []
    csv_columns = None  # header of the file, remembered for the empty result

    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        if csv_columns is None:
            csv_columns = chunk.columns

        # Without both columns no row can be matched; skip instead of raising.
        if any(column not in chunk.columns for column in required_columns):
            continue

        keep = (
            (chunk[instrument_column] == target_instrument)
            & chunk['clustering_algorithm'].isin(allowed_algorithms)
        )
        if keep.any():
            filtered_chunks.append(chunk[keep])

    if filtered_chunks:
        return pd.concat(filtered_chunks, ignore_index=True)

    # Nothing matched: keep the schema so callers can still select columns.
    return pd.DataFrame(columns=csv_columns)



# Timestamp tag of the results file to analyse. Tags that were selected here
# before and then overridden: "2025-01-18_09:53", "2025-01-19_01:28".
time = "2025-01-19_08:02"

# Load the AMZN_M15 rows of that run (filter_csv_pandas keeps only the
# kmeans / birch rows for the requested instrument).
raw_df = filter_csv_pandas(
    input_file=f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv",
    instrument_column="instrument",
    target_instrument="AMZN_M15",
)

# Discard the rows whose test direction is 'short'.
raw_df = raw_df[raw_df['test_direction'].ne('short')]
raw_df

Unnamed: 0,window,train_actual_return,train_num_trades,train_direction,test_actual_return,test_num_trades,test_direction,train_average_return,train_sharpe_ratio,train_profit_factor,...,total_windows,reverse_test,num_clusters,clustering_algorithm,train_period,test_period,random_seed,instrument,num_perceptually_important_points,price_history_length
4,5,0.364975,90,long,0.001535,1,long,0.327123,1.330833,1000.0,...,299,False,7,kmeans,5,1,924,AMZN_M15,4,24
5,6,0.578906,91,long,-0.002666,2,long,0.327123,1.330833,1000.0,...,299,False,7,kmeans,5,1,924,AMZN_M15,4,24
6,7,0.402045,92,long,0.000000,0,long,0.327123,1.330833,1000.0,...,299,False,7,kmeans,5,1,924,AMZN_M15,4,24
7,8,0.458080,105,long,0.041051,2,long,0.327123,1.330833,1000.0,...,299,False,7,kmeans,5,1,924,AMZN_M15,4,24
8,9,0.258751,66,long,0.020255,1,long,0.327123,1.330833,1000.0,...,299,False,7,kmeans,5,1,924,AMZN_M15,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48655,275,0.124272,96,long,0.000000,0,long,0.487964,1.321813,1000.0,...,292,False,7,birch,11,1,42,AMZN_M15,4,24
48658,278,0.492570,244,long,-0.020228,1,long,0.487964,1.321813,1000.0,...,292,False,7,birch,11,1,42,AMZN_M15,4,24
48660,280,0.608490,221,long,0.000000,0,long,0.487964,1.321813,1000.0,...,292,False,7,birch,11,1,42,AMZN_M15,4,24
48663,283,0.432028,139,long,0.000000,0,long,0.487964,1.321813,1000.0,...,292,False,7,birch,11,1,42,AMZN_M15,4,24


In [210]:
def find_best_combinations(raw_df, n_top_combo=120):
    # Group by strategy parameters
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period', 
                  'test_period', 'reverse_test', 'random_seed']
    
    # Calculate mean performance metrics for each parameter combination
    qualified_params = raw_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()
    
    # Calculate stability metrics across seeds for each parameter combination (excluding seed)
    stability_group = group_cols.copy()
    stability_group.remove('random_seed')
    
    stability_metrics = qualified_params.groupby(stability_group).agg({
        'test_profit_factor': 'std',
        'test_sharpe_ratio': 'std',
        'test_win_ratio': 'std',
        'test_avg_trades_per_window': 'std'
    }).reset_index()
    
    # Rename stability columns
    stability_metrics.columns = stability_group + ['pf_std', 'sharpe_std', 'win_std', 'trades_std']
    
    # Merge stability metrics back
    qualified_params = qualified_params.merge(
        stability_metrics,
        on=stability_group
    )
    
    # Calculate stability score (lower std = higher score)
    qualified_params['stability_score'] = (
        (1 / (1 + qualified_params['pf_std'])) +
        (1 / (1 + qualified_params['sharpe_std'])) +
        (1 / (1 + qualified_params['win_std'])) +
        (1 / (1 + qualified_params['trades_std']))
    ) / 4
    
    # Normalize stability score
    qualified_params['stability_score'] = (qualified_params['stability_score'] - 
                                         qualified_params['stability_score'].min()) / (
                                         qualified_params['stability_score'].max() - 
                                         qualified_params['stability_score'].min())
                                         
    # if stability score is NaN, set it to 0
    qualified_params['stability_score'] = qualified_params['stability_score'].fillna(1)                                         
    
    # Calculate combined score with stability
    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.35 +
        qualified_params['test_sharpe_ratio'] * 0.35 +
        qualified_params['test_win_ratio'] * 0.2 +
        qualified_params['stability_score'] * 0.1
    )
    
    # Sort by combined score and get top combinations
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    actual_n_top = min(n_top_combo, len(best_params))
    top_combinations = best_params.head(actual_n_top)
    
    # Create display DataFrame with formatted columns
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Stability': top_combinations['stability_score'].round(3),
        'Score': top_combinations['combined_score'].round(3)
    })
    
    # Apply styling to the display DataFrame
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Stability': '{:.3f}',
            'Score': '{:.3f}'
        })\
        .background_gradient(subset=['Score', 'Stability'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Win%'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")
    
    return best_params, styled_df

# Score every parameter combination found in raw_df; the last expression
# renders the styled leaderboard as the cell output.
best_params, styled_df = find_best_combinations(raw_df=raw_df)
styled_df

Rank,Clusters,Algorithm,Train Period,Rev.Test,Seed,PF,Sharpe,Win%,Trades,Stability,Score
1,7,kmeans,7w,False,149,1.862,0.143,52.9%,0.5,0.774,0.885
2,7,kmeans,7w,False,924,1.819,0.14,53.0%,0.5,0.774,0.869
3,7,kmeans,7w,False,207,1.784,0.136,53.0%,0.6,0.774,0.856
4,9,kmeans,12w,False,924,1.927,0.152,52.5%,0.5,0.111,0.844
5,9,kmeans,8w,False,924,1.832,0.126,53.5%,0.5,0.332,0.826
6,9,kmeans,12w,False,207,1.837,0.138,52.3%,0.5,0.111,0.807
7,7,kmeans,8w,False,149,1.639,0.113,52.3%,0.5,0.708,0.788
8,7,kmeans,8w,False,924,1.607,0.11,52.7%,0.5,0.708,0.777
9,7,kmeans,8w,False,207,1.546,0.102,52.2%,0.5,0.708,0.752
10,9,kmeans,8w,False,207,1.627,0.101,53.7%,0.5,0.332,0.745


In [None]:
# Full ranking of all (combination, seed) rows. A bare last expression gives
# the notebook's rich, automatically truncated table instead of a plain-text dump.
best_params

In [None]:
# Instrument value of the first loaded row.
print(raw_df['instrument'].iloc[0])
# First row of best_params, i.e. the entry with the highest combined_score.
print(best_params.iloc[0])

In [None]:
 # Instrument identifiers available for analysis: 50 entries, each a ticker
 # followed by the "_M15" suffix.
 instrument = [
     "AAPL_M15", "MSFT_M15", "AMZN_M15", "GOOGL_M15", "NVDA_M15",
     "TSLA_M15", "META_M15", "AMD_M15", "NFLX_M15", "INTC_M15",
     "CSCO_M15", "PEP_M15", "COST_M15", "QCOM_M15", "TXN_M15",
     "SBUX_M15", "AMGN_M15", "AVGO_M15", "ADP_M15", "ISRG_M15",
     "MU_M15", "MDLZ_M15", "BKNG_M15", "GILD_M15", "FISV_M15",
     "ATVI_M15", "ADI_M15", "LRCX_M15", "KLAC_M15", "MAR_M15",
     "MCHP_M15", "EA_M15", "EXC_M15", "ILMN_M15", "IDXX_M15",
     "MNST_M15", "PAYX_M15", "LULU_M15", "ORLY_M15", "VRTX_M15",
     "REGN_M15", "ASML_M15", "CSX_M15", "SNPS_M15", "CDNS_M15",
     "DXCM_M15", "KDP_M15", "MTD_M15", "SWKS_M15", "WDAY_M15",
 ]