In [1]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
def filter_csv_pandas(input_file, instrument_column, target_instrument, chunksize=10000):
    """Load only the rows of a CSV whose ``instrument_column`` equals ``target_instrument``.

    The file is streamed in chunks so the full CSV never has to be held in
    memory at once; only the matching rows are kept.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (passed straight to ``pd.read_csv``).
    instrument_column : str
        Name of the column to filter on.
    target_instrument : object
        Value that ``instrument_column`` must equal for a row to be kept.
    chunksize : int, default 10000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a fresh 0..n-1 index. If nothing matches, or if
        ``instrument_column`` is not present in the file, an empty DataFrame
        is returned; it carries the file's columns whenever at least one
        chunk could be read, so downstream column access does not fail.
    """
    filtered_chunks = []
    schema = None  # zero-row slice of the first chunk, used for empty results

    # The context manager closes the underlying file handle even when we stop early.
    with pd.read_csv(input_file, chunksize=chunksize) as reader:
        for chunk in reader:
            if schema is None:
                schema = chunk.iloc[:0]
                # Every chunk shares the same header, so a missing column
                # only needs to be detected once; no point reading the rest.
                if instrument_column not in chunk.columns:
                    break

            matches = chunk[chunk[instrument_column] == target_instrument]
            if not matches.empty:
                filtered_chunks.append(matches)

    if filtered_chunks:
        return pd.concat(filtered_chunks, ignore_index=True)

    # No matching rows: keep the column layout if we saw any data at all.
    return schema if schema is not None else pd.DataFrame()


# Timestamp prefix of the results run to analyse (swap in the alternative below).
time = "2024-12-12_15:48"
# time = "2024-12-14_04:20"

# Stream that run's results file and keep only the EUR_USD_M15 rows.
raw_df = filter_csv_pandas(
    input_file=(
        "/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/"
        f"{time}_results.csv"
    ),
    instrument_column="instrument",
    target_instrument="EUR_USD_M15",
)

# Show the test profit factor column as the cell output.
raw_df["test_profit_factor"]

0         0.608912
1         0.608912
2         0.608912
3         0.608912
4         0.608912
            ...   
108249    1.012352
108250    1.012352
108251    1.012352
108252    1.012352
108253    1.012352
Name: test_profit_factor, Length: 108254, dtype: float64

In [6]:
def find_best_combinations(raw_df, min_profit_factor=1.2, min_sharpe=0.1, min_trades=10, top_n=5):
    """Rank parameter combinations by their mean test performance.

    Rows of ``raw_df`` are grouped by ``num_clusters``, ``clustering_algorithm``,
    ``train_period``, ``test_period`` and ``reverse_test``, and the test metrics
    are averaged within each group (i.e. over all rows sharing those
    parameters). Groups that clear all three thresholds are scored as

        0.4 * profit_factor + 0.4 * sharpe_ratio
        + 0.2 * (avg_trades_per_window / largest avg_trades_per_window)

    and sorted by that score, best first. A text summary of the best
    ``top_n`` groups is printed as a side effect.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the five grouping columns plus ``test_profit_factor``,
        ``test_sharpe_ratio``, ``test_win_ratio``, ``test_num_trades`` and
        ``test_avg_trades_per_window``. An ``instrument`` column, if present,
        is only used for the printed header.
    min_profit_factor : float, default 1.2
        Minimum mean ``test_profit_factor`` for a group to qualify.
    min_sharpe : float, default 0.1
        Minimum mean ``test_sharpe_ratio`` for a group to qualify.
    min_trades : float, default 10
        Minimum mean ``test_avg_trades_per_window`` for a group to qualify
        (note: applied to the per-window average, not ``test_num_trades``).
    top_n : int, default 5
        How many of the best groups to print. Does not affect the return value.

    Returns
    -------
    pandas.DataFrame or None
        All qualifying groups with their mean metrics and ``combined_score``,
        sorted by score descending; ``None`` if no group qualifies.
    """
    # Group by parameters; everything else (e.g. instrument) is averaged over.
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period',
                  'test_period', 'reverse_test']

    param_performance = raw_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()

    # Keep only groups that clear every minimum performance threshold.
    qualified_params = param_performance[
        (param_performance['test_profit_factor'] >= min_profit_factor) &
        (param_performance['test_sharpe_ratio'] >= min_sharpe) &
        (param_performance['test_avg_trades_per_window'] >= min_trades)
    ].copy()

    if qualified_params.empty:
        print("No combinations meet the minimum criteria. Consider adjusting thresholds.")
        return None

    # Trade frequency is normalised by the busiest qualifying group. When that
    # maximum is 0 (possible with min_trades <= 0) the ratio would be 0/0 = NaN
    # and poison every score, so the term is dropped instead.
    max_avg_trades = qualified_params['test_avg_trades_per_window'].max()
    if max_avg_trades > 0:
        trade_frequency = qualified_params['test_avg_trades_per_window'] / max_avg_trades
    else:
        trade_frequency = 0.0

    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.4 +
        qualified_params['test_sharpe_ratio'] * 0.4 +
        trade_frequency * 0.2
    )

    best_params = qualified_params.sort_values('combined_score', ascending=False)
    top_params = best_params.head(top_n)

    # Header: list every instrument that contributed to the averages, not just
    # the first one (identical output when the frame holds a single instrument).
    if 'instrument' in raw_df.columns:
        print(", ".join(map(str, raw_df['instrument'].unique())))
    # Report how many combinations are actually shown; fewer than top_n may qualify.
    print(f"Top {len(top_params)} Parameter Combinations:")
    print("-" * 50)
    for _, row in top_params.iterrows():
        print("\nCombination Details:")
        print(f"Clusters: {row['num_clusters']}")
        print(f"Algorithm: {row['clustering_algorithm']}")
        print(f"Train Period: {row['train_period']} weeks")
        print(f"Reverse Test: {row['reverse_test']}")

        print("Performance Metrics:")
        print(f"- Profit Factor: {row['test_profit_factor']:.3f}")
        print(f"- Sharpe Ratio: {row['test_sharpe_ratio']:.3f}")
        print(f"- Win Ratio: {row['test_win_ratio']:.3f}")
        print(f"- Avg Trades per Window: {row['test_avg_trades_per_window']:.1f}")
        print(f"- Combined Score: {row['combined_score']:.3f}")

    return best_params

# Rank the parameter combinations in raw_df using the default thresholds; the
# returned DataFrame (all qualifying combinations, best first) is the cell output.
find_best_combinations(raw_df)

EUR_USD_M15
Top 5 Parameter Combinations:
--------------------------------------------------

Combination Details:
Clusters: 6
Algorithm: kmeans
Train Period: 8 weeks
Reverse Test: False
Performance Metrics:
- Profit Factor: 1.679
- Sharpe Ratio: 0.177
- Win Ratio: 0.537
- Avg Trades per Window: 12.9
- Combined Score: 0.894

Combination Details:
Clusters: 5
Algorithm: kmeans
Train Period: 8 weeks
Reverse Test: False
Performance Metrics:
- Profit Factor: 1.593
- Sharpe Ratio: 0.158
- Win Ratio: 0.490
- Avg Trades per Window: 14.9
- Combined Score: 0.875

Combination Details:
Clusters: 4
Algorithm: birch
Train Period: 7 weeks
Reverse Test: True
Performance Metrics:
- Profit Factor: 1.411
- Sharpe Ratio: 0.114
- Win Ratio: 0.556
- Avg Trades per Window: 17.0
- Combined Score: 0.810

Combination Details:
Clusters: 7
Algorithm: kmeans
Train Period: 9 weeks
Reverse Test: False
Performance Metrics:
- Profit Factor: 1.418
- Sharpe Ratio: 0.119
- Win Ratio: 0.524
- Avg Trades per Window: 11.1
-

Unnamed: 0,num_clusters,clustering_algorithm,train_period,test_period,reverse_test,test_profit_factor,test_sharpe_ratio,test_win_ratio,test_num_trades,test_avg_trades_per_window,combined_score
108,6,kmeans,8,1,False,1.679252,0.176792,0.536709,12.941772,12.941772,0.894341
68,5,kmeans,8,1,False,1.592839,0.157706,0.48969,14.85567,14.85567,0.874608
7,4,birch,7,1,True,1.411155,0.114387,0.555851,17.037234,17.037234,0.810217
150,7,kmeans,9,1,False,1.418191,0.119055,0.524297,11.081841,11.081841,0.744988
