In [1]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
import pandas as pd
import numpy as np
import os
import shutil
import sys
import json

# FUNCTIONS

In [2]:
def filter_csv_pandas(input_file, instrument_column, target_instrument,
                      algorithms=('kmeans', 'birch'), chunksize=10000):
    """Stream a results CSV and keep only the rows of one instrument.

    The file is read chunk by chunk, so the full CSV is never held in memory
    at once. A row is kept when its ``instrument_column`` value equals
    ``target_instrument`` and its ``clustering_algorithm`` value is one of
    ``algorithms``.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read (anything ``pd.read_csv`` accepts).
    instrument_column : str
        Name of the column that holds the instrument identifier.
    target_instrument : object
        Value to match in ``instrument_column``.
    algorithms : str or iterable of str, optional
        Accepted ``clustering_algorithm`` values. Defaults to
        ``('kmeans', 'birch')``.
    chunksize : int, optional
        Number of rows read per chunk. Defaults to 10000.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh ``0..n-1`` index. When nothing matches
        (or the required columns are absent) the frame is empty but still
        carries the CSV's column names, so column lookups on the result do
        not raise ``KeyError``.
    """
    # A bare string would otherwise be split into characters by ``list``.
    if isinstance(algorithms, str):
        algorithms = [algorithms]
    accepted_algorithms = list(algorithms)

    filtered_chunks = []
    header_columns = None  # remembered so an empty result keeps its schema

    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        if header_columns is None:
            header_columns = chunk.columns

        # Chunks lacking either filter column cannot match anything.
        if (instrument_column not in chunk.columns
                or 'clustering_algorithm' not in chunk.columns):
            continue

        keep = (
            (chunk[instrument_column] == target_instrument)
            & chunk['clustering_algorithm'].isin(accepted_algorithms)
        )
        filtered_chunk = chunk[keep]
        if not filtered_chunk.empty:
            filtered_chunks.append(filtered_chunk)

    if filtered_chunks:
        return pd.concat(filtered_chunks, ignore_index=True)

    # No matching rows: return an empty frame that still has the columns.
    return pd.DataFrame(columns=header_columns)
    
    

def find_best_combinations(raw_df, n_top_combo=120):
    # Group by strategy parameters
    group_cols = ['num_clusters', 'clustering_algorithm', 'train_period', 
                  'test_period', 'reverse_test', 'random_seed']
    
    # Calculate mean performance metrics for each parameter combination
    qualified_params = raw_df.groupby(group_cols).agg({
        'test_profit_factor': 'mean',
        'test_sharpe_ratio': 'mean',
        'test_win_ratio': 'mean',
        'test_num_trades': 'mean',
        'test_avg_trades_per_window': 'mean'
    }).reset_index()
    
    # Calculate stability metrics across seeds for each parameter combination (excluding seed)
    stability_group = group_cols.copy()
    stability_group.remove('random_seed')
    
    stability_metrics = qualified_params.groupby(stability_group).agg({
        'test_profit_factor': 'std',
        'test_sharpe_ratio': 'std',
        'test_win_ratio': 'std',
        'test_avg_trades_per_window': 'std'
    }).reset_index()
    
    # Rename stability columns
    stability_metrics.columns = stability_group + ['pf_std', 'sharpe_std', 'win_std', 'trades_std']
    
    # Merge stability metrics back
    qualified_params = qualified_params.merge(
        stability_metrics,
        on=stability_group
    )
    
    # Calculate stability score (lower std = higher score)
    qualified_params['stability_score'] = (
        (1 / (1 + qualified_params['pf_std'])) +
        (1 / (1 + qualified_params['sharpe_std'])) +
        (1 / (1 + qualified_params['win_std'])) +
        (1 / (1 + qualified_params['trades_std']))
    ) / 4
    
    # Normalize stability score
    qualified_params['stability_score'] = (qualified_params['stability_score'] - 
                                         qualified_params['stability_score'].min()) / (
                                         qualified_params['stability_score'].max() - 
                                         qualified_params['stability_score'].min())
                                         
    # if stability score is NaN, set it to 0
    qualified_params['stability_score'] = qualified_params['stability_score'].fillna(0.9)                                         
    
    # Calculate combined score with stability
    qualified_params['combined_score'] = (
        qualified_params['test_profit_factor'] * 0.35 +
        qualified_params['test_sharpe_ratio'] * 0.35 +
        qualified_params['test_win_ratio'] * 0.2 +
        qualified_params['stability_score'] * 0.1
    )
    
    # Sort by combined score and get top combinations
    best_params = qualified_params.sort_values('combined_score', ascending=False)
    actual_n_top = min(n_top_combo, len(best_params))
    top_combinations = best_params.head(actual_n_top)
    
    # Create display DataFrame with formatted columns
    display_df = pd.DataFrame({
        'Rank': range(1, actual_n_top + 1),
        'Clusters': top_combinations['num_clusters'],
        'Algorithm': top_combinations['clustering_algorithm'],
        'Train Period': top_combinations['train_period'].astype(str) + 'w',
        'Rev.Test': top_combinations['reverse_test'],
        'Seed': top_combinations['random_seed'],
        'PF': top_combinations['test_profit_factor'].round(3),
        'Sharpe': top_combinations['test_sharpe_ratio'].round(3),
        'Win%': top_combinations['test_win_ratio'].round(3),
        'Trades': top_combinations['test_avg_trades_per_window'].round(1),
        'Stability': top_combinations['stability_score'].round(3),
        'Score': top_combinations['combined_score'].round(3)
    })
    
    # Apply styling to the display DataFrame
    styled_df = display_df.style\
        .format({
            'PF': '{:.3f}',
            'Sharpe': '{:.3f}',
            'Win%': '{:.1%}',
            'Trades': '{:.1f}',
            'Stability': '{:.3f}',
            'Score': '{:.3f}'
        })\
        .background_gradient(subset=['Score', 'Stability'], cmap='YlOrRd')\
        .background_gradient(subset=['PF'], cmap='YlOrRd')\
        .background_gradient(subset=['Sharpe'], cmap='YlOrRd')\
        .background_gradient(subset=['Win%'], cmap='YlOrRd')\
        .background_gradient(subset=['Trades'], cmap='YlOrRd')\
        .set_properties(**{
            'text-align': 'right',
            'font-family': 'monospace',
            'padding': '5px'
        })\
        .hide(axis="index")
    
    return best_params, styled_df


# FIND BEST COMBINATION SCRIPTS

In [3]:
# Instruments to evaluate: 50 identifiers, each a ticker symbol with an
# `_M15` suffix (presumably 15-minute bars -- TODO confirm). The order here
# determines the row order of the parameter table built later.
instrument_list = [
    "AAPL_M15", "MSFT_M15", "AMZN_M15", "GOOGL_M15", "NVDA_M15",
    "TSLA_M15", "META_M15", "AMD_M15", "NFLX_M15", "INTC_M15",
    "CSCO_M15", "PEP_M15", "COST_M15", "QCOM_M15", "TXN_M15",
    "SBUX_M15", "AMGN_M15", "AVGO_M15", "ADP_M15", "ISRG_M15",
    "MU_M15", "MDLZ_M15", "BKNG_M15", "GILD_M15", "FISV_M15",
    "ATVI_M15", "ADI_M15", "LRCX_M15", "KLAC_M15", "MAR_M15",
    "MCHP_M15", "EA_M15", "EXC_M15", "ILMN_M15", "IDXX_M15",
    "MNST_M15", "PAYX_M15", "LULU_M15", "ORLY_M15", "VRTX_M15",
    "REGN_M15", "ASML_M15", "CSX_M15", "SNPS_M15", "CDNS_M15",
    "DXCM_M15", "KDP_M15", "MTD_M15", "SWKS_M15", "WDAY_M15",
]

In [4]:
# Timestamp prefix of the results CSV to analyse. Earlier runs that were
# previously selected here: "2025-01-19_08:02", "2025-01-19_01:28".
time = "2025-01-24_20:45"

# The path does not depend on the instrument, so build it once.
results_csv = f"/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/results/{time}_results.csv"

# Column-oriented accumulator: each list receives one entry per instrument
best_params_dict = {
    "instrument": [],
    "num_clusters": [],
    "clustering_algorithm": [],
    "train_period": [],
    "test_period": [],
    "price_history_length": [],
    "num_perceptually_important_points": [],
    "reverse_test": [],
    "random_seed": [],
    "test_profit_factor": [],
}

# Loop through each instrument.
# NOTE: filter_csv_pandas streams the whole CSV on every call, so the file is
# scanned once per instrument.
for instrument in instrument_list:
    raw_df = filter_csv_pandas(results_csv, "instrument", instrument)

    # Drop short-direction rows; skip the column lookup when there is nothing
    # to filter (an empty result may not carry any columns).
    if not raw_df.empty:
        raw_df = raw_df[raw_df['test_direction'] != 'short']
    # raw_df = raw_df[raw_df['num_clusters'] == 5]
    # raw_df = raw_df[raw_df['clustering_algorithm'] == 'kmeans']

    if raw_df.empty:
        # Without this guard a single missing instrument aborts the whole loop.
        print(f"skipping {instrument}: no usable rows in {results_csv}")
        continue

    best_params, styled_df = find_best_combinations(raw_df)
    if best_params.empty:
        print(f"skipping {instrument}: no scored parameter combinations")
        continue
    best_row = best_params.iloc[0]  # Get top performing combination

    # Append parameters to dictionary
    best_params_dict["instrument"].append(instrument)
    best_params_dict["num_clusters"].append(int(best_row["num_clusters"]))
    best_params_dict["clustering_algorithm"].append(best_row["clustering_algorithm"])
    best_params_dict["train_period"].append(int(best_row["train_period"]))
    best_params_dict["test_period"].append(int(best_row["test_period"]))
    # The next two values are fixed constants, not read from the results
    # (presumably the settings the results were produced with -- TODO confirm).
    best_params_dict["price_history_length"].append(24)
    best_params_dict["num_perceptually_important_points"].append(4)
    best_params_dict["reverse_test"].append(bool(best_row["reverse_test"]))
    best_params_dict["random_seed"].append(int(best_row["random_seed"]))
    best_params_dict["test_profit_factor"].append(best_row["test_profit_factor"])

# Convert to DataFrame (one row per instrument that had usable results)
params_concat_df = pd.DataFrame(best_params_dict).reset_index(drop=True)

In [5]:
# Display the per-instrument best-parameter table assembled in the previous cell.
params_concat_df

Unnamed: 0,instrument,num_clusters,clustering_algorithm,train_period,test_period,price_history_length,num_perceptually_important_points,reverse_test,random_seed,test_profit_factor
0,AAPL_M15,5,birch,1,1,24,4,False,42,1.58779
1,MSFT_M15,7,birch,1,1,24,4,False,42,1.150613
2,AMZN_M15,7,kmeans,7,1,24,4,False,426,1.473523
3,GOOGL_M15,7,kmeans,1,1,24,4,False,426,1.617416
4,NVDA_M15,7,birch,14,1,24,4,False,42,1.475583
5,TSLA_M15,7,birch,13,1,24,4,False,42,1.356909
6,META_M15,7,birch,1,1,24,4,False,42,1.462068
7,AMD_M15,5,kmeans,7,1,24,4,False,737,1.317595
8,NFLX_M15,5,birch,9,1,24,4,False,42,1.427743
9,INTC_M15,7,birch,14,1,24,4,False,42,1.629367


In [None]:
# styled_df
# `best_row` is the variable left over from the selection loop, so this shows
# the top-ranked combination of the last instrument processed only.
best_row

# Create a Job for HPC

In [6]:
# Build a SLURM array job: one task per row of params_concat_df.
n_tasks, ncol = params_concat_df.shape
if n_tasks == 0:
    # An empty table would produce the invalid array spec "--array=0--1".
    raise ValueError("params_concat_df is empty: no tasks to schedule")

date_time = datetime.now().strftime("%Y-%m-%d_%H:%M")
job_name = f"ml_project_2_{date_time}"
job_dir = "/scratch/da2343/" + job_name
results_dir = os.path.join(job_dir, "results")
# makedirs raises on failure, whereas a shelled-out `mkdir -p` via os.system
# would fail silently; exist_ok=True keeps the `-p` semantics.
os.makedirs(results_dir, exist_ok=True)
params_concat_df.to_csv(os.path.join(job_dir, "params.csv"), index=False)

print(f"created {n_tasks} tasks in {job_dir}")

# Batch script: each array task runs run_one.py with its task id as argument.
# stdout and stderr deliberately share one log file per task.
run_one_contents = f"""#!/bin/bash
#SBATCH --array=0-{n_tasks-1}
#SBATCH --time=5:00:00
#SBATCH --mem=4GB
#SBATCH --cpus-per-task=1
#SBATCH --error={job_dir}/slurm-%A_%a.out
#SBATCH --output={job_dir}/slurm-%A_%a.out
#SBATCH --job-name={job_name}
cd {job_dir}
python run_one.py $SLURM_ARRAY_TASK_ID
"""
run_one_sh = os.path.join(job_dir, "run_one.sh")
with open(run_one_sh, "w") as run_one_f:
    run_one_f.write(run_one_contents)

# Snapshot the worker script into the job directory so that later edits to
# the original do not affect this job.
run_orig_py = "/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/demo_run_stock.py"
run_one_py = os.path.join(job_dir, "run_one.py")
shutil.copyfile(run_orig_py, run_one_py)

# Also write params.csv (and a results directory) next to the original script
# so that it can be run in place for a quick local test. This overwrites any
# params.csv already there.
orig_dir = os.path.dirname(run_orig_py)
orig_results = os.path.join(orig_dir, "results")
os.makedirs(orig_results, exist_ok=True)
orig_csv = os.path.join(orig_dir, "params.csv")
params_concat_df.to_csv(orig_csv, index=False)

msg = f"""created params CSV files and job scripts, test with
python {run_orig_py}
SLURM_ARRAY_TASK_ID=0 bash {run_one_sh}"""
print(msg)

created 50 tasks in /scratch/da2343/ml_project_2_2025-01-26_21:55
created params CSV files and job scripts, test with
python /projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/demo_run_stock.py
SLURM_ARRAY_TASK_ID=0 bash /scratch/da2343/ml_project_2_2025-01-26_21:55/run_one.sh
