In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import os
import shutil
import sys
import json

# TRAINING HYPER-PARAMETERS

In [2]:
params_df_list = []

# Seed policy: BIRCH always runs with a single fixed seed, whereas the other
# algorithms are each run with ten distinct seeds drawn from range(1000).
# NOTE(review): the draw below is not seeded in this cell, so the seeds differ
# between runs unless the global NumPy RNG was seeded elsewhere -- confirm that
# this is intended.
BIRCH_SEED = 42
DYNAMIC_SEEDS = np.random.choice(1000, size=10, replace=False)

# Base search grid shared by every algorithm. The "clustering_algorithm" entry
# is overridden in each of the two variants derived from it further down.
params_dict = {
    "instrument": [
        # Currently excluded from the grid:
        # "EUR_USD_M15", "GBP_USD_M15", "USD_JPY_M15", "USD_CHF_M15",
        # "USD_CAD_M15", "AUD_USD_M15", "AUD_JPY_M15", "AUD_CAD_M15", "EUR_GBP_M15",
        "EUR_JPY_M15",
        "GBP_CHF_M15",
        "GBP_JPY_M15",
        "EUR_CHF_M15",
        "AUD_NZD_M15",
        "CAD_JPY_M15",
        "NZD_USD_M15",
        "EUR_CAD_M15",
    ],
    "price_history_length": [24],
    "num_perceptually_important_points": [4],
    "num_clusters": [5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60],
    "clustering_algorithm": ["kmeans", "birch", "gaussian_mixture"],
    "train_period": [4, 5, 6, 7, 8, 9, 10, 11, 12, 14],
    "test_period": [1],
    "reverse_test": [True, False],
}


def _expand_grid(grid):
    """Return a DataFrame with one row per combination of the values in ``grid``.

    Columns appear in the key order of ``grid``; the last key varies fastest.
    The result carries a fresh 0..n-1 index.
    """
    combos = pd.MultiIndex.from_product(list(grid.values()), names=list(grid))
    return combos.to_frame(index=False)


# Variant grids. Dict unpacking keeps "clustering_algorithm" in its original
# position and appends "random_seed" as the final column.
birch_params = {
    **params_dict,
    "clustering_algorithm": ["birch"],
    "random_seed": [BIRCH_SEED],
}
other_params = {
    **params_dict,
    "clustering_algorithm": ["kmeans", "gaussian_mixture"],
    "random_seed": DYNAMIC_SEEDS,
}

birch_df = _expand_grid(birch_params)
other_df = _expand_grid(other_params)

# BIRCH rows first, then the remaining algorithms, renumbered consecutively.
params_concat_df = pd.concat([birch_df, other_df], ignore_index=True)
params_concat_df

Unnamed: 0,instrument,price_history_length,num_perceptually_important_points,num_clusters,clustering_algorithm,train_period,test_period,reverse_test,random_seed
0,EUR_JPY_M15,24,4,5,birch,4,1,True,42
1,EUR_JPY_M15,24,4,5,birch,4,1,False,42
2,EUR_JPY_M15,24,4,5,birch,5,1,True,42
3,EUR_JPY_M15,24,4,5,birch,5,1,False,42
4,EUR_JPY_M15,24,4,5,birch,6,1,True,42
...,...,...,...,...,...,...,...,...,...
43675,EUR_CAD_M15,24,4,60,gaussian_mixture,14,1,False,755
43676,EUR_CAD_M15,24,4,60,gaussian_mixture,14,1,False,311
43677,EUR_CAD_M15,24,4,60,gaussian_mixture,14,1,False,795
43678,EUR_CAD_M15,24,4,60,gaussian_mixture,14,1,False,564


# TESTING BEST HYPER-PARAMETERS

In [None]:
# Best configuration found for each instrument, one record per instrument:
# (instrument, num_clusters, clustering_algorithm, train_period, reverse_test)
_best_rows = [
    ("EUR_USD_M15", 6, "kmeans", 8, False),
    ("GBP_USD_M15", 4, "birch", 9, True),
    ("USD_JPY_M15", 5, "kmeans", 5, True),
    ("USD_CHF_M15", 5, "kmeans", 14, True),
    ("USD_CAD_M15", 4, "birch", 11, True),
    ("AUD_USD_M15", 8, "birch", 9, True),
    ("AUD_JPY_M15", 4, "kmeans", 12, False),
    ("AUD_CAD_M15", 4, "kmeans", 14, True),
    ("EUR_GBP_M15", 4, "kmeans", 5, True),
    ("EUR_JPY_M15", 5, "birch", 4, True),
    ("GBP_CHF_M15", 4, "birch", 11, True),
    ("GBP_JPY_M15", 4, "birch", 10, True),
    ("EUR_CHF_M15", 7, "kmeans", 7, False),
    ("AUD_NZD_M15", 4, "kmeans", 10, True),
    ("CAD_JPY_M15", 5, "kmeans", 4, False),
    ("NZD_USD_M15", 7, "birch", 7, True),
    ("EUR_CAD_M15", 6, "kmeans", 5, True),
]

# Transpose the records back into one list per column.
_instruments, _cluster_counts, _algorithms, _train_periods, _reverse_flags = (
    list(column) for column in zip(*_best_rows)
)
_n_best = len(_best_rows)  # 17 instruments

# Key order here fixes the column order of the resulting DataFrame.
best_params = {
    "instrument": _instruments,
    "num_clusters": _cluster_counts,
    "clustering_algorithm": _algorithms,
    "train_period": _train_periods,
    "test_period": [1] * _n_best,  # all use 1 week test period
    "price_history_length": [24] * _n_best,  # all use 24
    "num_perceptually_important_points": [4] * _n_best,  # all use 4
    "reverse_test": _reverse_flags,
    "random_seed": [388] * _n_best,  # using a consistent seed
}

# NOTE: this rebinds params_concat_df, the same name the training-grid cell
# assigns and the job-creation cell reads; whichever cell ran last wins.
params_concat_df = pd.DataFrame(best_params)
params_concat_df

In [3]:

# Turn the current parameter grid (params_concat_df) into a SLURM array job:
# one array task per row. This cell creates a timestamped job directory,
# writes params.csv and a run_one.sh batch script into it, copies the worker
# script there as run_one.py, and also drops params.csv plus a results/
# directory next to the original worker script for local testing.
n_tasks, ncol = params_concat_df.shape
if n_tasks == 0:
    # An empty grid would render "--array=0--1", which is not a valid index
    # range; fail here before anything is written to disk.
    raise ValueError("params_concat_df is empty; no tasks to schedule")

# The job name (and therefore the job directory) is stamped to the minute.
date_time = datetime.now().strftime("%Y-%m-%d_%H:%M")
job_name = f"ml_project_2_{date_time}"
job_dir = "/scratch/da2343/" + job_name
results_dir = os.path.join(job_dir, "results")
# os.makedirs creates job_dir and results/ in one call, without going through
# a shell, and raises on failure instead of returning an ignored exit status.
os.makedirs(results_dir, exist_ok=True)
params_concat_df.to_csv(os.path.join(job_dir, "params.csv"), index=False)

print(f"created {n_tasks} tasks in {job_dir}")

# Batch script: array indices 0..n_tasks-1 are passed to run_one.py as the
# task id; stdout and stderr of each task go to the same slurm-<job>_<task>.out.
run_one_contents = f"""#!/bin/bash
#SBATCH --array=0-{n_tasks-1}
#SBATCH --time=24:00:00
#SBATCH --mem=4GB
#SBATCH --cpus-per-task=1
#SBATCH --error={job_dir}/slurm-%A_%a.out
#SBATCH --output={job_dir}/slurm-%A_%a.out
#SBATCH --job-name={job_name}
cd {job_dir}
python run_one.py $SLURM_ARRAY_TASK_ID
"""
run_one_sh = os.path.join(job_dir, "run_one.sh")
with open(run_one_sh, "w") as run_one_f:
    run_one_f.write(run_one_contents)

# Snapshot the worker script into the job directory so the job runs the copy
# taken now, not whatever demo_run_gfd.py looks like later.
run_orig_py = "demo_run_gfd.py"
run_one_py = os.path.join(job_dir, "run_one.py")
shutil.copyfile(run_orig_py, run_one_py)

# Mirror params.csv and results/ beside the original script. Because
# run_orig_py has no directory component, orig_dir is "" and these paths
# resolve relative to the current working directory.
orig_dir = os.path.dirname(run_orig_py)
orig_results = os.path.join(orig_dir, "results")
os.makedirs(orig_results, exist_ok=True)
orig_csv = os.path.join(orig_dir, "params.csv")
params_concat_df.to_csv(orig_csv, index=False)

msg = f"""created params CSV files and job scripts, test with
python {run_orig_py}
SLURM_ARRAY_TASK_ID=0 bash {run_one_sh}"""
print(msg)


created 43680 tasks in /scratch/da2343/ml_project_2_2024-12-27_17:41
created params CSV files and job scripts, test with
python demo_run_gfd.py
SLURM_ARRAY_TASK_ID=0 bash /scratch/da2343/ml_project_2_2024-12-27_17:41/run_one.sh
