In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sktime.forecasting.model_selection import SlidingWindowSplitter


# Assume the RandomStartSlidingWindowSplitter class is defined here or imported

# Synthetic daily series: linear trend + sine cycle + Gaussian noise.
# The legacy global NumPy seed is fixed first so the noise draw is repeatable.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2022-12-31', freq='D')
n = dates.size
day_number = np.arange(n)
trend = np.linspace(0, 100, num=n)  # climbs from 0 to 100 across the full span
seasonality = 10 * np.sin(2 * np.pi * day_number / 365.25)  # amplitude 10, period 365.25 samples
noise = np.random.normal(loc=0, scale=5, size=n)  # mean 0, standard deviation 5
y = trend + seasonality + noise

# Create features (using lag features for this example)
def create_features(y, lag=30, ds=None):
    """Build a lag-feature table for a univariate time series.

    Parameters
    ----------
    y : array-like
        Observations in chronological order. Any index on the input is
        ignored; values are paired with ``ds`` by position.
    lag : int, default 30
        Number of lag columns to create (``lag_1`` .. ``lag_<lag>``).
    ds : array-like of datetimes, optional
        Timestamp for each observation. When omitted, the module-level
        ``dates`` is used, which keeps existing ``create_features(y)`` calls
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``y``, ``ds``, ``lag_1`` .. ``lag_<lag>``, ``month``, ``day``.
        Rows containing NaN (the first ``lag`` rows) are dropped and the
        index is reset to 0..k-1.

    Raises
    ------
    ValueError
        If ``y`` and ``ds`` do not have the same length.
    """
    if ds is None:
        # Backward-compatible fallback to the notebook-level date index.
        ds = dates

    # Strip any pandas index so values and timestamps are matched by position.
    values = np.asarray(y)
    stamps = pd.DatetimeIndex(ds)
    if len(values) != len(stamps):
        raise ValueError(
            f"y has {len(values)} observations but ds has {len(stamps)} timestamps"
        )

    base = pd.DataFrame({'y': values, 'ds': stamps})

    # Build every lag column up front and attach them in a single concat
    # rather than inserting columns one at a time.
    lagged = pd.DataFrame(
        {f'lag_{i}': base['y'].shift(i) for i in range(1, lag + 1)},
        index=base.index,
    )
    calendar = pd.DataFrame({
        'month': base['ds'].dt.month,
        'day': base['ds'].dt.day,
    })

    df = pd.concat([base, lagged, calendar], axis=1)
    # shift() leaves NaN in the first `lag` rows; drop them and renumber.
    return df.dropna().reset_index(drop=True)

# Build the feature table, then separate the predictors from the target.
df = create_features(y)
X = df.drop(columns=['y', 'ds'])
y = df['y']  # note: `y` now refers to the target column of `df`, not the raw series

# Configure the custom splitter
splitter = RandomStartSlidingWindowSplitter(
    n_splits=10,
    train_size=500,
    test_size=10,
    randomness=0.5,
)

# Containers for per-fold results
mse_scores, predictions, true_values = [], [], []

# Iterate over the folds and report which row positions each split contains
for fold, (train_index, test_index) in enumerate(splitter.split(X)):
    print(
        "",
        f"Fold {fold}",
        f"Train indices: {train_index}",
        f"Test indices: {test_index}",
        "",
        sep="\n",
    )

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import os
import shutil
import sys
import pandas as pd
import json

# Load the project configuration file.
# The encoding is passed explicitly: JSON files are conventionally UTF-8, and
# open() would otherwise fall back to the platform's locale encoding.
config_path = "/projects/genomic-ml/da2343/ml_project_2/settings/config.json"
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)
# Keep only the "trading_settings" section; a missing key raises KeyError.
config_settings = config["trading_settings"]

# Hyper-parameter grid: every combination of the value lists below becomes one
# row of the parameter table.
params_dict = dict(
    max_cluster_labels=[1, 2, 5],
    price_history_length=[24],
    num_perceptually_important_points=[5],
    distance_measure=[1],
    num_clusters=[70, 80, 90, 100, 110, 120],
    atr_multiplier=[10],
    clustering_algorithm=['kmeans', 'gaussian_mixture'],
    # A hand-picked set of seeds; the wider alternative was np.arange(1, 100).
    random_seed=[1, 2, 4, 7, 10, 12, 15, 18, 20, 21, 42, 50, 80, 90, 100, 200, 300],
    train_period=[30, 40, 50, 60, 70, 80, 90],  # days
    test_period=[10],  # days
)

# Cartesian product of all value lists, flattened into a plain-indexed frame.
grid_index = pd.MultiIndex.from_product(
    list(params_dict.values()),
    names=list(params_dict.keys()),
)
params_df = grid_index.to_frame(index=False)

# Collected in a list so that further grids could be concatenated alongside it.
params_df_list = [params_df]
params_concat_df = pd.concat(params_df_list, ignore_index=True)
params_concat_df

Unnamed: 0,max_cluster_labels,price_history_length,num_perceptually_important_points,distance_measure,num_clusters,atr_multiplier,clustering_algorithm,random_seed,train_period,test_period
0,1,24,5,1,70,10,kmeans,1,30,10
1,1,24,5,1,70,10,kmeans,1,40,10
2,1,24,5,1,70,10,kmeans,1,50,10
3,1,24,5,1,70,10,kmeans,1,60,10
4,1,24,5,1,70,10,kmeans,1,70,10
...,...,...,...,...,...,...,...,...,...,...
4279,5,24,5,1,120,10,gaussian_mixture,300,50,10
4280,5,24,5,1,120,10,gaussian_mixture,300,60,10
4281,5,24,5,1,120,10,gaussian_mixture,300,70,10
4282,5,24,5,1,120,10,gaussian_mixture,300,80,10


In [6]:
# Small placeholder table that reuses the parameter column names.
# NOTE(review): this rebinds `params_concat_df`, replacing whatever table was
# built under that name earlier - confirm that is intended before running
# later cells that read it.
_placeholder_cities = ['New York', 'San Francisco', 'Los Angeles', 'Chicago']
_city_columns = (
    'distance_measure',
    'num_clusters',
    'atr_multiplier',
    'clustering_algorithm',
    'random_seed',
    'train_period',
)
data_dict = {
    'max_cluster_labels': ['Alice', 'Bob', 'Charlie', 'David'],
    'price_history_length': [25, 30, 35, 28],
    'num_perceptually_important_points': list(_placeholder_cities),
    # Each column gets its own copy of the list, so they stay independent.
    **{column: list(_placeholder_cities) for column in _city_columns},
    'test_period': [10] * 4,
}
params_concat_df = pd.DataFrame(data_dict)
print(params_concat_df)

  max_cluster_labels  price_history_length num_perceptually_important_points  \
0              Alice                    25                          New York   
1                Bob                    30                     San Francisco   
2            Charlie                    35                       Los Angeles   
3              David                    28                           Chicago   

  distance_measure   num_clusters atr_multiplier clustering_algorithm  \
0         New York       New York       New York             New York   
1    San Francisco  San Francisco  San Francisco        San Francisco   
2      Los Angeles    Los Angeles    Los Angeles          Los Angeles   
3          Chicago        Chicago        Chicago              Chicago   

     random_seed   train_period  test_period  
0       New York       New York           10  
1  San Francisco  San Francisco           10  
2    Los Angeles    Los Angeles           10  
3        Chicago        Chicago           10 

In [7]:
# Bare expression: echoes the current `params_concat_df` as this cell's output.
params_concat_df

Unnamed: 0,max_cluster_labels,price_history_length,num_perceptually_important_points,distance_measure,num_clusters,atr_multiplier,clustering_algorithm,random_seed,train_period,test_period
0,Alice,25,New York,New York,New York,New York,New York,New York,New York,10
1,Bob,30,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,San Francisco,10
2,Charlie,35,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles,10
3,David,28,Chicago,Chicago,Chicago,Chicago,Chicago,Chicago,Chicago,10


In [None]:
# Stage a SLURM array job: create a timestamped job directory, write the
# parameter table and launcher script into it, copy the worker script, and
# also drop a params.csv next to the original script for a local dry run.
n_tasks, ncol = params_concat_df.shape
if n_tasks == 0:
    # With zero rows the launcher below would render "--array=0--1", which is
    # a malformed range; fail early with a clear message instead.
    raise ValueError("params_concat_df is empty; no SLURM array tasks to create")

date_time = datetime.now().strftime("%Y-%m-%d_%H:%M")
job_name = f"ml_project_2_{date_time}"
job_dir = "/scratch/da2343/" + job_name
results_dir = os.path.join(job_dir, "results")
# os.makedirs raises on failure and needs no shell, unlike the previous
# os.system("mkdir -p ...") call whose exit status was ignored.
os.makedirs(results_dir, exist_ok=True)
params_concat_df.to_csv(os.path.join(job_dir, "params.csv"), index=False)

# Launcher: one array index per row of the parameter table (0 .. n_tasks-1);
# each task passes its index to run_one.py as the only argument.
run_one_contents = f"""#!/bin/bash
#SBATCH --array=0-{n_tasks-1}
#SBATCH --time=24:00:00
#SBATCH --mem=2GB
#SBATCH --cpus-per-task=1
#SBATCH --error={job_dir}/slurm-%A_%a.out
#SBATCH --output={job_dir}/slurm-%A_%a.out
#SBATCH --job-name={job_name}
cd {job_dir}
python run_one.py $SLURM_ARRAY_TASK_ID
"""
run_one_sh = os.path.join(job_dir, "run_one.sh")
with open(run_one_sh, "w") as run_one_f:
    run_one_f.write(run_one_contents)

# Copy the worker script into the job directory under the name the launcher expects.
# run_orig_py = "demo_run.py"
run_orig_py = "demo_run_optimized.py"
run_one_py = os.path.join(job_dir, "run_one.py")
shutil.copyfile(run_orig_py, run_one_py)

# Mirror the layout beside the original script (results/ directory and
# params.csv) so it can be run directly without SLURM.
orig_dir = os.path.dirname(run_orig_py)
orig_results = os.path.join(orig_dir, "results")
os.makedirs(orig_results, exist_ok=True)
orig_csv = os.path.join(orig_dir, "params.csv")
params_concat_df.to_csv(orig_csv, index=False)

msg = f"""created params CSV files and job scripts, test with
python {run_orig_py}
SLURM_ARRAY_TASK_ID=0 bash {run_one_sh}"""
print(msg)