In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sktime.forecasting.model_selection import SlidingWindowSplitter
import os
import sys

# Make the project's helper module importable.
# sys.path entries must be directories (or zip archives), so register the
# folder that contains utils.py rather than the path of the file itself.
_utils_dir = os.path.dirname(
    os.path.abspath(
        "/projects/genomic-ml/da2343/ml_project_2/unsupervised/kmeans/utils.py"
    )
)
# Guard against piling up duplicate entries when the cell is re-run.
if _utils_dir not in sys.path:
    sys.path.append(_utils_dir)
from utils import *


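# NOTE: the cross-validation below uses sktime's SlidingWindowSplitter (imported above).
# If a RandomStartSlidingWindowSplitter is wanted instead, it must be defined or imported here.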

# Synthetic daily series for 2020-2022: linear trend + yearly sine wave + Gaussian noise.
# The legacy global seed is fixed so the noise draw is reproducible.
np.random.seed(42)
dates = pd.date_range('2020-01-01', '2022-12-31', freq='D')
n = dates.size
trend = np.linspace(0, 100, num=n)                          # rises from 0 to 100 over the span
seasonality = np.sin(2 * np.pi * np.arange(n) / 365.25) * 10  # amplitude 10, period 365.25 days
noise = np.random.normal(loc=0, scale=5, size=n)            # zero-mean noise, sd 5
y = (trend + seasonality) + noise
# Create features (using lag features for this example)
def create_features(y, lag=30, ds=None):
    """Build a frame of lagged-target and calendar features.

    Parameters
    ----------
    y : array-like
        Observed values of the series, in time order.
    lag : int, default 30
        Number of lag columns to create (``lag_1`` ... ``lag_<lag>``), where
        ``lag_i`` is ``y`` shifted down by ``i`` rows.
    ds : sequence of datetimes, optional
        Timestamps aligned with ``y``. When omitted, the notebook-level
        ``dates`` variable is used, so existing ``create_features(y)`` calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``y``, ``ds``, ``lag_1`` ... ``lag_<lag>``, ``month``, ``day``.
        Rows containing any missing value are dropped (this always removes the
        first ``lag`` rows, whose lags are undefined) and the index is reset.
    """
    if ds is None:
        # Backward-compatible fallback to the global defined earlier in the notebook.
        ds = dates
    df = pd.DataFrame({'y': y, 'ds': ds})
    for i in range(1, lag + 1):
        df[f'lag_{i}'] = df['y'].shift(i)
    # Calendar features derived from the timestamp column.
    df['month'] = df['ds'].dt.month
    df['day'] = df['ds'].dt.day
    return df.dropna().reset_index(drop=True)

# Build the lagged feature frame, then separate the target from the predictors.
# `y` is rebound here from the raw series to the target column of the trimmed frame.
df = create_features(y)
y = df['y']
X = df.drop(columns=['y', 'ds'])

# Sliding-window splitter from sktime: window_length of 500 rows, a forecasting
# horizon covering steps 1 through 10, and a step of one row between windows.
splitter = SlidingWindowSplitter(
    fh=np.arange(1, 11),
    window_length=500,
    step_length=1,
)

# Empty containers for per-fold results
mse_scores, predictions, true_values = [], [], []

# Iterate over the folds produced by the splitter and report, for each one,
# the fold number, the size of both index sets, and the indices themselves.
# Each report is framed by a blank line above and below.
for fold, (train_index, test_index) in enumerate(splitter.split(X)):
    print(
        "",
        f"Fold {fold}",
        f"Length of train set: {len(train_index)}",
        f"Length of test set: {len(test_index)}",
        f"Train indices: {train_index}",
        f"Test indices: {test_index}",
        "",
        sep="\n",
    )

In [None]:
# Parameter table: every list has nine entries, one per configuration (row).
data_dict = {
    # Values that vary from one configuration to the next
    'train_period': [3840] + [4800] * 4 + [8640] * 4,
    'random_seed': [20, 10, 20, 42, 200, 7, 7, 10, 90],
    'num_clusters': [110, 80, 80, 70, 90, 80, 80, 90, 70],
    'clustering_algorithm': ['gaussian_mixture'] * 8 + ['kmeans'],
    'max_cluster_labels': [2, 2, 1, 1, 1, 1, 2, 2, 2],

    # Values shared by all nine configurations
    'num_perceptually_important_points': [5] * 9,
    'distance_measure': [1] * 9,
    'atr_multiplier': [10] * 9,
    'price_history_length': [24] * 9,
    'test_period': [960] * 9,
}
params_concat_df = pd.DataFrame(data=data_dict)
params_concat_df

In [None]:
# Display the parameter table again for inspection (bare expression -> rich cell output).
params_concat_df