In [3]:
import pandas as pd
import talib
import numpy as np

# Read the raw GBP/USD 15-minute candles and index them by their timestamp.
df = pd.read_csv(
    '/projects/genomic-ml/da2343/ml_project_2/data/gen_oanda_data/GBP_USD_M15_raw_data.csv',
    parse_dates=["time"],
).set_index("time")

# Calendar features taken straight from the DatetimeIndex.
df = df.assign(
    year=df.index.year,
    month=df.index.month,
    day_of_week=df.index.dayofweek,
    hour=df.index.hour,
    minute=df.index.minute,
)

# ATR with a one-bar period.
# NOTE(review): with timeperiod=1 this is presumably the per-bar true range — confirm against TA-Lib.
df["atr"] = talib.ATR(df["high"], df["low"], df["close"], timeperiod=1)

# Drop rows containing NaN, then keep everything from 2018 onwards
# (the filter is `>= 2018`, so later years are retained as well).
df = df.dropna().loc[lambda frame: frame["year"] >= 2018]

# Clipping bounds: despite the names, these are the 40th and 90th percentiles
# of the ATR, not the first and third quartiles.
Q1 = df['atr'].quantile(0.4)
Q3 = df['atr'].quantile(0.9)
IQR = Q3 - Q1  # width of the clipping band; not used further in this cell

# 'atr_cleaned' is the ATR limited to the [Q1, Q3] band.
df = df.assign(atr_cleaned=df['atr'].clip(lower=Q1, upper=Q3))

df

Unnamed: 0_level_0,open,high,low,close,volume,year,month,day_of_week,hour,minute,atr,atr_cleaned
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-01 22:00:00+00:00,1.35021,1.35153,1.35021,1.35149,66,2018,1,0,22,0,0.00132,0.00132
2018-01-01 22:15:00+00:00,1.35112,1.35152,1.35100,1.35101,16,2018,1,0,22,15,0.00052,0.00067
2018-01-01 22:30:00+00:00,1.35144,1.35163,1.35077,1.35098,86,2018,1,0,22,30,0.00086,0.00086
2018-01-01 22:45:00+00:00,1.35145,1.35238,1.35092,1.35180,141,2018,1,0,22,45,0.00146,0.00146
2018-01-01 23:00:00+00:00,1.35137,1.35238,1.35069,1.35196,3858,2018,1,0,23,0,0.00169,0.00169
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-16 09:00:00+00:00,1.25766,1.25846,1.25746,1.25805,1086,2024,2,4,9,0,0.00100,0.00100
2024-02-16 09:15:00+00:00,1.25806,1.25864,1.25786,1.25854,1180,2024,2,4,9,15,0.00078,0.00078
2024-02-16 09:30:00+00:00,1.25856,1.25890,1.25842,1.25850,1455,2024,2,4,9,30,0.00048,0.00067
2024-02-16 09:45:00+00:00,1.25850,1.25882,1.25831,1.25858,1198,2024,2,4,9,45,0.00051,0.00067


In [None]:
# Summarise the raw and clipped ATR ranges together with the clipping bounds.
print(
    f"Original ATR range: {df['atr'].min()} to {df['atr'].max()}",
    f"Cleaned ATR range: {df['atr_cleaned'].min()} to {df['atr_cleaned'].max()}",
    f"Q1: {Q1}",
    f"Q3: {Q3}",
    sep="\n",
)

# Optional: overlay the raw and clipped series on a single axis.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['atr'], label='Original ATR', alpha=0.5)
ax.plot(df.index, df['atr_cleaned'], label='Cleaned ATR', alpha=0.8)
ax.set_title('ATR - Original vs Cleaned (Clipped to Q1-Q3 range)')
ax.legend()
plt.show()

In [4]:
import pandas as pd
import numpy as np
from typing import Tuple, Iterator, Optional

class OrderedSlidingWindowSplitter:
    """
    Walk-forward splitter that yields (train, test) positional index arrays.

    Each window consists of ``train_weeks`` trading weeks followed immediately by
    ``test_weeks`` trading weeks. A trading week is taken to run from Sunday 22:00
    to Friday 21:45 in the index's own timezone. Windows advance by ``step_size``
    weeks. The expected sizes assume 15-minute bars, 24 hours a day, 5 days a week.
    """

    # Number of bars in one trading week: 5 days * 24 hours * 4 bars per hour.
    _BARS_PER_WEEK = 5 * 24 * 4
    # Upper bound (in inches) for the figure height used by plot_splits.
    _MAX_PLOT_HEIGHT = 50

    def __init__(self, train_weeks: int, test_weeks: int = 2, step_size: int = 1,
                 allow_partial_window: bool = True, min_test_size: float = 0.85):
        """
        Initialize the OrderedSlidingWindowSplitter.

        Args:
            train_weeks (int): Number of weeks for the training data.
            test_weeks (int): Number of weeks for the test data.
            step_size (int): Number of weeks to slide the window. Must be positive.
            allow_partial_window (bool): Whether to allow partial windows at the end of the dataset.
            min_test_size (float): Minimum size of test set as a fraction of expected size.

        Raises:
            ValueError: If step_size is not positive (the window would never advance
                and split() would not terminate).
        """
        if step_size <= 0:
            raise ValueError("step_size must be a positive number of weeks")

        self.train_weeks = train_weeks
        self.test_weeks = test_weeks
        self.step_size = step_size
        self.allow_partial_window = allow_partial_window
        self.min_test_size = min_test_size

    def split(self, X: pd.DataFrame) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        """
        Generate sliding window splits.

        This is a generator, so input validation only runs once iteration starts.
        The first and last index entries are used as the data's start and end, so
        the index is expected to be in chronological order.

        Args:
            X (pd.DataFrame): Input data with DatetimeIndex.

        Yields:
            Tuple containing:
                - train indices (positions into X)
                - test indices (positions into X)

        Raises:
            ValueError: If the input fails _validate_input.
        """
        self._validate_input(X)
        start_date = X.index[0]
        end_date = X.index[-1]

        expected_test_points = self.test_weeks * self._BARS_PER_WEEK
        min_test_points = int(expected_test_points * self.min_test_size)

        # Keep sliding while a full train+test span, measured from the cursor, fits in the data.
        while start_date + pd.Timedelta(weeks=self.train_weeks + self.test_weeks) <= end_date:
            train_start = self._next_sunday_open(start_date)
            train_end = self._friday_close_after_weeks(train_start, self.train_weeks)
            test_start = self._next_sunday_open(train_end)
            test_end = self._friday_close_after_weeks(test_start, self.test_weeks)

            train_mask = self._create_market_hours_mask(X, train_start, train_end)
            # The test window is truncated at the last available timestamp.
            test_mask = self._create_market_hours_mask(X, test_start, min(test_end, end_date))

            train_indices = np.where(train_mask)[0]
            test_indices = np.where(test_mask)[0]

            # A too-short test window ends the iteration unless partial windows are allowed.
            if len(test_indices) < min_test_points and not self.allow_partial_window:
                break

            yield train_indices, test_indices

            start_date += pd.Timedelta(weeks=self.step_size)

    def get_n_splits(self, X: pd.DataFrame) -> int:
        """
        Calculate the number of splits.

        The splits are generated and counted, so this costs as much as a full
        pass over split().

        Args:
            X (pd.DataFrame): Input data with DatetimeIndex.

        Returns:
            int: Number of splits.

        Raises:
            ValueError: If the input fails _validate_input.
        """
        # split() validates X itself as soon as it is iterated.
        return sum(1 for _ in self.split(X))

    def _validate_input(self, X: pd.DataFrame) -> None:
        """
        Validate the input data.

        Args:
            X (pd.DataFrame): Input data with DatetimeIndex.

        Raises:
            ValueError: If the index is not a DatetimeIndex, or there are fewer rows
                than one full train+test window of 15-minute, 5-day-week bars.
        """
        if not isinstance(X.index, pd.DatetimeIndex):
            raise ValueError("Input data must have a DatetimeIndex")
        if len(X) < (self.train_weeks + self.test_weeks) * self._BARS_PER_WEEK:
            raise ValueError("Insufficient data for at least one split")

    def _next_sunday_open(self, date: pd.Timestamp) -> pd.Timestamp:
        """
        Find the Sunday 22:00 on or after the calendar day of ``date``.

        If ``date`` already falls on a Sunday, that same day's 22:00 is returned,
        even when ``date`` is later than 22:00.

        Args:
            date (pd.Timestamp): Starting date.

        Returns:
            pd.Timestamp: Sunday at 22:00.
        """
        # dayofweek: Monday=0 ... Sunday=6, so this is the day count up to Sunday.
        days_until_sunday = (6 - date.dayofweek) % 7
        next_sunday = date + pd.Timedelta(days=days_until_sunday)
        return next_sunday.replace(hour=22, minute=0, second=0, microsecond=0)

    def _friday_close_after_weeks(self, start: pd.Timestamp, weeks: int) -> pd.Timestamp:
        """
        Find the Friday 21:45 that closes the ``weeks``-th trading week.

        ``start`` is expected to fall on the Sunday that opens the first week; its
        time of day is ignored.

        Args:
            start (pd.Timestamp): Sunday on which the first trading week opens.
            weeks (int): Number of trading weeks covered.

        Returns:
            pd.Timestamp: Friday at 21:45 after the specified number of weeks.
        """
        # Anchor at midnight of the opening Sunday. Adding the offset to the 22:00
        # open itself would overshoot to Saturday 19:45 instead of Friday 21:45.
        week_anchor = start.normalize()
        return week_anchor + pd.Timedelta(weeks=weeks, days=-2, hours=21, minutes=45)

    def _create_market_hours_mask(self, X: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp) -> np.ndarray:
        """
        Create a boolean mask for market hours within the given period.

        A row is kept when its timestamp lies in [start, end] (both inclusive) and
        it falls on Monday-Friday, or on Sunday from 22:00 onwards. Saturdays and
        Sundays before 22:00 are excluded.

        Args:
            X (pd.DataFrame): Input data with DatetimeIndex.
            start (pd.Timestamp): Start of the period.
            end (pd.Timestamp): End of the period.

        Returns:
            np.ndarray: Boolean mask for market hours.
        """
        in_period = (X.index >= start) & (X.index <= end)
        is_weekday = X.index.dayofweek < 5
        is_sunday_session = (X.index.dayofweek == 6) & (X.index.hour >= 22)
        return in_period & (is_weekday | is_sunday_session)

    def plot_splits(self, X: pd.DataFrame, n_splits: Optional[int] = None) -> None:
        """
        Plot the splits for visualization.

        Every split is drawn as one row on a shared axis: training timestamps in
        blue, test timestamps in red.

        Args:
            X (pd.DataFrame): Input data with DatetimeIndex.
            n_splits (int, optional): Number of splits to plot. If None, plot all splits.
        """
        import matplotlib.pyplot as plt

        splits = list(self.split(X))
        if n_splits is not None:
            splits = splits[:n_splits]

        # 5 inches per split, clamped: zero splits would give a zero-height figure,
        # and hundreds of splits would give a figure too tall to render.
        fig_height = min(max(5 * len(splits), 5), self._MAX_PLOT_HEIGHT)
        fig, ax = plt.subplots(figsize=(15, fig_height))
        for i, (train_idx, test_idx) in enumerate(splits):
            # Only the first row carries labels so the legend has one entry per colour.
            ax.plot(X.index[train_idx], [i] * len(train_idx), 'b.', label='Train' if i == 0 else '')
            ax.plot(X.index[test_idx], [i] * len(test_idx), 'r.', label='Test' if i == 0 else '')

        ax.set_yticks(range(len(splits)))
        ax.set_yticklabels([f'Split {i+1}' for i in range(len(splits))])
        ax.legend()
        plt.title('Ordered Sliding Window Splits')
        plt.xlabel('Date')
        plt.tight_layout()
        plt.show()

In [5]:
# `df` is the feature frame produced by the data-loading cell earlier in this notebook.

# Window configuration, kept in one place so the "expected" counts printed below
# cannot drift away from what the splitter was actually built with.
TRAIN_WEEKS = 4
TEST_WEEKS = 2
BARS_PER_WEEK = 5 * 24 * 4  # 15-minute bars, 24 hours a day, 5 days a week

window_splitter = OrderedSlidingWindowSplitter(train_weeks=TRAIN_WEEKS, test_weeks=TEST_WEEKS, step_size=1)

print(f"Total number of splits: {window_splitter.get_n_splits(df)}")

for window, (train_indices, test_indices) in enumerate(window_splitter.split(df), 1):
    # The splitter yields positional indices, hence iloc.
    train_data = df.iloc[train_indices]
    test_data = df.iloc[test_indices]

    print(f"\nWindow {window}:")
    print(f"Train data: {train_data.index[0]} to {train_data.index[-1]}")
    print(f"Train data shape: {train_data.shape}")
    print(f"Test data: {test_data.index[0]} to {test_data.index[-1]}")
    print(f"Test data shape: {test_data.shape}")
    print(f"Expected train points: {window_splitter.train_weeks * BARS_PER_WEEK}, Actual: {len(train_indices)}")
    print(f"Expected test points: {window_splitter.test_weeks * BARS_PER_WEEK}, Actual: {len(test_indices)}")

print("\nAll windows processed.")


Total number of splits: 314

Window 1:
Train data: 2018-01-07 22:00:00+00:00 to 2018-02-02 21:45:00+00:00
Train data shape: (1920, 12)
Test data: 2018-02-04 22:00:00+00:00 to 2018-02-16 21:45:00+00:00
Test data shape: (960, 12)
Expected train points: 1920, Actual: 1920
Expected test points: 960, Actual: 960

Window 2:
Train data: 2018-01-14 22:00:00+00:00 to 2018-02-09 21:45:00+00:00
Train data shape: (1920, 12)
Test data: 2018-02-11 22:00:00+00:00 to 2018-02-23 21:45:00+00:00
Test data shape: (960, 12)
Expected train points: 1920, Actual: 1920
Expected test points: 960, Actual: 960

Window 3:
Train data: 2018-01-21 22:00:00+00:00 to 2018-02-16 21:45:00+00:00
Train data shape: (1920, 12)
Test data: 2018-02-18 22:00:00+00:00 to 2018-03-02 21:45:00+00:00
Test data shape: (960, 12)
Expected train points: 1920, Actual: 1920
Expected test points: 960, Actual: 960

Window 4:
Train data: 2018-01-28 22:00:00+00:00 to 2018-02-23 21:45:00+00:00
Train data shape: (1920, 12)
Test data: 2018-02-25 