<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/nested_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        # Check for date format
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        # Check for numeric price columns
        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        # Validate high >= low
        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            # Save outputs with error handling
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics."""
        try:
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high', 'parent_low']
            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns
            numeric_cols = ['intra_period_count', 'intra_period_high', 'intra_period_low',
                          'intra_period_reu', 'intra_period_red', 'range_expansion_flag',
                          'intra_period_bar_of_h', 'intra_period_bar_of_l',
                          'bar_rpc', 'intra_period_cumulative_rpc']

            for col in numeric_cols:
                df[col] = 0

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                parent_high = parent_stats[parent_stats['parent_period'] == period]['parent_high'].iloc[0]
                parent_low = parent_stats[parent_stats['parent_period'] == period]['parent_low'].iloc[0]

                processed_period = self._process_parent_period(period_data, parent_high, parent_low)
                df.loc[mask] = processed_period

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(
        self, period_data: pd.DataFrame,
        parent_high: float,
        parent_low: float
    ) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            intra_high = float('-inf')
            intra_low = float('inf')

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Initialize first bar
                if intra_high == float('-inf'):
                    intra_high = row['high']
                    intra_low = row['low']

                # Calculate range expansions
                reu = max(0, row['high'] - intra_high)
                red = max(0, intra_low - row['low'])

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update parent boundary bars
                if row['high'] == parent_high and period_data.at[idx, 'intra_period_bar_of_h'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_h'] = active_bars
                if row['low'] == parent_low and period_data.at[idx, 'intra_period_bar_of_l'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_l'] = active_bars

                # Update running high/low
                intra_high = max(intra_high, row['high'])
                intra_low = min(intra_low, row['low'])
                period_data.at[idx, 'intra_period_high'] = intra_high
                period_data.at[idx, 'intra_period_low'] = intra_low

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

def run_tests():
    """Run unit tests for the framework."""
    import unittest

    class TestTemporalFramework(unittest.TestCase):
        def setUp(self):
            self.processor = ConnectTemporalPeriods()

        def test_data_validation(self):
            # Test missing columns
            df_missing = pd.DataFrame({'date': [], 'high': [], 'low': []})
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_missing)

            # Test invalid date format
            df_bad_date = pd.DataFrame({
                'date': ['bad_date'],
                'open': [1], 'high': [2], 'low': [1], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_bad_date)

            # Test high < low
            df_invalid_hl = pd.DataFrame({
                'date': ['2024-01-01'],
                'open': [1], 'high': [1], 'low': [2], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_invalid_hl)

        def test_rpc_calculation(self):
            # Test no expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(0, 0, 'N', 'N'), 0
            )

            # Test two-way expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 1, 'N', 'U'), 2
            )

            # Test direction change
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 0, 'D', 'U'), 1
            )

        def test_rpc_direction(self):
            # Test no expansion
            self.assertEqual(
                self.processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U'),
                'U'
            )

            # Test two-way expansion with close in upper half
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N'),
                'U'
            )

            # Test upward expansion only
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D'),
                'U'
            )

    # Run tests
    unittest.main(argv=['dummy'])

if __name__ == "__main__":
    run_tests()
    processor = ConnectTemporalPeriods()
    processor.process_files()


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


SystemExit: False

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "ME"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        # Check for date format
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        # Check for numeric price columns
        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        # Validate high >= low
        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            # Save outputs with error handling
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics."""
        try:
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high', 'parent_low']
            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns
            numeric_cols = ['intra_period_count', 'intra_period_high', 'intra_period_low',
                          'intra_period_reu', 'intra_period_red', 'range_expansion_flag',
                          'intra_period_bar_of_h', 'intra_period_bar_of_l',
                          'bar_rpc', 'intra_period_cumulative_rpc']

            for col in numeric_cols:
                df[col] = 0

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                parent_high = parent_stats[parent_stats['parent_period'] == period]['parent_high'].iloc[0]
                parent_low = parent_stats[parent_stats['parent_period'] == period]['parent_low'].iloc[0]

                processed_period = self._process_parent_period(period_data, parent_high, parent_low)
                df.loc[mask] = processed_period

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(
        self, period_data: pd.DataFrame,
        parent_high: float,
        parent_low: float
    ) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            intra_high = float('-inf')
            intra_low = float('inf')

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Initialize first bar
                if intra_high == float('-inf'):
                    intra_high = row['high']
                    intra_low = row['low']

                # Calculate range expansions
                reu = max(0, row['high'] - intra_high)
                red = max(0, intra_low - row['low'])

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update parent boundary bars
                if row['high'] == parent_high and period_data.at[idx, 'intra_period_bar_of_h'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_h'] = active_bars
                if row['low'] == parent_low and period_data.at[idx, 'intra_period_bar_of_l'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_l'] = active_bars

                # Update running high/low
                intra_high = max(intra_high, row['high'])
                intra_low = min(intra_low, row['low'])
                period_data.at[idx, 'intra_period_high'] = intra_high
                period_data.at[idx, 'intra_period_low'] = intra_low

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

def run_tests():
    """Run unit tests for the framework."""
    import unittest

    class TestTemporalFramework(unittest.TestCase):
        def setUp(self):
            self.processor = ConnectTemporalPeriods()

        def test_data_validation(self):
            # Test missing columns
            df_missing = pd.DataFrame({'date': [], 'high': [], 'low': []})
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_missing)

            # Test invalid date format
            df_bad_date = pd.DataFrame({
                'date': ['bad_date'],
                'open': [1], 'high': [2], 'low': [1], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_bad_date)

            # Test high < low
            df_invalid_hl = pd.DataFrame({
                'date': ['2024-01-01'],
                'open': [1], 'high': [1], 'low': [2], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_invalid_hl)

        def test_rpc_calculation(self):
            # Test no expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(0, 0, 'N', 'N'), 0
            )

            # Test two-way expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 1, 'N', 'U'), 2
            )

            # Test direction change
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 0, 'D', 'U'), 1
            )

        def test_rpc_direction(self):
            # Test no expansion
            self.assertEqual(
                self.processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U'),
                'U'
            )

            # Test two-way expansion with close in upper half
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N'),
                'U'
            )

            # Test upward expansion only
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D'),
                'U'
            )

    # Run tests
    unittest.main()

if __name__ == "__main__":
    run_tests()
    processor = ConnectTemporalPeriods()
    processor.process_files()

E
ERROR: /root/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/root/'

----------------------------------------------------------------------
Ran 1 test in 0.005s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass
import unittest

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        # Check for date format
        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        # Check for numeric price columns
        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        # Validate high >= low
        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            # Save outputs with error handling
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics."""
        try:
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high', 'parent_low']
            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns
            numeric_cols = ['intra_period_count', 'intra_period_high', 'intra_period_low',
                          'intra_period_reu', 'intra_period_red', 'range_expansion_flag',
                          'intra_period_bar_of_h', 'intra_period_bar_of_l',
                          'bar_rpc', 'intra_period_cumulative_rpc']

            for col in numeric_cols:
                df[col] = 0

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                parent_high = parent_stats[parent_stats['parent_period'] == period]['parent_high'].iloc[0]
                parent_low = parent_stats[parent_stats['parent_period'] == period]['parent_low'].iloc[0]

                processed_period = self._process_parent_period(period_data, parent_high, parent_low)
                df.loc[mask] = processed_period

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(
        self, period_data: pd.DataFrame,
        parent_high: float,
        parent_low: float
    ) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            intra_high = float('-inf')
            intra_low = float('inf')

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Initialize first bar
                if intra_high == float('-inf'):
                    intra_high = row['high']
                    intra_low = row['low']

                # Calculate range expansions
                reu = max(0, row['high'] - intra_high) if intra_high != float('-inf') else 0
                red = max(0, intra_low - row['low']) if intra_low != float('inf') else 0

                # Then update running high/low for next iteration
                intra_high = max(intra_high, row['high']) if intra_high != float('-inf') else row['high']
                intra_low = min(intra_low, row['low']) if intra_low != float('inf') else row['low']

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update parent boundary bars
                if row['high'] == parent_high and period_data.at[idx, 'intra_period_bar_of_h'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_h'] = active_bars
                if row['low'] == parent_low and period_data.at[idx, 'intra_period_bar_of_l'] == 0:
                    period_data.at[idx, 'intra_period_bar_of_l'] = active_bars

                # Update running high/low
                intra_high = max(intra_high, row['high'])
                intra_low = min(intra_low, row['low'])
                period_data.at[idx, 'intra_period_high'] = intra_high
                period_data.at[idx, 'intra_period_low'] = intra_low

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

def run_tests():
    """Run unit tests for the framework."""

    class TestTemporalFramework(unittest.TestCase):
        def setUp(self):
            self.processor = ConnectTemporalPeriods()

        def test_data_validation(self):
            # Test missing columns
            df_missing = pd.DataFrame({'date': [], 'high': [], 'low': []})
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_missing)

            # Test invalid date format
            df_bad_date = pd.DataFrame({
                'date': ['bad_date'],
                'open': [1], 'high': [2], 'low': [1], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_bad_date)

            # Test high < low
            df_invalid_hl = pd.DataFrame({
                'date': ['2024-01-01'],
                'open': [1], 'high': [1], 'low': [2], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_invalid_hl)

        def test_rpc_calculation(self):
            # Test no expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(0, 0, 'N', 'N'), 0
            )

            # Test two-way expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 1, 'N', 'U'), 2
            )

            # Test direction change
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 0, 'D', 'U'), 1
            )

        def test_rpc_direction(self):
            # Test no expansion
            self.assertEqual(
                self.processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U'),
                'U'
            )

            # Test two-way expansion with close in upper half
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N'),
                'U'
            )

            # Test upward expansion only
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D'),
                'U'
            )

    # Run tests
    unittest.main(argv=['dummy'], exit=False)

if __name__ == "__main__":
    run_tests()
    processor = ConnectTemporalPeriods()
    processor.process_files()
# %%


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK
  period_data.at[idx, 'intra_period_high'] = intra_high
  period_data.at[idx, 'intra_period_low'] = intra_low
  period_data.at[idx, 'intra_period_red'] = red
  period_data.at[idx, 'intra_period_reu'] = reu
 124.0133743 124.0133743 124.0133743 124.0133743 124.0133743 124.0133743
 124.0133743 124.0133743 124.0133743 124.0133743 124.0133743 124.0133743
 126.2541809]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[mask] = processed_period
 116.3294296 115.8361206 115.8361206 114.7240829 114.7240829 112.5752487
 112.5752487 112.5752487 112.5752487 112.5752487 112.5752487 112.5752487
 112.5752487]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[mask] = processed_period
 0.        0.        0.        0.        0.        0.        0.
 0.        0.        0.        0.        2.2408066]' has dtype incom

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "ME"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

def run_tests():
    """Run unit tests for the framework."""
    import unittest

    class TestTemporalFramework(unittest.TestCase):
        def setUp(self):
            self.processor = ConnectTemporalPeriods()

        def test_data_validation(self):
            # Test missing columns
            df_missing = pd.DataFrame({'date': [], 'high': [], 'low': []})
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_missing)

            # Test invalid date format
            df_bad_date = pd.DataFrame({
                'date': ['bad_date'],
                'open': [1], 'high': [2], 'low': [1], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_bad_date)

            # Test high < low
            df_invalid_hl = pd.DataFrame({
                'date': ['2024-01-01'],
                'open': [1], 'high': [1], 'low': [2], 'close': [1.5]
            })
            with self.assertRaises(DataValidationError):
                self.processor.validate_dataframe(df_invalid_hl)

        def test_rpc_calculation(self):
            # Test no expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(0, 0, 'N', 'N'), 0
            )

            # Test two-way expansion
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 1, 'N', 'U'), 2
            )

            # Test direction change
            self.assertEqual(
                self.processor._calculate_bar_rpc(1, 0, 'D', 'U'), 1
            )

        def test_rpc_direction(self):
            # Test no expansion
            self.assertEqual(
                self.processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U'),
                'U'
            )

            # Test two-way expansion with close in upper half
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N'),
                'U'
            )

            # Test upward expansion only
            self.assertEqual(
                self.processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D'),
                'U'
            )

    # Run tests
    unittest.main(argv=['dummy'])

if __name__ == "__main__":
    run_tests()
    processor = ConnectTemporalPeriods()
    processor.process_files()


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


SystemExit: False

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "ME"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

ERROR:__main__:Error processing MMM: Invalid frequency: ME, failed to parse with error message: ValueError("for Period, please use 'M' instead of 'ME'")
ERROR:__main__:Error processing files: Invalid frequency: ME, failed to parse with error message: ValueError("for Period, please use 'M' instead of 'ME'")


All tests passed!


ValueError: Invalid frequency: ME, failed to parse with error message: ValueError("for Period, please use 'M' instead of 'ME'")

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

ERROR:__main__:Error calculating parent stats: {e}
ERROR:__main__:Error processing MMM: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"
ERROR:__main__:Error processing files: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"


All tests passed!


KeyError: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

ERROR:__main__:Error calculating parent stats: {e}
ERROR:__main__:Error processing MMM: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"
ERROR:__main__:Error processing files: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"


All tests passed!


KeyError: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame = None) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

ERROR:__main__:Error calculating parent stats: {e}
ERROR:__main__:Error processing MMM: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"
ERROR:__main__:Error processing files: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"


All tests passed!


KeyError: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame = None) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

ERROR:__main__:Error calculating parent stats: {e}
ERROR:__main__:Error processing MMM: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"
ERROR:__main__:Error processing files: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"


All tests passed!


KeyError: "Column(s) ['intra_period_cumulative_rpc', 'range_expansion_flag'] do not exist"

In [None]:
def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high', 'parent_low']

            # Calculate bar positions and supplementary stats for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

                # Add expansion and RPC counts
                parent_stats.loc[parent_stats['parent_period'] == period, 'expansion_count'] = \
                    period_data['range_expansion_flag'].sum()
                parent_stats.loc[parent_stats['parent_period'] == period, 'rpc_count'] = \
                    period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raiseimport pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        """Validate input dataframe structure and content."""
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        """Process all CSV files in the input directory."""
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        """Process a single ticker's data file."""
        try:
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])

            self.validate_dataframe(df)

            df['parent_period'] = df['date'].dt.to_period(self.parent_period)
            parent_stats = self._calculate_parent_stats(df)

            processed_df = self._process_child_data(df, parent_stats)

            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(processed_df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        """Safely save DataFrame to CSV with error handling."""
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Calculate parent period statistics including bar locations and counts."""
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': ['first', 'count'],
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum',
                'intra_period_cumulative_rpc': 'max'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'duration', 'parent_high',
                                  'parent_low', 'expansion_count', 'rpc_count']

            # Calculate bar positions for each parent period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # Find first occurrence of high and low
                bar_of_high = period_data[period_data['high'] == period_high].index.min()
                bar_of_low = period_data[period_data['low'] == period_low].index.min()

                if bar_of_high is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_high'] = \
                        period_data.loc[:bar_of_high].notna().sum()
                if bar_of_low is not None:
                    parent_stats.loc[parent_stats['parent_period'] == period, 'bar_of_low'] = \
                        period_data.loc[:bar_of_low].notna().sum()

            return parent_stats

        except Exception as e:
            self.logger.error("Error calculating parent stats: {e}")
            raise

    def _process_child_data(self, df: pd.DataFrame, parent_stats: pd.DataFrame = None) -> pd.DataFrame:
        """Process child period data with detailed metrics."""
        try:
            df = df.copy()

            # Initialize columns with correct dtypes
            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()

                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            return df

        except Exception as e:
            self.logger.error(f"Error processing child data: {e}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        """Process data within a single parent period."""
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions using prior values
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        """Determine the RPC direction based on expansions."""
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                # Two-way expansion - use close location
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        """Calculate the bar's RPC value."""
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        # Test RPC calculation
        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        # Test RPC direction
        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    # Run tests
    tests = TestTemporalFramework()
    tests.run_all_tests()

    # Process files
    processor = ConnectTemporalPeriods()
    processor.process_files()

SyntaxError: invalid syntax (<ipython-input-17-a46306e6a797>, line 40)

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        try:
            # Load and validate data
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])
            self.validate_dataframe(df)

            # Add period column and initialize fields
            df['parent_period'] = df['date'].dt.to_period(self.parent_period)

            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(0, index=df.index, dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()
                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            # Calculate parent stats
            parent_stats = self._calculate_parent_stats(df)

            # Save outputs
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': 'first',
                'high': 'max',
                'low': 'min',
                'date': 'count'
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'parent_high', 'parent_low', 'duration']

            # Initialize supplementary columns
            parent_stats['bar_of_high'] = 0
            parent_stats['bar_of_low'] = 0
            parent_stats['expansion_count'] = 0
            parent_stats['rpc_count'] = 0

            # Calculate additional stats per period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # First occurrence of high/low
                high_idx = period_data[period_data['high'] == period_high].index.min()
                low_idx = period_data[period_data['low'] == period_low].index.min()

                idx = parent_stats['parent_period'] == period
                if high_idx is not None:
                    high_bar = period_data.loc[:high_idx, 'high'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_high'] = high_bar

                if low_idx is not None:
                    low_bar = period_data.loc[:low_idx, 'low'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_low'] = low_bar

                # Count expansions and RPCs
                parent_stats.loc[idx, 'expansion_count'] = period_data['range_expansion_flag'].sum()
                parent_stats.loc[idx, 'rpc_count'] = period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                # Calculate RPC
                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    tests = TestTemporalFramework()
    tests.run_all_tests()

    processor = ConnectTemporalPeriods()
    processor.process_files()

All tests passed!


ERROR:__main__:Error calculating parent stats: Length mismatch: Expected axis has 4 elements, new values have 5 elements
ERROR:__main__:Error processing MMM: Length mismatch: Expected axis has 4 elements, new values have 5 elements
ERROR:__main__:Error processing files: Length mismatch: Expected axis has 4 elements, new values have 5 elements


ValueError: Length mismatch: Expected axis has 4 elements, new values have 5 elements

In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        try:
            # Load and validate data
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])
            self.validate_dataframe(df)

            # Add period column and initialize fields
            df['parent_period'] = df['date'].dt.to_period(self.parent_period)

            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(0, index=df.index, dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()
                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            # Calculate parent stats
            parent_stats = self._calculate_parent_stats(df)

            # Save outputs
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': 'first',
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum'  # Count
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'parent_high', 'parent_low', 'duration']

            # Initialize supplementary columns
            parent_stats['bar_of_high'] = 0
            parent_stats['bar_of_low'] = 0
            parent_stats['expansion_count'] = 0
            parent_stats['rpc_count'] = 0

            # Calculate additional stats per period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # First occurrence of high/low
                high_idx = period_data[period_data['high'] == period_high].index.min()
                low_idx = period_data[period_data['low'] == period_low].index.min()

                idx = parent_stats['parent_period'] == period
                if high_idx is not None:
                    high_bar = period_data.loc[:high_idx, 'high'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_high'] = high_bar

                if low_idx is not None:
                    low_bar = period_data.loc[:low_idx, 'low'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_low'] = low_bar

                # Count expansions and RPCs
                parent_stats.loc[idx, 'expansion_count'] = period_data['range_expansion_flag'].sum()
                parent_stats.loc[idx, 'rpc_count'] = period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = 'N'
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                # Calculate RPC
                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction
                )

                if new_direction != prev_direction and prev_direction != 'N':
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction, new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    tests = TestTemporalFramework()
    tests.run_all_tests()

    processor = ConnectTemporalPeriods()
    processor.process_files()

All tests passed!


In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        try:
            # Load and validate data
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])
            self.validate_dataframe(df)

            # Add period column and initialize fields
            df['parent_period'] = df['date'].dt.to_period(self.parent_period)

            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(0, index=df.index, dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()
                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            # Calculate parent stats
            parent_stats = self._calculate_parent_stats(df)

            # Save outputs
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': 'first',
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum'  # Count
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'parent_high', 'parent_low', 'duration']

            # Initialize supplementary columns
            parent_stats['bar_of_high'] = 0
            parent_stats['bar_of_low'] = 0
            parent_stats['expansion_count'] = 0
            parent_stats['rpc_count'] = 0

            # Calculate additional stats per period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # First occurrence of high/low
                high_idx = period_data[period_data['high'] == period_high].index.min()
                low_idx = period_data[period_data['low'] == period_low].index.min()

                idx = parent_stats['parent_period'] == period
                if high_idx is not None:
                    high_bar = period_data.loc[:high_idx, 'high'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_high'] = high_bar

                if low_idx is not None:
                    low_bar = period_data.loc[:low_idx, 'low'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_low'] = low_bar

                # Count expansions and RPCs
                parent_stats.loc[idx, 'expansion_count'] = period_data['range_expansion_flag'].sum()
                parent_stats.loc[idx, 'rpc_count'] = period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = None  # Changed from 'N' to None for initial direction
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate range expansions
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                period_data.at[idx, 'range_expansion_flag'] = 1 if (reu > 0 or red > 0) else 0

                # Calculate RPC
                new_direction = self._determine_rpc_direction(
                    reu, red, row['high'], row['low'], row['close'], prev_direction if prev_direction else 'N'
                )

                # First movement establishes direction without incrementing RPC
                if prev_direction is None and (reu > 0 or red > 0):
                    prev_direction = new_direction
                elif new_direction != prev_direction and prev_direction is not None:
                    cumulative_rpc += 1

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc
                period_data.at[idx, 'bar_rpc'] = self._calculate_bar_rpc(
                    reu, red, prev_direction if prev_direction else 'N', new_direction
                )

                # Update prior values for next iteration
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

                if reu > 0 or red > 0:  # Only update direction if there's movement
                    prev_direction = new_direction

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    tests = TestTemporalFramework()
    tests.run_all_tests()

    processor = ConnectTemporalPeriods()
    processor.process_files()

All tests passed!


In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        try:
            # Load and validate data
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])
            self.validate_dataframe(df)

            # Add period column and initialize fields
            df['parent_period'] = df['date'].dt.to_period(self.parent_period)

            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(0, index=df.index, dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()
                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            # Calculate parent stats
            parent_stats = self._calculate_parent_stats(df)

            # Save outputs
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': 'first',
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum'  # Count
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'parent_high', 'parent_low', 'duration']

            # Initialize supplementary columns
            parent_stats['bar_of_high'] = 0
            parent_stats['bar_of_low'] = 0
            parent_stats['expansion_count'] = 0
            parent_stats['rpc_count'] = 0

            # Calculate additional stats per period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # First occurrence of high/low
                high_idx = period_data[period_data['high'] == period_high].index.min()
                low_idx = period_data[period_data['low'] == period_low].index.min()

                idx = parent_stats['parent_period'] == period
                if high_idx is not None:
                    high_bar = period_data.loc[:high_idx, 'high'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_high'] = high_bar

                if low_idx is not None:
                    low_bar = period_data.loc[:low_idx, 'low'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_low'] = low_bar

                # Count expansions and RPCs
                parent_stats.loc[idx, 'expansion_count'] = period_data['range_expansion_flag'].sum()
                parent_stats.loc[idx, 'rpc_count'] = period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = None
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate expansions
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                has_expansion = (reu > 0 or red > 0)
                period_data.at[idx, 'range_expansion_flag'] = 1 if has_expansion else 0

                # Calculate bar's RPC
                if has_expansion:
                    new_direction = self._determine_rpc_direction(
                        reu, red, row['high'], row['low'], row['close'], prev_direction if prev_direction else 'N'
                    )

                    if prev_direction is None:
                        # First direction establishes
                        prev_direction = new_direction
                        period_data.at[idx, 'bar_rpc'] = 1
                        cumulative_rpc += 1
                    elif new_direction != prev_direction:
                        # New direction counts as RPC
                        period_data.at[idx, 'bar_rpc'] = 1
                        cumulative_rpc += 1
                        prev_direction = new_direction
                    else:
                        # Expansion in same direction
                        period_data.at[idx, 'bar_rpc'] = 0
                else:
                    new_direction = prev_direction if prev_direction else 'N'
                    period_data.at[idx, 'bar_rpc'] = 0

                # Update priors
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    tests = TestTemporalFramework()
    tests.run_all_tests()

    processor = ConnectTemporalPeriods()
    processor.process_files()

All tests passed!


In [None]:
import pandas as pd
import os
import logging
from typing import Tuple, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('temporal_processing.log'),
        logging.StreamHandler()
    ]
)

@dataclass
class ParentPeriodStats:
    high: float
    low: float
    duration: int
    completed_bars: int
    rpc_count: int
    bar_of_high: int
    bar_of_low: int

class DataValidationError(Exception):
    pass

class ConnectTemporalPeriods:
    REQUIRED_COLUMNS = {'date', 'open', 'high', 'low', 'close'}

    def __init__(self, child_period: str = "D", parent_period: str = "M"):
        self.child_period = child_period
        self.parent_period = parent_period
        self.logger = logging.getLogger(__name__)

    def validate_dataframe(self, df: pd.DataFrame) -> None:
        missing_cols = self.REQUIRED_COLUMNS - set(df.columns)
        if missing_cols:
            raise DataValidationError(f"Missing required columns: {missing_cols}")

        if not pd.api.types.is_datetime64_any_dtype(df['date']):
            try:
                pd.to_datetime(df['date'])
            except Exception as e:
                raise DataValidationError(f"Invalid date format: {e}")

        price_cols = ['open', 'high', 'low', 'close']
        for col in price_cols:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise DataValidationError(f"Column {col} must be numeric")

        invalid_rows = df[df['high'] < df['low']].index
        if len(invalid_rows) > 0:
            raise DataValidationError(f"Found {len(invalid_rows)} rows where high < low")

    def process_files(self, input_dir: str = "input/") -> None:
        try:
            os.makedirs("parent_output", exist_ok=True)
            os.makedirs("processed_output", exist_ok=True)

            file_count = 0
            for filename in os.listdir(input_dir):
                if filename.endswith(".csv"):
                    ticker = filename.replace(".csv", "")
                    self.logger.info(f"Processing {ticker}")
                    self.process_single_file(os.path.join(input_dir, filename), ticker)
                    file_count += 1

            self.logger.info(f"Successfully processed {file_count} files")

        except Exception as e:
            self.logger.error(f"Error processing files: {e}")
            raise

    def process_single_file(self, filepath: str, ticker: str) -> None:
        try:
            # Load and validate data
            df = pd.read_csv(filepath)
            df['date'] = pd.to_datetime(df['date'])
            self.validate_dataframe(df)

            # Add period column and initialize fields
            df['parent_period'] = df['date'].dt.to_period(self.parent_period)

            numeric_cols = {
                'intra_period_count': 'int64',
                'intra_period_high': 'float64',
                'intra_period_low': 'float64',
                'intra_period_reu': 'float64',
                'intra_period_red': 'float64',
                'range_expansion_flag': 'int64',
                'bar_rpc': 'int64',
                'intra_period_cumulative_rpc': 'int64'
            }

            for col, dtype in numeric_cols.items():
                df[col] = pd.Series(0, index=df.index, dtype=dtype)

            df['rpc_direction'] = 'N'

            # Process each parent period
            for period in df['parent_period'].unique():
                mask = df['parent_period'] == period
                period_data = df[mask].copy()
                processed_period = self._process_parent_period(period_data)
                df.loc[mask] = processed_period.astype(df[mask].dtypes)

            # Calculate parent stats
            parent_stats = self._calculate_parent_stats(df)

            # Save outputs
            self._safe_save_csv(parent_stats, f"parent_output/{ticker}_parent.csv")
            self._safe_save_csv(df, f"processed_output/{ticker}_processed.csv")

            self.logger.info(f"Successfully processed {ticker}")

        except Exception as e:
            self.logger.error(f"Error processing {ticker}: {e}")
            raise

    def _safe_save_csv(self, df: pd.DataFrame, filepath: str) -> None:
        try:
            df.to_csv(filepath, index=False)
        except Exception as e:
            self.logger.error(f"Error saving to {filepath}: {e}")
            raise

    def _calculate_parent_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            # Basic stats
            parent_stats = df.groupby('parent_period').agg({
                'date': 'first',
                'high': 'max',
                'low': 'min',
                'range_expansion_flag': 'sum'  # Count
            }).reset_index()

            parent_stats.columns = ['parent_period', 'date', 'parent_high', 'parent_low', 'duration']

            # Initialize supplementary columns
            parent_stats['bar_of_high'] = 0
            parent_stats['bar_of_low'] = 0
            parent_stats['expansion_count'] = 0
            parent_stats['rpc_count'] = 0

            # Calculate additional stats per period
            for period in df['parent_period'].unique():
                period_data = df[df['parent_period'] == period]
                period_high = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_high'].iloc[0]
                period_low = parent_stats.loc[parent_stats['parent_period'] == period, 'parent_low'].iloc[0]

                # First occurrence of high/low
                high_idx = period_data[period_data['high'] == period_high].index.min()
                low_idx = period_data[period_data['low'] == period_low].index.min()

                idx = parent_stats['parent_period'] == period
                if high_idx is not None:
                    high_bar = period_data.loc[:high_idx, 'high'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_high'] = high_bar

                if low_idx is not None:
                    low_bar = period_data.loc[:low_idx, 'low'].notna().sum()
                    parent_stats.loc[idx, 'bar_of_low'] = low_bar

                # Count expansions and RPCs
                parent_stats.loc[idx, 'expansion_count'] = period_data['range_expansion_flag'].sum()
                parent_stats.loc[idx, 'rpc_count'] = period_data['intra_period_cumulative_rpc'].max()

            return parent_stats

        except Exception as e:
            self.logger.error(f"Error calculating parent stats: {str(e)}")
            raise

    def _process_parent_period(self, period_data: pd.DataFrame) -> pd.DataFrame:
        try:
            active_bars = 0
            cumulative_rpc = 0
            prev_direction = None
            prior_intra_high = None
            prior_intra_low = None

            for idx in period_data.index:
                row = period_data.loc[idx]

                if pd.isna(row['high']) or pd.isna(row['low']):
                    continue

                active_bars += 1
                period_data.at[idx, 'intra_period_count'] = active_bars

                # Calculate expansions
                reu = max(0, row['high'] - prior_intra_high) if prior_intra_high is not None else 0
                red = max(0, prior_intra_low - row['low']) if prior_intra_low is not None else 0

                # Update metrics
                period_data.at[idx, 'intra_period_reu'] = reu
                period_data.at[idx, 'intra_period_red'] = red
                has_expansion = (reu > 0 or red > 0)
                period_data.at[idx, 'range_expansion_flag'] = 1 if has_expansion else 0

                # Calculate bar's RPC
                if has_expansion:
                    new_direction = self._determine_rpc_direction(
                        reu, red, row['high'], row['low'], row['close'], prev_direction if prev_direction else 'N'
                    )

                    if prev_direction is None:
                        # First direction establishes
                        prev_direction = new_direction
                        period_data.at[idx, 'bar_rpc'] = 1
                        cumulative_rpc += 1
                    elif new_direction != prev_direction:
                        # New direction counts as RPC
                        period_data.at[idx, 'bar_rpc'] = 1
                        cumulative_rpc += 1
                        prev_direction = new_direction
                    else:
                        # Expansion in same direction
                        period_data.at[idx, 'bar_rpc'] = 0
                else:
                    new_direction = prev_direction if prev_direction else 'N'
                    period_data.at[idx, 'bar_rpc'] = 0

                period_data.at[idx, 'rpc_direction'] = new_direction
                period_data.at[idx, 'intra_period_cumulative_rpc'] = cumulative_rpc

                # Update priors
                prior_intra_high = max(row['high'], prior_intra_high if prior_intra_high is not None else row['high'])
                prior_intra_low = min(row['low'], prior_intra_low if prior_intra_low is not None else row['low'])

            return period_data

        except Exception as e:
            self.logger.error(f"Error processing parent period: {e}")
            raise

    def _determine_rpc_direction(
        self, reu: float, red: float,
        high: float, low: float, close: float,
        prev_direction: str
    ) -> str:
        try:
            if reu == 0 and red == 0:
                return prev_direction
            elif reu > 0 and red > 0:
                return 'U' if (close - low) / (high - low) >= 0.5 else 'D'
            elif reu > 0:
                return 'U'
            else:
                return 'D'

        except Exception as e:
            self.logger.error(f"Error determining RPC direction: {e}")
            raise

    def _calculate_bar_rpc(
        self, reu: float, red: float,
        prev_direction: str, new_direction: str
    ) -> int:
        try:
            if reu == 0 and red == 0:
                return 0
            elif reu > 0 and red > 0:
                if prev_direction == 'N' or new_direction != prev_direction:
                    return 2
                return 1
            elif new_direction != prev_direction and prev_direction != 'N':
                return 1
            return 0

        except Exception as e:
            self.logger.error(f"Error calculating bar RPC: {e}")
            raise

class TestTemporalFramework:
    def run_all_tests(self):
        processor = ConnectTemporalPeriods()

        assert processor._calculate_bar_rpc(0, 0, 'N', 'N') == 0, "No expansion should return 0"
        assert processor._calculate_bar_rpc(1, 1, 'N', 'U') == 2, "Two-way expansion should return 2"
        assert processor._calculate_bar_rpc(1, 0, 'D', 'U') == 1, "Direction change should return 1"

        assert processor._determine_rpc_direction(0, 0, 100, 90, 95, 'U') == 'U', "No expansion should maintain direction"
        assert processor._determine_rpc_direction(1, 1, 100, 90, 97, 'N') == 'U', "Close in upper half should return U"
        assert processor._determine_rpc_direction(1, 0, 100, 90, 95, 'D') == 'U', "Upward expansion should return U"

        print("All tests passed!")

if __name__ == "__main__":
    tests = TestTemporalFramework()
    tests.run_all_tests()

    processor = ConnectTemporalPeriods()
    processor.process_files()

All tests passed!
