<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/romulus_20250114.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import os
import sys
import pandas as pd
import numpy as np
import math
import time
import logging
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any
from tqdm import tqdm

# Constants for configuration
# Period codes for the child and parent series.
# NOTE(review): presumably D/W/M/Q/Y = day/week/month/quarter/year - confirm.
# Nothing in the visible part of this file checks child_period or
# parent_period against these lists.
VALID_CHILD_PERIODS = ['D', 'W', 'M', 'Q']
VALID_PARENT_PERIODS = ['W', 'M', 'Q', 'Y']
# Same value as the `jobname` default of GelSetProcessor.__init__.
DEFAULT_JOBNAME = 'gelset_20250107'

class GelSetProcessor:
    def __init__(self,
                 input_dir="/content/input",
                 output_child_dir="/content/output_child",
                 output_parent_dir="/content/output_parent",
                 test_dir="/content/test",
                 child_period="D",
                 parent_period="M",
                 jobname="gelset_20250107"):
        self.input_dir = input_dir
        self.output_child_dir = output_child_dir
        self.output_parent_dir = output_parent_dir
        self.test_dir = test_dir
        self.child_period = child_period
        self.parent_period = parent_period
        self.jobname = jobname

        #print("✅ GelSetProcessor initialized!")
        #print(f"📂 Input Directory: {self.input_dir}")
        #print(f"📂 Output Child Directory: {self.output_child_dir}")
        #print(f"📂 Output Parent Directory: {self.output_parent_dir}")
        #print(f"📂 Test Directory: {self.test_dir}")

        self.export_columns = [
            # Input fields
            'ticker', 'date', 'parent_lookup', 'sequence', 'open', 'high', 'low', 'close', 'volume',

            # Prior Child (PC) fields
            'bpb_reu_value', 'bpb_red_value', 'bpb_reu_flag', 'bpb_red_flag',
            'bpb_re_flag', 'bpb_twoway_flag', 'bpb_rpc',
            'bpb_pri_percentr', 'bpb_ce_percent', 'bpb_ce_value',
            'bpb_epc', 'bpb_epc_dir', 'bpb_epc_hp_flag',
             'bpb_e1_value', 'bpb_e2_value', 'bpb_e1_flag', 'bpb_e2_flag',

             # Intra-Parent (IP) fields
            'ip_open', 'ip_high', 'ip_low', 'ip_close',
            'ip_reu_value', 'ip_red_value', 'ip_reu_flag', 'ip_red_flag',
            'ip_re_flag', 'ip_twoway_flag', 'ip_rpc',
            'ip_pri_percentr', 'ip_ce_percent',
            'ip_epc', 'ip_epc_dir', 'ip_epc_hp_flag',
            'ip_e1_value', 'ip_e2_value', 'ip_e1_flag', 'ip_e2_flag',

            # Prior Parent (PP) fields
            'pp_open', 'pp_high', 'pp_low', 'pp_close',
            'pp_reu_value', 'pp_red_value', 'pp_reu_flag', 'pp_red_flag',
            'pp_re_flag', 'pp_twoway_flag', 'pp_rpc',
            'pp_pri_percentr', 'pp_ce_percent',
            'pp_epc', 'pp_epc_dir', 'pp_epc_hp_flag',
            'pp_e1_value', 'pp_e2_value', 'pp_e1_flag', 'pp_e2_flag',
        ]

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            os.makedirs(directory, exist_ok=True)
            print(f"📁 Checked/Created directory: {directory}")

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            try:
                os.makedirs(directory, exist_ok=True)
                logging.info(f"✅ Verified or created directory: {directory}")

                # Check write permissions
                if not os.access(directory, os.W_OK):
                    raise PermissionError(f"❌ No write access to: {directory}")
            except Exception as e:
                logging.error(f"❌ Failed to create/access directory {directory}: {e}")
                raise

        # Log directory paths
        #logging.info(f"📂 Input Directory: {self.input_dir}")
        #logging.info(f"📂 Output Child Directory: {self.output_child_dir}")
        #logging.info(f"📂 Output Parent Directory: {self.output_parent_dir}")
        #logging.info(f"📂 Test Directory: {self.test_dir}")

        # Ensure directories exist
        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            os.makedirs(directory, exist_ok=True)
            logging.info(f"✅ Verified or created directory: {directory}")

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            if not os.access(directory, os.W_OK):
                raise PermissionError(f"❌ No write access to: {directory}")

        # Input-data validation. TODO: consider also checking that the parsed child_period matches the configured setting.
    def validate_input_data(self, df):
        """Validate input data according to spec requirements"""

        if df.empty:
            logging.error("❗ Input data is empty")
            raise ValueError("Input data is empty")

        required_columns = ['open', 'high', 'low', 'close', 'pph', 'ppl', 'ppc']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            logging.error(f"❌ Missing required columns: {missing_cols}")
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        required_columns = ['open', 'high', 'low', 'close', 'pph', 'ppl', 'ppc']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            logging.error(f"❌ Missing required columns: {missing_cols}")
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero"),

            # Prior Parent Fields Validation
            (df['pph'] > 0, "prior parent high must be greater than 0"),
            (df['ppl'] > 0, "prior parent low must be greater than 0"),
            (df['ppc'] > 0, "close must be greater than 0"),
            (df['ppl'] <= df['ppc'], "low must be less than or equal to close"),
            (df['ppc'] <= df['pph'], "close must be less than or equal to high"),
        ]

        for condition, message in conditions:
            if not condition.all():
                logging.error(f"❌ Data validation failed: {message}")
                raise ValueError(f"Data validation failed: {message}")

        logging.info("✅ Input data passed all validation checks.")
        return True

    def safe_max(self, a, b):
        """Safely computes max while handling None values"""
        if a is None:
            return b
        if b is None:
            return a
        return max(a, b)

    def safe_min(self, a, b):
        """Safely computes min while handling None values"""
        if a is None:
            return b
        if b is None:
            return a
        return min(a, b)

    def safe_compare(self, a, b, op='gt'):
        """Safe comparison handling None values"""
        if a is None or b is None:
            return False
        if op == 'gt':
            return a > b
        elif op == 'lt':
            return a < b
        return a == b

    def safe_subtract(self, a, b):
        """Safe subtraction handling None values"""
        try:
            if a is None or b is None:
                return 0
            return a - b
        except Exception as e:
            logging.error(f"❗ Error during subtraction: {e}")
            return 0

    def process_round_one(self, data):

        """
        First Round - Establish intra-group values and prior parent expansions.
        Implements the relationship between intra-parent (IP) values, prior parent (PP) bounds, and prior child (PC) values.
        """
        output_data = []

        # Initialize group state tracker with default values
        group_state = {
            "current_group": None,

            # Prior Child (PC) Fields
            "prior_open": None,
            "prior_high": None,
            "prior_low": None,
            "prior_close": None,

            # Prior Intra-Parent Fields
            "pip_open": None,
            "pip_high": None,
            "pip_low": None,
            "pip_close": None,

            # Intra-Parent (IP) Fields
            "ip_open": None,
            "ip_high": None,
            "ip_low": None,
            "ip_close": None,

            # Intra-Parent State Tracking
            "ip_state_dir": None,
            "ip_state_id": None,
            "ip_rpc_total": 0,

            "prior_ip_state_dir": None,
            "prior_ip_state_id": None,
            "prior_ip_rpc_total": 0,

            # Prior parent fields (from input)
            "pp_high": data['pph'].iloc[0],
            "pp_low": data['ppl'].iloc[0],
            "pp_close": data['ppc'].iloc[0],

            "pp_state_dir": None,
            "pp_state_id": None,
            "pp_rpc_total": 0,

            "prior_pp_state_dir": None,
            "prior_pp_state_id": None,
            "prior_pp_rpc_total": 0,
        }

        for index, row in data.iterrows():
            try:
                print(f"\nProcessing row {index}")
                print(f"BEFORE - pip_high: {group_state['pip_high']}, ip_high: {group_state['ip_high']}, row high: {row['high']}")

                # --- Handle New Parent Group Initialization ---
                if row["parent_lookup"] != group_state["current_group"]:
                    print(f"New group: {row['parent_lookup']}")
                    group_state["current_group"] = row["parent_lookup"]

                    # Initialize Prior Child with current row values
                    group_state["prior_open"] = row["open"]
                    group_state["prior_high"] = row["high"]
                    group_state["prior_low"] = row["low"]
                    group_state["prior_close"] = row["close"]

                    # Initialize Prior Intra-Parent to None for new month
                    group_state["pip_open"] = None
                    group_state["pip_high"] = None
                    group_state["pip_low"] = None
                    group_state["pip_close"] = None

                    # Initialize Intra-Parent with current row values
                    group_state["ip_open"] = row["open"]
                    group_state["ip_high"] = row["high"]
                    group_state["ip_low"] = row["low"]
                    group_state["ip_close"] = row["close"]

                    # Reset state tracking for new month
                    group_state["prior_ip_state_dir"] = None
                    group_state["prior_ip_state_id"] = None
                    group_state["prior_ip_rpc_total"] = None

                    # Reset Prior-Parent comparison values
                    group_state["pippp_high"] = None
                    group_state["pippp_low"] = None
                    group_state["pippp_close"] = None

                    group_state["prior_pp_state_dir"] = None
                    group_state["prior_pp_state_id"] = None
                    group_state["prior_pp_rpc_total"] = None

                    print(f"After new group - pip_high: {group_state['pip_high']}, ip_high: {group_state['ip_high']}")

                # --- Process Current Row ---
                processed_row = row.to_dict()

                # Calculate new IP values
                new_ip_high = self.safe_max(group_state["pip_high"], row["high"])
                print(f"Calculated new_ip_high: {new_ip_high} from pip_high: {group_state['pip_high']} and row high: {row['high']}")

                # Store current IP values for next row's pip before updating
                next_pip_high = group_state["ip_high"]
                print(f"Stored next_pip_high: {next_pip_high} from current ip_high")

                # Update IP values with new calculations
                group_state["ip_high"] = new_ip_high
                print(f"Updated ip_high to: {group_state['ip_high']}")

                # Add stored state values to row
                processed_row.update({
                    # Prior Child values
                    "prior_open": group_state["prior_open"],
                    "prior_high": group_state["prior_high"],
                    "prior_low": group_state["prior_low"],
                    "prior_close": group_state["prior_close"],

                    # Prior Intra-Parent values
                    "pip_open": group_state["pip_open"],
                    "pip_high": group_state["pip_high"],
                    "pip_low": group_state["pip_low"],
                    "pip_close": group_state["pip_close"],

                    # Intra-Parent values
                    "ip_open": group_state["ip_open"],
                    "ip_high": group_state["ip_high"],
                    "ip_low": group_state["ip_low"],
                    "ip_close": group_state["ip_close"],

                    # Prior Parent Comparison values
                   "pippp_high": group_state["pp_high"] if group_state["pip_high"] is None else self.safe_max(group_state["pp_high"], group_state["pip_high"]),
                   "pippp_low": group_state["pp_low"] if group_state["pip_low"] is None else self.safe_min(group_state["pp_low"], group_state["pip_low"]),
                   "pippp_close": group_state["pp_close"] if group_state["pip_close"] is None else group_state["pip_close"],

                    # State tracking
                    "prior_pp_state_dir": group_state["pp_state_dir"],
                    "prior_pp_state_id": group_state["pp_state_id"]
                })

                # Verify row structure before adding
                print(f"\nVerifying processed row {index}")
                print(f"Processed row keys: {processed_row.keys()}")
                if 'pip_high' not in processed_row:
                    print(f"WARNING: pip_high missing from processed row")
                print(f"pip_high in row: {processed_row.get('pip_high', 'MISSING')}")

                output_data.append(processed_row)

                # --- Update state for next row ---

                # Update Prior Child memory with current row values
                group_state["prior_open"] = row["open"]
                group_state["prior_high"] = row["high"]
                group_state["prior_low"] = row["low"]
                group_state["prior_close"] = row["close"]

                # Store previous IP values as PIP for next row
                group_state["pip_high"] = next_pip_high
                print(f"AFTER - pip_high: {group_state['pip_high']}, ip_high: {group_state['ip_high']}")

                # Update state tracking
                group_state["prior_pp_state_dir"] = group_state["pp_state_dir"]
                group_state["prior_pp_state_id"] = group_state["pp_state_id"]

            except Exception as e:
                logging.error(f"Error processing row {index}: {e}")
                print(f"Error details - pip_high: {group_state.get('pip_high')}, ip_high: {group_state.get('ip_high')}")
                raise

        # Before returning DataFrame, verify structure
        print("\nFinal validation:")
        print(f"Total rows processed: {len(output_data)}")
        if output_data:
            print(f"First row keys: {output_data[0].keys()}")
            print(f"Last row keys: {output_data[-1].keys()}")

        result_df = pd.DataFrame(output_data)
        print(f"DataFrame columns: {result_df.columns}")
        print(f"pip_high in DataFrame: {'pip_high' in result_df.columns}")

        return result_df
        #return pd.DataFrame(output_data)

    def process_round_two(self, data):
        """
        Second Round - Range Expansion with Prior Child, Intra-Parent, and Prior Parent Relationships
        """

        # --- Prior Child (PC) Calculations ---

        data["bpb_reu_value"] = data.apply(
            lambda row: self.safe_subtract(row["high"], row["prior_high"]) if self.safe_compare(row["high"], row["prior_high"], 'gt') else 0,
            axis=1
        )
        data["bpb_red_value"] = data.apply(
            lambda row: self.safe_subtract(row["prior_low"], row["low"]) if self.safe_compare(row["low"], row["prior_low"], 'lt') else 0,
            axis=1
        )
        data["bpb_pri_percentr"] = data.apply(
            lambda row: ((row["prior_close"] - row["prior_low"]) / (row["prior_high"] - row["prior_low"]))
            if (row["prior_high"] - row["prior_low"]) != 0 and None not in (row["prior_close"], row["prior_low"], row["prior_high"]) else 0,
            axis=1
        )
        data["bpb_ce_percent"] = data.apply(
            lambda row: None if pd.isna(row["bpb_pri_percentr"]) else
                (1 - row["bpb_pri_percentr"] if row["bpb_pri_percentr"] >= 0.5 else row["bpb_pri_percentr"]),
            axis=1
        )
        data["bpb_epc"] = data["bpb_ce_percent"].apply(
        lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["bpb_epc_dir"] = np.where(
            data["sequence"] == 1,
            None,
            np.where(data["bpb_pri_percentr"] >= 0.5, "U", "D")
        )
        data["bpb_e1_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] >= 0.5 else row["bpb_red_value"],
            axis=1
        )
        data["bpb_e2_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] < 0.5 else row["bpb_red_value"],
            axis=1
        )
        data["bpb_red_flag"] = data["bpb_red_value"] > 0
        data["bpb_reu_flag"] = data["bpb_reu_value"] > 0
        data["bpb_re_flag"] = data["bpb_reu_flag"] | data["bpb_red_flag"]
        data["bpb_twoway_flag"] = data["bpb_reu_flag"] & data["bpb_red_flag"]

        # --- Intra-Parent (IP) Calculations ---
        data["ip_reu_value"] = data.apply(
            lambda row: self.safe_subtract(row["high"], row["pip_high"]) if self.safe_compare(row["high"], row["pip_high"], 'gt') else 0,
            axis=1
        )
        data["ip_red_value"] = data.apply(
            lambda row: self.safe_subtract(row["pip_low"], row["low"]) if self.safe_compare(row["low"], row["pip_low"], 'lt') else 0,
            axis=1
        )
        data["ip_pri_percentr"] = data.apply(
            lambda row: ((row["pip_close"] - row["pip_low"]) / (row["pip_high"] - row["pip_low"]))
            if (row["pip_high"] - row["pip_low"]) != 0 else 0,
            axis=1
        )
        data["ip_ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["ip_pri_percentr"] if row["ip_pri_percentr"] >= 0.5 else row["ip_pri_percentr"]),
            axis=1
        )
        data["ip_epc"] = data["ip_ce_percent"].apply(
            lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["ip_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["ip_pri_percentr"] >= 0.5 else "D"),
            axis=1
        )
        data["ip_e1_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] >= 0.5 else row["ip_red_value"],
            axis=1
        )
        data["ip_e2_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] < 0.5 else row["ip_red_value"],
            axis=1
        )
        data["ip_reu_flag"] = data["ip_reu_value"] > 0
        data["ip_red_flag"] = data["ip_red_value"] > 0
        data["ip_re_flag"] = data["ip_reu_flag"] | data["ip_red_flag"]
        data["ip_twoway_flag"] = data["ip_reu_flag"] & data["ip_red_flag"]

        # --- Prior Parent (PP) Calculations ---
        data["pp_reu_value"] = data.apply(
            lambda row: self.safe_subtract(row["high"], row["pippp_high"]) if self.safe_compare(row["high"], row["pippp_high"], 'gt') else 0,
            axis=1
        )
        data["pp_red_value"] = data.apply(
            lambda row: self.safe_subtract(row["pippp_low"], row["low"]) if self.safe_compare(row["low"], row["pippp_low"], 'lt') else 0,
            axis=1
        )
        data["pp_ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["pp_pri_percentr"] if row["pp_pri_percentr"] >= 0.5 else row["pp_pri_percentr"]),
            axis=1
        )
        data["pp_epc"] = data["pp_ce_percent"].apply(
            lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["pp_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["pp_pri_percentr"] >= 0.5 else "D"),
            axis=1
        )
        data["pp_e1_value"] = data.apply(
            lambda row: row["pp_reu_value"] if row["pp_pri_percentr"] >= 0.5 else row["pp_red_value"],
            axis=1
        )
        data["pp_e2_value"] = data.apply(
            lambda row: row["pp_reu_value"] if row["pp_pri_percentr"] < 0.5 else row["pp_red_value"],
            axis=1
        )
        data["pp_reu_flag"] = data["pp_reu_value"] > 0
        data["pp_red_flag"] = data["pp_red_value"] > 0
        data["pp_re_flag"] = data["pp_reu_flag"] | data["pp_red_flag"]
        data["pp_twoway_flag"] = data["pp_reu_flag"] & data["pp_red_flag"]

        return data

    def process_round_three_ip(self, data):
        """Third Round - RPC States"""
        state_tracker = {
            "prior_ip_state_dir": None,
            "prior_ip_last_dir": None,
            "prior_ip_rpc_total": 0,
        }

        for index, row in data.iterrows():
            try:
                sequence = row["sequence"]
                ip_last_dir = row["ip_last_dir"]
                ip_twoway_flag = row["ip_twoway_flag"]

                if sequence == 1:
                    ip_state_dir = "N"
                    ip_rpc_from_row = 0
                    ip_rpc_total = 0
                    ip_state_id = f"{row['parent_lookup']}_{sequence}_N"
                else:
                    # Calculate gel_dir_state
                    ip_dir_state = ip_last_dir if ip_last_dir != "N" else state_tracker["prior_ip_state_dir"]

                    # Calculate gel_rpc
                    if sequence == 2 and ip_twoway_flag:
                        ip_rpc_this_row = 2
                    elif ip_last_dir == "N":
                        ip_rpc_this_row = 0
                    elif ip_last_dir == state_tracker["prior_ip_last_dir"] and ip_twoway_flag:
                        ip_rpc_this_row = 2
                    elif ip_last_dir != state_tracker["prior_ip_state_dir"] and ip_last_dir != "N":
                        ip_rpc_this_row = 1
                    else:
                        ip_rpc_this_row = 0

                    # Calculate ip_rpc_total safely
                    ip_rpc_total = state_tracker["prior_ip_rpc_total"] + ip_rpc_this_row if state_tracker["prior_ip_rpc_total"] is not None else ip_rpc_this_row

                    # Generate ip_state_id if new rpc occurs
                    ip_state_id = f"{row['parent_lookup']}_{sequence}_{ip_last_dir}" if ip_rpc_this_row == 1 else state_tracker["prior_ip_state_dir"]

                # Update DataFrame
                data.loc[index, "ip_state_dir"] = ip_state_dir
                data.loc[index, "ip_rpc_this_row"] = ip_rpc_this_row
                data.loc[index, "ip_rpc_total"] = ip_rpc_total
                data.loc[index, "ip_state_id"] = ip_state_id

                # Update state tracker
                state_tracker.update({
                    "prior_ip_state_dir": ip_state_dir,
                    "prior_ip_last_dir": ip_last_dir,
                    "prior_ip_rpc_total": ip_rpc_total,
                })

            except Exception as e:
                logging.error(f"Error processing row {index} in round three ip: {e}")
                print(f"Error details - state_tracker: {state_tracker}")
                raise

        return data

    def process_round_three_pp(self, data):
        """Third Round - RPC States for Child/Prior Parent (PP)"""
        state_tracker = {
            "prior_pp_state_dir": None,
            "prior_pp_last_dir": None,
            "prior_pp_rpc_total": 0,
        }

        for index, row in data.iterrows():
            sequence = row["sequence"]
            pp_last_dir = row["pp_last_dir"]
            pp_twoway_flag = row["pp_twoway_flag"]

            if sequence == 1:
                pp_state_dir = "N"
                pp_rpc_this_row = 0  # Represents RPC state change for this specific row
                pp_rpc_total = 0
                pp_state_id = f"{row['parent_lookup']}_{sequence}_N"
            else:
                # Calculate pp_dir_state
                pp_dir_state = pp_last_dir if pp_last_dir != "N" else state_tracker["prior_pp_state_dir"]

                # Calculate pp_rpc_this_row
                if sequence == 2 and pp_twoway_flag:
                    pp_rpc_this_row = 2
                elif pp_last_dir == "N":
                    pp_rpc_this_row = 0
                elif pp_last_dir == state_tracker["prior_pp_last_dir"] and pp_twoway_flag:
                    pp_rpc_this_row = 2
                elif pp_last_dir != state_tracker["prior_pp_state_dir"] and pp_last_dir != "N":
                    pp_rpc_this_row = 1
                else:
                    pp_rpc_this_row = 0

                # Calculate pp_rpc_total
                pp_rpc_total = state_tracker["prior_pp_rpc_total"] + pp_rpc_this_row

                # Generate pp_state_id if new rpc occurs
                if pp_rpc_this_row == 1:
                    pp_state_id = f"{row['parent_lookup']}_{sequence}_{pp_last_dir}"
                else:
                    pp_state_id = state_tracker["prior_pp_state_dir"]

            # Update state tracker
            state_tracker.update({
                "prior_pp_state_dir": pp_state_dir,
                "prior_pp_last_dir": pp_last_dir,
                "prior_pp_rpc_total": pp_rpc_total
            })

            # Update DataFrame
            data.at[index, "pp_state_dir"] = pp_state_dir
            data.at[index, "pp_rpc_this_row"] = pp_rpc_this_row
            data.at[index, "pp_rpc_total"] = pp_rpc_total
            data.at[index, "pp_state_id"] = pp_state_id

        return data

    def generate_summary(self, data, ticker):
        """Generate summary data for parent periods"""
        summary_data = []

        # Capture timestamp once for consistency
        current_date = datetime.now().date()
        current_time = datetime.now().time()

        for parent, group in data.groupby("parent"):
            if group.empty:
                continue  # Skip empty groups to avoid errors

            lookup_date = group["parent"].iloc[0]
            duration = len(group)

            parent_high = group["gel_h"].max()
            parent_low = group["gel_l"].min()

            bar_of_h = group.loc[group["gel_h"].idxmax(), "sequence"]
            bar_of_l = group.loc[group["gel_l"].idxmin(), "sequence"]

            range_max = group["range"].max()
            range_avg = group["range"].mean()

            A1 = min(bar_of_h, bar_of_l) if bar_of_h and bar_of_l else None
            A2 = max(bar_of_h, bar_of_l) if bar_of_h and bar_of_l else None

            # --- Prior Child (PC) Summary ---
            bpb_reu_count = group["bpb_reu_flag"].sum()
            bpb_red_count = group["bpb_red_flag"].sum()
            bpb_reu_max = group["bpb_reu_value"].max()
            bpb_red_max = group["bpb_red_value"].max()
            bpb_re_count = group["bpb_re_flag"].sum()
            bpb_no_re_count = duration - group["bpb_re_flag"].sum()
            bpb_twoway_count = group["bpb_twoway_flag"].sum()

            # --- Intra-Parent (IP) Summary ---
            ip_reu_count = group["ip_reu_flag"].sum()
            ip_red_count = group["ip_red_flag"].sum()
            ip_reu_first = group.loc[group["ip_reu_flag"], "sequence"].min() if ip_reu_count > 0 else None
            ip_reu_last = group.loc[group["ip_reu_flag"], "sequence"].max() if ip_reu_count > 0 else None
            ip_red_first = group.loc[group["ip_red_flag"], "sequence"].min() if ip_red_count > 0 else None
            ip_red_last = group.loc[group["ip_red_flag"], "sequence"].max() if ip_red_count > 0 else None
            ip_rpc_total = group["ip_rpc_total"].sum()

            # --- Prior Parent (PP) Summary ---
            pp_reu_count = group["pp_reu_flag"].sum()
            pp_red_count = group["pp_red_flag"].sum()
            pp_reu_first = group.loc[group["pp_reu_flag"] == True, "sequence"].min() if pp_reu_count > 0 else None
            pp_reu_last = group.loc[group["pp_reu_flag"] == True, "sequence"].max() if pp_reu_count > 0 else None
            pp_red_first = group.loc[group["pp_red_flag"] == True, "sequence"].min() if pp_red_count > 0 else None
            pp_red_last = group.loc[group["pp_red_flag"] == True, "sequence"].max() if pp_red_count > 0 else None
            pp_rpc_total = group["pp_rpc_this_row"].sum()


            summary_data.append({
                "ticker": ticker,
                "parent": lookup_date,
                "duration": duration,
                "child_period": self.child_period,
                "parent_period": self.parent_period,
                "range_max": range_max,
                "range_avg": range_avg,
                "parent_high": parent_high,
                "parent_low": parent_low,
                "bar_of_h": bar_of_h,
                "bar_of_l": bar_of_l,
                "A1": A1,
                "A2": A2,

                # PC Summary
                "bpb_reu_count": bpb_reu_count,
                "bpb_red_count": bpb_red_count,
                "bpb_reu_max": bpb_reu_max,
                "bpb_red_max": bpb_red_max,
                "bpb_re_count": bpb_re_count,
                "bpb_no_re_count": bpb_no_re_count,
                "bpb_twoway_count": bpb_twoway_count,

                #ip summary
                "ip_reu_count": ip_reu_count,
                "ip_red_count": ip_red_count,
                "ip_reu_first": ip_reu_first,
                "ip_reu_last": ip_reu_last,
                "ip_red_first": ip_red_first,
                "ip_red_last": ip_red_last,
                "ip_rpc_total": ip_rpc_total,

                # PP Sumamry
                "pp_reu_count": pp_reu_count,
                "pp_red_count": pp_red_count,
                "pp_reu_first": pp_reu_first,
                "pp_reu_last": pp_reu_last,
                "pp_red_first": pp_red_first,
                "pp_red_last": pp_red_last,
                "pp_rpc_count": ip_rpc_total,

                # metadata
                "create_date": datetime.now().date(),
                "create_time": datetime.now().time(),
                "jobname": self.jobname
            })

        return pd.DataFrame(summary_data)

    def export_test_file(self, data, ticker):
        """Export all fields after round three processing to a test file"""
        # Assign ticker to the data
        # ✅ Add this to verify the export process
        print(f"Attempting to export test file for {ticker}")

        # ⚠️ Check if DataFrame is empty
        if data.empty:
            print("⚠️ Data is empty, nothing to export.")
            return data  # Stop here if data is empty
        else:
            print(f"Data has {len(data)} rows. Proceeding with export.")

        data['ticker'] = ticker

        # Updated field list to reflect the latest spec
        columns = [
            # Original input fields
            'ticker', 'date', 'parent_lookup', 'sequence', 'open', 'high', 'low', 'close', 'range',
            'chg_h', 'chg_l', 'chg_h_percent', 'chg_l_percent',

            # Prior Child (PC) Fields
            'bpb_reu_value', 'bpb_red_value', 'bpb_reu_flag', 'bpb_red_flag',
            'bpb_pri_percentr', 'bpb_ce_percent', 'bpb_epc', 'bpb_epc_dir',
            'bpb_e1_value', 'bpb_e2_value', 'bpb_re_flag', 'bpb_twoway_flag',

            # Intra-Parent (IP) Fields
            'ip_open', 'ip_high', 'ip_low', 'ip_close',
            'ip_reu_value', 'ip_red_value', 'ip_reu_flag', 'ip_red_flag',
            'ip_pri_percentr', 'ip_ce_percent', 'ip_epc', 'ip_epc_dir',
            'ip_e1_value', 'ip_e2_value', 'ip_re_flag', 'ip_twoway_flag',
            'ip_state_dir', 'ip_rpc_this_row', 'ip_rpc_total', 'ip_state_id',

            # Prior Parent (PP) Fields
            'pp_open', 'pp_high', 'pp_low', 'pp_close',
            'pp_reu_value', 'pp_red_value', 'pp_reu_flag', 'pp_red_flag',
            'pp_pri_percentr', 'pp_ce_percent', 'pp_epc', 'pp_epc_dir',
            'pp_e1_value', 'pp_e2_value', 'pp_re_flag', 'pp_twoway_flag',
            'pp_state_dir', 'pp_rpc_this_row', 'pp_rpc_total', 'pp_state_id'
        ]

        # ✅ Ensure test directory exists
        if not os.path.exists(self.test_dir):
            try:
                os.makedirs(self.test_dir, exist_ok=True)
                print(f"📂 Created missing test directory: {self.test_dir}")
            except Exception as e:
                print(f"❌ Failed to create test directory {self.test_dir}: {str(e)}")
                return data

       # ✅ Check if the directory is writable
        if not os.access(self.test_dir, os.W_OK):
            print(f"❌ No write access to: {self.test_dir}")
            return data

        # ✅ Ensure all columns exist in the DataFrame
        missing_columns = [col for col in columns if col not in data.columns]
        if missing_columns:
            print(f"⚠️ Missing columns in data, cannot export: {missing_columns}")
            return data

        # Create filename with timestamp for uniqueness
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = os.path.join(self.test_dir, f"{ticker}_test_{timestamp}.csv")

        # Export to CSV with error handling
        try:
            data.to_csv(output_file, columns=columns, index=False)
            print(f"Exported test file to: {output_file}")
        except Exception as e:
            print(f"Failed to export test file for {ticker}: {str(e)}")

        return data

    def process_file(self, filename):
        """Process a single input file through all rounds."""
        try:
            # --- Validate Filename Format ---
            if not filename.endswith(f"_{self.child_period}.csv"):
                raise ValueError(f"Invalid filename format: {filename}. Expected format: {{TICKER}}_{self.child_period}.csv")

            # --- Extract Ticker ---
            ticker = filename.split('_')[0]

            logging.info(f"📥 Reading file: {filename}")

            print(f"Reading file: {filename}")
            # --- Read Input File ---
            input_path = os.path.join(self.input_dir, filename)
            try:
                data = pd.read_csv(input_path)
                logging.info(f"✅ Successfully loaded input file: {input_path}")
            except FileNotFoundError:
                logging.error(f"❌ File not found: {input_path}")
                raise FileNotFoundError(f"Input file not found: {input_path}")
            except pd.errors.EmptyDataError:
                logging.error(f"❗ File is empty: {input_path}")
                raise ValueError(f"Input file is empty: {input_path}")
            except pd.errors.ParserError as e:
                logging.error(f"❗ Error parsing file {input_path}: {e}")
                raise ValueError(f"Error parsing file: {input_path}")
            except Exception as e:
                logging.error(f"❗ Unexpected error reading file {input_path}: {e}")
                raise

            # --- Validate Input Data ---
            self.validate_input_data(data)
            print("✅ Data validation passed")

            # --- Add Required Metadata Columns ---
            data['child_period'] = self.child_period
            data['parent_period'] = self.parent_period
            data['jobname'] = self.jobname

            # --- Process Through All Rounds ---
            data = self.process_round_one(data)
            print("✅ Completed Round 1")
            print(f"🔎 After Round 1 - Data shape: {data.shape}")
            data = self.process_round_two(data)
            print("✅ Completed Round 2")
            print(f"🔎 After Round 2 - Data shape: {data.shape}")
            data = self.process_round_three(data)
            print("✅ Completed Round 3")
            print(f"🔎 After Round 3 - Data shape: {data.shape}")

            # --- Generate Summary ---
            self.validate_input_data(data)
            logging.info("✅ Data validation passed")

            # --- Add Required Metadata Columns ---
            data['child_period'] = self.child_period
            data['parent_period'] = self.parent_period
            data['jobname'] = self.jobname

            # --- Process Through All Rounds ---
            data = self.process_round_one(data)
            logging.info("✅ Completed Round 1")

            data = self.process_round_two(data)
            logging.info("✅ Completed Round 2")

            data = self.process_round_three(data)
            logging.info("✅ Completed Round 3")

            # --- Export Test File ---
            self.export_test_file(data, ticker)
            logging.info("✅ Exported test file")

            # --- Generate Summary ---
            summary_df = self.generate_summary(data, ticker)
            logging.info("✅ Generated summary data")

            # --- Export Test File ---
            self.export_test_file(data, ticker)
            logging.info("✅ Exported test file")

            # --- Export Child Data ---
            child_filename = f"{ticker}_{self.child_period}_processed.csv"
            child_path = os.path.join(self.output_child_dir, child_filename)

            # Check write permission
            if not os.access(self.output_child_dir, os.W_OK):
                logging.error(f"❌ No write access to: {self.output_child_dir}")
                return None, None

            data.to_csv(child_path, columns=self.export_columns, index=False)
            logging.info(f"✅ Exported child data to: {child_path}")

            # --- Export Summary Data ---
            summary_filename = f"{ticker}_{self.parent_period}_summary.csv"
            summary_path = os.path.join(self.output_parent_dir, summary_filename)

            if not os.access(self.output_parent_dir, os.W_OK):
                logging.error(f"❌ No write access to: {self.output_parent_dir}")
                return None, None

            summary_df.to_csv(summary_path, index=False)
            logging.info(f"✅ Exported parent summary to: {summary_path}")

            return data, summary_df

        except Exception as e:
            logging.error(f"❗ Error processing {filename}: {str(e)}")
            return None, None

    def process_all_files(self):
        """Process all eligible files in the input directory with progress tracking and robust error handling."""

        # --- Validate Input Directory ---
        if not os.path.exists(self.input_dir):
            raise FileNotFoundError(f"❌ Input directory {self.input_dir} does not exist.")

        # --- Check Write Permissions for Output Directories ---
        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            if not os.access(directory, os.W_OK):
                raise PermissionError(f"❌ No write access to: {directory}")

        results = {}
        processed_files = 0
        errors = []

        # --- List Eligible Files ---
        eligible_files = [f for f in os.listdir(self.input_dir) if f.endswith(f"_{self.child_period}.csv")]
        print(f"📂 Eligible Files: {eligible_files}")  # Debug print

        if not eligible_files:
            logging.warning(f"⚠️ No eligible files found with pattern *_{self.child_period}.csv")
            return results

        logging.info(f"📈 Found {len(eligible_files)} files to process.\n")

        # --- Track Total Processing Time ---
        total_start_time = time.time()

        # --- Process Each File with Progress Bar ---
        for idx, filename in enumerate(tqdm(eligible_files, desc="🚀 Processing Files", unit="file"), start=1):
            file_start_time = time.time()
            try:
                logging.info(f"\n📄 [{idx}/{len(eligible_files)}] Processing {filename}...")

                # --- Process the file ---
                data, summary_df = self.process_file(filename)

                if data is None or data.empty:
                    logging.warning(f"⚠️ Skipping {filename}: No data to process.")
                    continue

                # --- Store Results ---
                results[filename] = {
                    'status': 'success',
                    'data': data,
                    'summary': summary_df,
                    'error': None,
                    'processing_time_sec': round(time.time() - file_start_time, 2)
                }

                processed_files += 1
                logging.info(f"✅ Successfully processed {filename} in {results[filename]['processing_time_sec']} seconds.")

            except Exception as e:
                error_msg = f"❗ Error processing {filename}: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
                results[filename] = {
                    'status': 'error',
                    'data': None,
                    'summary': None,
                    'error': str(e)
                }

        # --- Final Processing Summary ---
        total_elapsed_time = round(time.time() - total_start_time, 2)
        logging.info(f"\n🎉 Processing Complete: {processed_files}/{len(eligible_files)} files successfully processed in {total_elapsed_time} seconds.")

        if errors:
            logging.warning("\n⚠️ Errors encountered during processing:")
            for error in errors:
                logging.warning(f"- {error}")

        return results

# Script entry point: runs only when this file is executed directly (which
# includes running it as a notebook cell), not when it is imported. Keep this
# block at the end of the file so GelSetProcessor is fully defined before use.
if __name__ == "__main__":
    print("🚀 Starting GelSetProcessor...")

    # Build a processor using the constructor's default directories and periods
    processor = GelSetProcessor()

    # Process every eligible input file; `results` maps filename -> outcome dict
    results = processor.process_all_files()

    logging.info("✅ Processing complete!")

🚀 Starting GelSetProcessor...
📁 Checked/Created directory: /content/output_child
📁 Checked/Created directory: /content/output_parent
📁 Checked/Created directory: /content/test
📂 Eligible Files: ['BACC_D.csv']


🚀 Processing Files:   0%|          | 0/1 [00:00<?, ?file/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Verifying processed row 540
Processed row keys: dict_keys(['date', 'open', 'high', 'low', 'close', 'parent_lookup', 'sequence', 'pph', 'ppl', 'ppc', 'volume', 'child_period', 'parent_period', 'jobname', 'prior_open', 'prior_high', 'prior_low', 'prior_close', 'pip_open', 'pip_high', 'pip_low', 'pip_close', 'ip_open', 'ip_high', 'ip_low', 'ip_close', 'pippp_high', 'pippp_low', 'pippp_close', 'prior_pp_state_dir', 'prior_pp_state_id'])
pip_high in row: 9.40841
AFTER - pip_high: 9.39352, ip_high: 9.40841

Processing row 541
BEFORE - pip_high: 9.39352, ip_high: 9.40841, row high: 9.18512
Calculated new_ip_high: 9.39352 from pip_high: 9.39352 and row high: 9.18512
Stored next_pip_high: 9.40841 from current ip_high
Updated ip_high to: 9.39352

Verifying processed row 541
Processed row keys: dict_keys(['date', 'open', 'high', 'low', 'close', 'parent_lookup', 'sequence', 'pph', 'ppl', 'ppc', 'volume', 'child_period', 'parent_perio

ERROR:root:❗ Error processing BACC_D.csv: unsupported operand type(s) for -: 'float' and 'NoneType'
🚀 Processing Files: 100%|██████████| 1/1 [00:03<00:00,  3.57s/file]
