<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/new20250113.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import sys
import pandas as pd
import numpy as np
import math
import time
import logging
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any
from tqdm import tqdm

# Logging setup: every record from DEBUG upwards is written to stdout,
# prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.DEBUG,
)

# Module-level configuration constants.
VALID_CHILD_PERIODS = ['D', 'W', 'M', 'Q']
VALID_PARENT_PERIODS = ['W', 'M', 'Q', 'Y']
DEFAULT_JOBNAME = 'gelset_20250107'

class GelSetProcessor:
    def __init__(self,
                 input_dir="/content/input",
                 output_child_dir="/content/output_child",
                 output_parent_dir="/content/output_parent",
                 test_dir="/content/test",
                 child_period="D",
                 parent_period="M",
                 jobname="gelset_20250107"):
        self.input_dir = input_dir
        self.output_child_dir = output_child_dir
        self.output_parent_dir = output_parent_dir
        self.test_dir = test_dir
        self.child_period = child_period
        self.parent_period = parent_period
        self.jobname = jobname

        print("✅ GelSetProcessor initialized!")
        print(f"📂 Input Directory: {self.input_dir}")
        print(f"📂 Output Child Directory: {self.output_child_dir}")
        print(f"📂 Output Parent Directory: {self.output_parent_dir}")
        print(f"📂 Test Directory: {self.test_dir}")

        self.export_columns = [
            # Input fields
            'ticker', 'date', 'parent_lookup', 'sequence', 'open', 'high', 'low', 'close', 'volume',

            # Prior Child (PC) fields
            'bpb_reu_value', 'bpb_red_value', 'bpb_reu_flag', 'bpb_red_flag',
            'bpb_re_flag', 'bpb_twoway_flag', 'bpb_rpc',
            'bpb_pri_percentr', 'bpb_ce_percent', 'bpb_ce_value',
            'bpb_epc', 'bpb_epc_dir', 'bpb_epc_hp_flag',
             'bpb_e1_value', 'bpb_e2_value', 'bpb_e1_flag', 'bpb_e2_flag',

             # Intra-Parent (IP) fields
            'ip_open', 'ip_high', 'ip_low', 'ip_close',
            'ip_reu_value', 'ip_red_value', 'ip_reu_flag', 'ip_red_flag',
            'ip_re_flag', 'ip_twoway_flag', 'ip_rpc',
            'ip_pri_percentr', 'ip_ce_percent',
            'ip_epc', 'ip_epc_dir', 'ip_epc_hp_flag',
            'ip_e1_value', 'ip_e2_value', 'ip_e1_flag', 'ip_e2_flag',

            # Prior Parent (PP) fields
            'pp_open', 'pp_high', 'pp_low', 'pp_close',
            'pp_reu_value', 'pp_red_value', 'pp_reu_flag', 'pp_red_flag',
            'pp_re_flag', 'pp_twoway_flag', 'pp_rpc',
            'pp_pri_percentr', 'pp_ce_percent',
            'pp_epc', 'pp_epc_dir', 'pp_epc_hp_flag',
            'pp_e1_value', 'pp_e2_value', 'pp_e1_flag', 'pp_e2_flag',
        ]

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            os.makedirs(directory, exist_ok=True)
            print(f"📁 Checked/Created directory: {directory}")

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            try:
                os.makedirs(directory, exist_ok=True)
                if not os.access(directory, os.W_OK):
                    raise PermissionError(f"❌ No write access to: {directory}")
                print(f"✅ Directory ready: {directory}")
            except PermissionError as pe:
                logging.error(f"❌ Permission denied for directory: {directory} - {pe}")
                sys.exit(1)
            except Exception as e:
                logging.error(f"❗ Error creating directory {directory}: {e}")
                sys.exit(1)

        # Log directory paths
        logging.info(f"📂 Input Directory: {self.input_dir}")
        logging.info(f"📂 Output Child Directory: {self.output_child_dir}")
        logging.info(f"📂 Output Parent Directory: {self.output_parent_dir}")
        logging.info(f"📂 Test Directory: {self.test_dir}")

        # Ensure directories exist
        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            os.makedirs(directory, exist_ok=True)
            logging.info(f"✅ Verified or created directory: {directory}")

        for directory in [self.output_child_dir, self.output_parent_dir, self.test_dir]:
            if not os.access(directory, os.W_OK):
                raise PermissionError(f"❌ No write access to: {directory}")

        # All validation for input data--maybe ensure parsed child_period matches setting.
    def validate_input_data(self, df):
        """Validate input data according to spec requirements"""
        required_columns = ['open', 'high', 'low', 'close', 'pph', 'ppl', 'ppc']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in input data: {missing_cols}")
        if df.empty:
            logging.error("❗ Input data is empty. Cannot proceed.")
            raise ValueError("Input data is empty.")

        required_columns = ['open', 'high', 'low', 'close', 'pph', 'ppl', 'ppc']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            logging.error(f"❌ Missing required columns: {missing_cols}")
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero"),

            # Prior Parent Fields Validation
            (df['pph'] > 0, "prior parent high must be greater than 0"),
            (df['ppl'] > 0, "prior parent low must be greater than 0"),
            (df['ppc'] > 0, "close must be greater than 0"),
            (df['ppl'] <= df['ppc'], "low must be less than or equal to close"),
            (df['ppc'] <= df['pph'], "close must be less than or equal to high"),
        ]

        for condition, message in conditions:
            if not condition.all():
                logging.error(f"❌ Data validation failed: {message}")
                raise ValueError(f"Data validation failed: {message}")

        logging.info("✅ Input data passed all validation checks.")
        return True

    #spacer
    def process_round_one(self, data):
        """
        First Round - Establish intra-group values and prior parent expansions.
        Implements the relationship between intra-parent (IP) values, prior parent (PP) bounds, and prior child (PC) values.
        """
        output_data = []

        # Initialize group state tracker with default values
        group_state = {
            "current_group": None,

            # Intra-Parent (IP) Fields
            "ip_open": None,
            "ip_high": None,
            "ip_low": None,
            "ip_close": None,
            "prior_ip_close": None,
            "prior_ip_high": None,
            "prior_ip_low": None,
            "prior_ip_state_dir": None,
            "prior_ip_state_id": None,

            # Prior Child (PC) Fields
            "prior_open": None,
            "prior_high": None,
            "prior_low": None,
            "prior_close": None,

            # Prior Parent (PP) Fields
            "prior_pp_state_dir": None,
            "prior_pp_state_id": None,

            # Range Expansion Tracking
            #"ip_reu_value": 0,
            #"ip_red_value": 0
        }

        for index, row in data.iterrows():
            try:
                # --- Handle New Parent Group Initialization ---
                if row["parent_lookup"] != group_state["current_group"]:
                    group_state["current_group"] = row["parent_lookup"]

                    # Initialize Intra-Parent
                    group_state["ip_open"] = row["open"]
                    group_state["ip_high"] = row["high"]
                    group_state["ip_low"] = row["low"]
                    group_state["prior_ip_close"] = None
                    group_state["prior_ip_high"] = row["pph"]
                    group_state["prior_ip_low"] = row["ppl"]
                    group_state["prior_ip_state_dir"] = None
                    group_state["prior_ip_state_id"] = None

                    # Initialize Prior Child
                    group_state["prior_open"] = None
                    group_state["prior_high"] = None
                    group_state["prior_low"] = None
                    group_state["prior_close"] = None

                    # Initialize Prior Parent
                    group_state["prior_pp_state_dir"] = None
                    group_state["prior_pp_state_id"] = None

                    logging.info(f"🔄 Initialized new parent group: {group_state['current_group']}")

                # --- Safe Calculation of bpb_pri_percentr ---
                if all(v is not None for v in [group_state["prior_close"], group_state["prior_low"], group_state["prior_high"]]):
                    denominator = group_state["prior_high"] - group_state["prior_low"]
                    if denominator != 0:
                        bpb_pri_percentr = (group_state["prior_close"] - group_state["prior_low"]) / denominator
                    else:
                        logging.warning(f"⚠️ Zero division error at index {index} for prior_high - prior_low.")
                        bpb_pri_percentr = 0
                else:
                    logging.warning(f"⚠️ Missing prior values at index {index}.")
                    bpb_pri_percentr = 0

                # --- Calculate ip_pri_percentr ---
                if all(v is not None for v in [group_state["ip_close"], group_state["ip_low"], group_state["ip_high"]]):
                    denominator = group_state["ip_high"] - group_state["ip_low"]
                    if denominator != 0:
                        ip_pri_percentr = (group_state["ip_close"] - group_state["ip_low"]) / denominator
                    else:
                        logging.warning(f"⚠️ Zero division error at index {index} for ip_high - ip_low.")
                        ip_pri_percentr = 0
                else:
                    logging.warning(f"⚠️ Missing intra-parent values at index {index}.")
                    ip_pri_percentr = 0

                # --- Handle First Row (Sequence == 1) ---
                if row["sequence"] == 1:
                    group_state["prior_open"] = row["open"]
                    group_state["prior_high"] = row["high"]
                    group_state["prior_low"] = row["low"]
                    group_state["prior_close"] = row["close"]
                    logging.info(f"🚩 Initialized first sequence row at index {index}.")

                # --- Calculate Range Expansion ---
                ip_reu_value = max(row["high"] - group_state["ip_high"], 0) if group_state["ip_high"] else 0
                ip_red_value = max(group_state["ip_low"] - row["low"], 0) if group_state["ip_low"] else 0

                # --- Calculate bpb_reu_value and bpb_red_value ---
                bpb_reu_value = max(row["high"] - group_state["prior_high"], 0) if group_state["prior_high"] is not None else 0
                bpb_red_value = max(group_state["prior_low"] - row["low"], 0) if group_state["prior_low"] is not None else 0

                # --- Update State ---
                group_state["ip_high"] = max(group_state["ip_high"], row["high"]) if group_state["ip_high"] else row["high"]
                group_state["ip_low"] = min(group_state["ip_low"], row["low"]) if group_state["ip_low"] else row["low"]
                group_state["ip_close"] = row["close"]

                # --- Processed Row ---
                processed_row = row.to_dict()
                processed_row.update({
                    "ip_open": group_state["ip_open"],
                    "ip_high": group_state["ip_high"],
                    "ip_low": group_state["ip_low"],
                    "ip_close": group_state["ip_close"],
                    "ip_reu_value": ip_reu_value,
                    "ip_red_value": ip_red_value,
                    "bpb_pri_percentr": bpb_pri_percentr,
                    "ip_pri_percentr": bpb_pri_percentr,
                    "bpb_reu_value": bpb_reu_value,
                    "bpb_red_value": bpb_red_value
                })
                output_data.append(processed_row)

                # --- Update Prior Values ---
                group_state["prior_open"] = row["open"]
                group_state["prior_high"] = row["high"]
                group_state["prior_low"] = row["low"]
                group_state["prior_close"] = row["close"]

            except Exception as e:
                logging.error(f"❌ Error processing row at index {index}: {str(e)}")

        logging.info("✅ Completed Round 1 processing.")
        return pd.DataFrame(output_data)

    def process_round_two(self, data):
        """
        Second Round - Range Expansion with Prior Child, Intra-Parent, and Prior Parent Relationships
        """

        # --- Prior Child (PC) Calculations ---
        data["bpb_reu_flag"] = data["bpb_reu_value"] > 0
        data["bpb_red_flag"] = data["bpb_red_value"] > 0
        data["bpb_ce_percent"] = data.apply(
            lambda row: None if pd.isna(row["bpb_pri_percentr"]) else
                (1 - row["bpb_pri_percentr"] if row["bpb_pri_percentr"] >= 0.5 else row["bpb_pri_percentr"]),
            axis=1
        )
        data["bpb_epc"] = data["bpb_ce_percent"].apply(
        lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["bpb_epc_dir"] = np.where(
            data["sequence"] == 1,
            None,
            np.where(data["bpb_pri_percentr"] >= 0.5, "U", "D")
        )
        data["bpb_e1_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] >= 0.5 else row["bpb_red_value"],
            axis=1
        )
        data["bpb_e2_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] < 0.5 else row["bpb_red_value"],
            axis=1
        )
        data["bpb_re_flag"] = data["bpb_reu_flag"] | data["bpb_red_flag"]
        data["bpb_twoway_flag"] = data["bpb_reu_flag"] & data["bpb_red_flag"]

        # --- Intra-Parent (IP) Calculations ---
        data["ip_reu_flag"] = data["ip_reu_value"] > 0
        data["ip_red_flag"] = data["ip_red_value"] > 0

        data["ip_ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["ip_pri_percentr"] if row["ip_pri_percentr"] >= 0.5 else row["ip_pri_percentr"]),
            axis=1
        )
        data["ip_epc"] = data["ip_ce_percent"].apply(
            lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["ip_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["ip_pri_percentr"] >= 0.5 else "D"),
            axis=1
        )
        data["ip_e1_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] >= 0.5 else row["ip_red_value"],
            axis=1
        )
        data["ip_e2_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] < 0.5 else row["ip_red_value"],
            axis=1
        )
        data["ip_re_flag"] = data["ip_reu_flag"] | data["ip_red_flag"]
        data["ip_twoway_flag"] = data["ip_reu_flag"] & data["ip_red_flag"]

        # --- Prior Parent (PP) Calculations ---
        data["pp_reu_flag"] = data["pp_reu_value"] > 0
        data["pp_red_flag"] = data["pp_red_value"] > 0
        data["pp_ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["pp_pri_percentr"] if row["pp_pri_percentr"] >= 0.5 else row["pp_pri_percentr"]),
            axis=1
        )
        data["pp_epc"] = data["pp_ce_percent"].apply(
            lambda x: None if pd.isna(x) else max(1, min(5, math.ceil(x / 0.1)))
        )
        data["pp_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["pp_pri_percentr"] >= 0.5 else "D"),
            axis=1
        )
        data["pp_e1_value"] = data.apply(
            lambda row: row["pp_reu_value"] if row["pp_pri_percentr"] >= 0.5 else row["pp_red_value"],
            axis=1
        )
        data["pp_e2_value"] = data.apply(
            lambda row: row["pp_reu_value"] if row["pp_pri_percentr"] < 0.5 else row["pp_red_value"],
            axis=1
        )
        data["pp_re_flag"] = data["pp_reu_flag"] | data["pp_red_flag"]
        data["pp_twoway_flag"] = data["pp_reu_flag"] & data["pp_red_flag"]

        return data
#ref
    def process_round_three_ip(self, data):
        """Third Round - RPC States"""
        state_tracker = {
            "prior_ip_dir_state": None,
            "prior_ip_last_dir": None,
            "prior_ip_rpc_total": 0,
        }

        for index, row in data.iterrows():
            sequence = row["sequence"]
            ip_last_dir = row["ip_last_dir"]
            ip_twoway_flag = row["ip_twoway_flag"]

            if sequence == 1:
                ip_dir_state = "N"
                ip_rpc_from_row = 0
                ip_rpc_total = 0
                ip_state_id = f"{row['parent_lookup']}_{sequence}_N"
            else:
                # Calculate gel_dir_state
                ip_dir_state = ip_last_dir if ip_last_dir != "N" else state_tracker["prior_ip_dir_state"]

                # Calculate gel_rpc
                if sequence == 2 and ip_twoway_flag:
                    ip_rpc_this_row = 2
                elif ip_last_dir == "N":
                    ip_rpc_this_row = 0
                elif ip_last_dir == state_tracker["prior_ip_last_dir"] and ip_twoway_flag:
                    ip_rpc_this_row = 2
                elif ip_last_dir != state_tracker["prior_ip_dir_state"] and ip_last_dir != "N":
                    ip_rpc_this_row = 1
                else:
                    ip_rpc_this_row = 0

                # Calculate gel_rpc_total
                ip_rpc_total = state_tracker["prior_ip_rpc_total"] + ip_rpc_this_row

                # Generate ip_state_id if new rpc occurs
                if ip_rpc_this_row == 1:
                    ip_state_id = f"{row['parent_lookup']}_{sequence}_{ip_last_dir}"
                else:
                    ip_state_id = state_tracker["prior_ip_dir_state"]

            # Update state tracker
            state_tracker.update({
                "prior_ip_dir_state": ip_dir_state,
                "prior_ip_last_dir": ip_last_dir,
                "prior_ip_rpc_total": ip_rpc_total,
            })

            # Update DataFrame
            data.loc[index, "ip_dir_state"] = ip_dir_state
            data.loc[index, "ip_rpc_this_row"] = ip_rpc_this_row
            data.loc[index, "ip_rpc_total"] = ip_rpc_total
            data.loc[index, "ip_state_id"] = ip_state_id

        return data

    def process_round_three_pp(self, data):
        """Third Round - RPC States for Child/Prior Parent (PP)"""
        state_tracker = {
            "prior_pp_dir_state": None,
            "prior_pp_last_dir": None,
            "prior_pp_rpc_total": 0,
        }

        for index, row in data.iterrows():
            sequence = row["sequence"]
            pp_last_dir = row["pp_last_dir"]
            pp_twoway_flag = row["pp_twoway_flag"]

            if sequence == 1:
                pp_dir_state = "N"
                pp_rpc_this_row = 0  # Represents RPC state change for this specific row
                pp_rpc_total = 0
                pp_state_id = f"{row['parent_lookup']}_{sequence}_N"
            else:
                # Calculate pp_dir_state
                pp_dir_state = pp_last_dir if pp_last_dir != "N" else state_tracker["prior_pp_dir_state"]

                # Calculate pp_rpc_this_row
                if sequence == 2 and pp_twoway_flag:
                    pp_rpc_this_row = 2
                elif pp_last_dir == "N":
                    pp_rpc_this_row = 0
                elif pp_last_dir == state_tracker["prior_pp_last_dir"] and pp_twoway_flag:
                    pp_rpc_this_row = 2
                elif pp_last_dir != state_tracker["prior_pp_dir_state"] and pp_last_dir != "N":
                    pp_rpc_this_row = 1
                else:
                    pp_rpc_this_row = 0

                # Calculate pp_rpc_total
                pp_rpc_total = state_tracker["prior_pp_rpc_total"] + pp_rpc_this_row

                # Generate pp_state_id if new rpc occurs
                if pp_rpc_this_row == 1:
                    pp_state_id = f"{row['parent_lookup']}_{sequence}_{pp_last_dir}"
                else:
                    pp_state_id = state_tracker["prior_pp_dir_state"]

            # Update state tracker
            state_tracker.update({
                "prior_pp_dir_state": pp_dir_state,
                "prior_pp_last_dir": pp_last_dir,
                "prior_pp_rpc_total": pp_rpc_total
            })

            # Update DataFrame
            data.at[index, "pp_dir_state"] = pp_dir_state
            data.at[index, "pp_rpc_this_row"] = pp_rpc_this_row
            data.at[index, "pp_rpc_total"] = pp_rpc_total
            data.at[index, "pp_state_id"] = pp_state_id

        return data

    def generate_summary(self, data, ticker):
        """Generate summary data for parent periods"""
        summary_data = []

        # Capture timestamp once for consistency
        current_date = datetime.now().date()
        current_time = datetime.now().time()

        for parent, group in data.groupby("parent"):
            if group.empty:
                continue  # Skip empty groups to avoid errors

            lookup_date = group["parent"].iloc[0]
            duration = len(group)

            parent_high = group["gel_h"].max()
            parent_low = group["gel_l"].min()

            bar_of_h = group.loc[group["gel_h"].idxmax(), "sequence"]
            bar_of_l = group.loc[group["gel_l"].idxmin(), "sequence"]

            range_max = group["range"].max()
            range_avg = group["range"].mean()

            A1 = min(bar_of_h, bar_of_l) if bar_of_h and bar_of_l else None
            A2 = max(bar_of_h, bar_of_l) if bar_of_h and bar_of_l else None

            # --- Prior Child (PC) Summary ---
            bpb_reu_count = group["bpb_reu_flag"].sum()
            bpb_red_count = group["bpb_red_flag"].sum()
            bpb_reu_max = group["bpb_reu_value"].max()
            bpb_red_max = group["bpb_red_value"].max()
            bpb_re_count = group["bpb_re_flag"].sum()
            bpb_no_re_count = duration - group["bpb_re_flag"].sum()
            bpb_twoway_count = group["bpb_twoway_flag"].sum()

            # --- Intra-Parent (IP) Summary ---
            ip_reu_count = group["ip_reu_flag"].sum()
            ip_red_count = group["ip_red_flag"].sum()
            ip_reu_first = group.loc[group["ip_reu_flag"], "sequence"].min() if ip_reu_count > 0 else None
            ip_reu_last = group.loc[group["ip_reu_flag"], "sequence"].max() if ip_reu_count > 0 else None
            ip_red_first = group.loc[group["ip_red_flag"], "sequence"].min() if ip_red_count > 0 else None
            ip_red_last = group.loc[group["ip_red_flag"], "sequence"].max() if ip_red_count > 0 else None
            ip_rpc_total = group["ip_rpc_total"].sum()

            # --- Prior Parent (PP) Summary ---
            pp_reu_count = group["pp_reu_flag"].sum()
            pp_red_count = group["pp_red_flag"].sum()
            pp_reu_first = group.loc[group["pp_reu_flag"] == True, "sequence"].min() if pp_reu_count > 0 else None
            pp_reu_last = group.loc[group["pp_reu_flag"] == True, "sequence"].max() if pp_reu_count > 0 else None
            pp_red_first = group.loc[group["pp_red_flag"] == True, "sequence"].min() if pp_red_count > 0 else None
            pp_red_last = group.loc[group["pp_red_flag"] == True, "sequence"].max() if pp_red_count > 0 else None
            pp_rpc_total = group["pp_rpc_this_row"].sum()


            summary_data.append({
                "ticker": ticker,
                "parent": lookup_date,
                "duration": duration,
                "child_period": self.child_period,
                "parent_period": self.parent_period,
                "range_max": range_max,
                "range_avg": range_avg,
                "parent_high": parent_high,
                "parent_low": parent_low,
                "bar_of_h": bar_of_h,
                "bar_of_l": bar_of_l,
                "A1": A1,
                "A2": A2,

                # PC Summary
                "bpb_reu_count": bpb_reu_count,
                "bpb_red_count": bpb_red_count,
                "bpb_reu_max": bpb_reu_max,
                "bpb_red_max": bpb_red_max,
                "bpb_re_count": bpb_re_count,
                "bpb_no_re_count": bpb_no_re_count,
                "bpb_twoway_count": bpb_twoway_count,

                #ip summary
                "ip_reu_count": ip_reu_count,
                "ip_red_count": ip_red_count,
                "ip_reu_first": ip_reu_first,
                "ip_reu_last": ip_reu_last,
                "ip_red_first": ip_red_first,
                "ip_red_last": ip_red_last,
                "ip_rpc_total": ip_rpc_total,

                # PP Sumamry
                "pp_reu_count": pp_reu_count,
                "pp_red_count": pp_red_count,
                "pp_reu_first": pp_reu_first,
                "pp_reu_last": pp_reu_last,
                "pp_red_first": pp_red_first,
                "pp_red_last": pp_red_last,
                "pp_rpc_count": ip_rpc_total,

                # metadata
                "create_date": datetime.now().date(),
                "create_time": datetime.now().time(),
                "jobname": self.jobname
            })

        return pd.DataFrame(summary_data)

    def export_test_file(self, data, ticker):
        """Export all fields after round three processing to a test file"""
        # Assign ticker to the data
        # ✅ Add this to verify the export process
        print(f"Attempting to export test file for {ticker}")

        # ⚠️ Check if DataFrame is empty
        if data.empty:
            print("⚠️ Data is empty, nothing to export.")
            return data  # Stop here if data is empty
        else:
            print(f"Data has {len(data)} rows. Proceeding with export.")

        data['ticker'] = ticker

        # Updated field list to reflect the latest spec
        columns = [
            # Original input fields
            'ticker', 'date', 'parent_lookup', 'sequence', 'open', 'high', 'low', 'close', 'range',
            'chg_h', 'chg_l', 'chg_h_percent', 'chg_l_percent',

            # Prior Child (PC) Fields
            'bpb_reu_value', 'bpb_red_value', 'bpb_reu_flag', 'bpb_red_flag',
            'bpb_pri_percentr', 'bpb_ce_percent', 'bpb_epc', 'bpb_epc_dir',
            'bpb_e1_value', 'bpb_e2_value', 'bpb_re_flag', 'bpb_twoway_flag',

            # Intra-Parent (IP) Fields
            'ip_open', 'ip_high', 'ip_low', 'ip_close',
            'ip_reu_value', 'ip_red_value', 'ip_reu_flag', 'ip_red_flag',
            'ip_pri_percentr', 'ip_ce_percent', 'ip_epc', 'ip_epc_dir',
            'ip_e1_value', 'ip_e2_value', 'ip_re_flag', 'ip_twoway_flag',
            'ip_dir_state', 'ip_rpc_this_row', 'ip_rpc_total', 'ip_state_id',

            # Prior Parent (PP) Fields
            'pp_open', 'pp_high', 'pp_low', 'pp_close',
            'pp_reu_value', 'pp_red_value', 'pp_reu_flag', 'pp_red_flag',
            'pp_pri_percentr', 'pp_ce_percent', 'pp_epc', 'pp_epc_dir',
            'pp_e1_value', 'pp_e2_value', 'pp_re_flag', 'pp_twoway_flag',
            'pp_dir_state', 'pp_rpc_this_row', 'pp_rpc_total', 'pp_state_id'
        ]

        # ✅ Ensure test directory exists
       if not os.path.exists(self.test_dir):
            try:
                os.makedirs(self.test_dir, exist_ok=True)
                print(f"📂 Created missing test directory: {self.test_dir}")
            except Exception as e:
                print(f"❌ Failed to create test directory {self.test_dir}: {str(e)}")
                return data

        # ✅ Check if the directory is writable
        if not os.access(self.test_dir, os.W_OK):
            print(f"❌ No write access to: {self.test_dir}")
            return data

        # ✅ Ensure all columns exist in the DataFrame
        missing_columns = [col for col in columns if col not in data.columns]
        if missing_columns:
            print(f"⚠️ Missing columns in data, cannot export: {missing_columns}")
            return data

        # Create filename with timestamp for uniqueness
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_file = os.path.join(self.test_dir, f"{ticker}_test_{timestamp}.csv")

        # Export to CSV with error handling
        try:
            data.to_csv(output_file, columns=columns, index=False)
            print(f"Exported test file to: {output_file}")
        except Exception as e:
            print(f"Failed to export test file for {ticker}: {str(e)}")

        return data
#ref
    def process_file(self, filename):
        """Process a single input file through all rounds."""
        try:
            # --- Validate Filename Format ---
            if not filename.endswith(f"_{self.child_period}.csv"):
                raise ValueError(f"Invalid filename format: {filename}. Expected format: {{TICKER}}_{self.child_period}.csv")

            # --- Extract Ticker ---
            ticker = filename.split('_')[0]

            print(f"Reading file: {filename}")
            # --- Read Input File ---
            input_path = os.path.join(self.input_dir, filename)
            try:
                data = pd.read_csv(input_path)
                logging.info(f"✅ Successfully loaded input file: {input_path}")
            except FileNotFoundError:
                logging.error(f"❌ File not found: {input_path}")
                raise FileNotFoundError(f"Input file not found: {input_path}")
            except pd.errors.EmptyDataError:
                logging.error(f"❗ File is empty: {input_path}")
                raise ValueError(f"Input file is empty: {input_path}")
            except pd.errors.ParserError as e:
                logging.error(f"❗ Error parsing file {input_path}: {e}")
                raise ValueError(f"Error parsing file: {input_path}")
            except Exception as e:
                logging.error(f"❗ Unexpected error reading file {input_path}: {e}")
                raise

            # --- Validate Input Data ---
            self.validate_input_data(data)
            print("✅ Data validation passed")

            # --- Add Required Metadata Columns ---
            data['child_period'] = self.child_period
            data['parent_period'] = self.parent_period
            data['jobname'] = self.jobname

            # --- Process Through All Rounds ---
            data = self.process_round_one(data)
            print("✅ Completed Round 1")
            data = self.process_round_two(data)
            print("✅ Completed Round 2")
            data = self.process_round_three(data)
            print("✅ Completed Round 3")

            # --- Generate Summary ---
            summary_df = self.generate_summary(data, ticker)

            # --- Export Test File ---
            self.export_test_file(data, ticker)
            print("✅ Exported test file")

            # --- Export Child Data ---
            child_filename = f"{ticker}_{self.child_period}_processed.csv"
            child_path = os.path.join(self.output_child_dir, child_filename)
            data.to_csv(child_path, index=False)
            logging.info(f"Exported child data to: {child_path}")
            print(f"✅ Exported child data to: {child_path}")

            # --- Export Summary Data ---
            summary_filename = f"{ticker}_{self.parent_period}_summary.csv"
            summary_path = os.path.join(self.output_parent_dir, summary_filename)
            summary_df.to_csv(summary_path, index=False)
            logging.info(f"Exported parent summary to: {summary_path}")
            print(f"✅ Exported parent summary to: {summary_path}")

            return data, summary_df

        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
        except pd.errors.EmptyDataError:
            logging.error(f"File {filename} is empty.")
        except Exception as e:
            logging.error(f"Error processing {filename}: {str(e)}")
            return None, None
#ref
    def process_all_files(self):
        """Process all eligible files in the input directory with progress tracking and robust error handling."""

        # --- Validate Input Directory ---
        if not os.path.exists(self.input_dir):
            raise FileNotFoundError(f"❌ Input directory {self.input_dir} does not exist.")

        results = {}
        processed_files = 0
        errors = []

        # --- List Eligible Files ---
        eligible_files = [f for f in os.listdir(self.input_dir) if f.endswith(f"_{self.child_period}.csv")]
        print(f"📂 Eligible Files: {eligible_files}")  # Debug print

        if not eligible_files:
            logging.warning(f"⚠️ No eligible files found with pattern *_{self.child_period}.csv")
            return results

        logging.info(f"📈 Found {len(eligible_files)} files to process.\n")

        # --- Process Each File with Progress Bar ---
        for idx, filename in enumerate(tqdm(eligible_files, desc="🚀 Processing Files", unit="file"), start=1):
            start_time = time.time()
            try:
                logging.info(f"\n📄 [{idx}/{len(eligible_files)}] Processing {filename}...")

                # --- Process the file ---
                data, summary_df = self.process_file(filename)

                # --- Store Results ---
                results[filename] = {
                    'status': 'success',
                    'data': data,
                    'summary': summary_df,
                    'error': None,
                    'processing_time_sec': round(time.time() - start_time, 2)
                }

                processed_files += 1
                logging.info(f"✅ Successfully processed {filename} in {results[filename]['processing_time_sec']} seconds.")

            except Exception as e:
                error_msg = f"❗ Error processing {filename}: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
                results[filename] = {
                    'status': 'error',
                    'data': None,
                    'summary': None,
                    'error': str(e)
                }

        # --- Final Processing Summary ---
        logging.info(f"\n🎉 Processing Complete: {processed_files}/{len(eligible_files)} files successfully processed.")

        if errors:
            logging.warning("\n⚠️ Errors encountered during processing:")
            for error in errors:
                logging.warning(f"- {error}")

        return results

# 🔥 Main Execution Block (PLACE THIS AT THE END)
# Script entry point: builds a processor with its default settings and runs
# the whole input directory through it. `processor` and `results` are left
# bound at module level so they stay available after this block has run.
if __name__ == "__main__":
    print("🚀 Starting GelSetProcessor...")

    # Initialize the processor with the constructor defaults
    # (directories, child/parent period and jobname)
    processor = GelSetProcessor()

    # Process every eligible file; `results` is the per-file mapping
    # returned by process_all_files()
    results = processor.process_all_files()

    # NOTE: printed unconditionally -- it signals that the batch finished,
    # not that every file succeeded; inspect `results` for per-file outcomes
    print("✅ Processing complete!")

🚀 Starting GelSetProcessor...
✅ GelSetProcessor initialized!
📂 Input Directory: /content/input
📂 Output Child Directory: /content/output_child
📂 Output Parent Directory: /content/output_parent
📂 Test Directory: /content/test
📁 Checked/Created directory: /content/output_child
📁 Checked/Created directory: /content/output_parent
📁 Checked/Created directory: /content/test
✅ Directory ready: /content/output_child
✅ Directory ready: /content/output_parent
✅ Directory ready: /content/test
📂 Eligible Files: ['BAC_D.csv']




Reading file: BAC_D.csv
✅ Data validation passed




✅ Completed Round 1


ERROR:root:Error processing BAC_D.csv: 'pp_reu_value'
🚀 Processing Files: 100%|██████████| 1/1 [00:04<00:00,  4.40s/file]

✅ Processing complete!



