<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/Untitled58.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import os
import pandas as pd
import numpy as np
import math
import time
import logging
from tqdm import tqdm
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any



class GelSetProcessor:
    def __init__(self, input_dir="/content/input", output_dir="/content/output"):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.export_columns = [
            'ticker', 'date', 'parent_lookup', 'sequence', 'open', 'high', 'low', 'close',
            'prior_high', 'prior_low', 'prior_close',
            'bpb_reu_value', 'bpb_red_value', 'bpb_reu_flag', 'bpb_red_flag', 'bpb_re_flag', 'bpb_twoway_flag',
            'bpb_pri_percentr', 'bpb_ce_percent',
            'bpb_epc', 'bpb_epc_dir', 'bpb_epc_hp_flag',
            'bpb_e1_value', 'bpb_e2_value', 'bpb_e1_flag', 'bpb_e2_flag',

            'ip_open', 'ip_high', 'ip_low', 'ip_close',
            'pip_open', 'pip_high', 'pip_low', 'pip_close',
            'ip_reu_value', 'ip_red_value', 'ip_reu_flag', 'ip_red_flag', 'ip_re_flag', 'ip_twoway_flag',
            'ip_pri_percentr', 'ip_ce_percent',
            'ip_epc', 'ip_epc_dir', 'ip_epc_hp_flag',
            'ip_e1_value', 'ip_e2_value', 'ip_e1_flag', 'ip_e2_flag',

        ]
        os.makedirs(self.output_dir, exist_ok=True)

        for directory in [self.output_dir]:
            os.makedirs(directory, exist_ok=True)

    def validate_input_data(self, df):
        required_columns = ['date', 'open', 'high', 'low', 'close', 'parent_lookup', 'sequence']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero"),

        ]

    def process_input(self, data):
        output_data = []

        # Initialize group state tracker with default values
        group_state = {
            "current_group": None,
            # Prior Intra-Parent Fields
            "intra_ip_open": None,
            "intra_ip_high": None,
            "intra_ip_low": None,
            "intra_ip_close": None,
            "pip_open": None,
            "pip_high": None,
            "pip_low": None,
            "pip_close": None,
        }

        for index, row in data.iterrows():
            if row["parent_lookup"] != group_state["current_group"]:
                group_state["current_group"] = row["parent_lookup"]
                # Initialize Prior Intra-Parent to None for new month
                group_state["intra_ip_open"] = row["open"]
                group_state["intra_ip_high"] = row["high"]
                group_state["intra_ip_low"] = row["low"]
                group_state["intra_ip_low"] = row["close"]
                prior_percentr = None
            else:
                # Update intra-group high and low: END OF ROW HIGH/LOW OF GEL
                group_state["intra_ip_high"] = max(group_state["intra_ip_high"], row["high"])
                group_state["intra_ip_low"] = min(group_state["intra_ip_low"], row["low"])

                group_state["pip_high"] = group_state["intra_ip_high"]
                group_state["pip_low"] = group_state["intra_ip_low"]

            processed_row = row.to_dict()
            processed_row.update({
                # Prior Intra-Parent values
                "ip_high": group_state["intra_ip_high"],
                "ip_low": group_state["intra_ip_low"],
                "pip_high": group_state["pip_high"],
                "pip_low": group_state["pip_low"],
                "pip_close": group_state["pip_close"],
            })

            output_data.append(processed_row)

        return pd.DataFrame(output_data)

    def process_file(self, filename):
        input_path = os.path.join(self.input_dir, filename)
        ticker = filename.split('_')[0]
        data = pd.read_csv(input_path)

        self.validate_input_data(data)

        data['ticker'] = ticker

        # Initialize prior day columns
        data['prior_high'] = data['high'].shift(1)
        data['prior_low'] = data['low'].shift(1)
        data['prior_close'] = data['close'].shift(1)

        data['ip_high'] = data.apply(
            lambda row: row['high'] if row["sequence"] == 1 else None,
            axis=1
        )
        data['ip_low'] = data.apply(
            lambda row: row['low'] if row["sequence"] == 1 else None,
            axis=1
        )
        data['ip_open'] = data['open']
        data['ip_close'] = data['close']

        data['ip_high'] = data.apply(
            lambda row: row['high'] if row["sequence"] == 1 else None,
            axis=1
        )
        data['ip_low'] = data.apply(
            lambda row: row['low'] if row["sequence"] == 1 else None,
            axis=1
        )

                # Calculate pip values based on shifted ip values
        data['pip_open'] = data['ip_open'].shift(1)
        data['pip_high'] = data['ip_high'].shift(1)
        data['pip_low'] = data['ip_low'].shift(1)
        data['pip_close'] = data['ip_close'].shift(1)

        # Update ip_high for non-sequence-1 rows
        data['ip_high'] = data.apply(
            lambda row: row['high'] if row["sequence"] == 1
            else max(row['high'], data.loc[data.index[data.index.get_loc(row.name)-1], 'ip_high']),
            axis=1
        )
        data['ip_low'] = data.apply(
            lambda row: row['low'] if row["sequence"] == 1
            else max(row['low'], data.loc[data.index[data.index.get_loc(row.name)-1], 'ip_low']),
            axis=1
        )


        # Calculate Range Expansions
        data['bpb_reu_value'] = data.apply(
            lambda row: max(row['high'] - row['prior_high'], 0) if  not pd.isna(row['prior_high']) else 0,
            axis=1
        )
        data['bpb_red_value'] = data.apply(
            lambda row: max(row['prior_low'] - row['low'], 0) if not pd.isna(row['prior_low']) else 0,
            axis=1
        )
        # Expansion Flags
        data['bpb_reu_flag'] = data['bpb_reu_value'] > 0
        data['bpb_red_flag'] = data['bpb_red_value'] > 0
        data['bpb_re_flag'] = data['bpb_reu_flag'] | data['bpb_red_flag']
        data['bpb_twoway_flag'] = data['bpb_reu_flag'] & data['bpb_red_flag']

        data["bpb_pri_percentr"] = data.apply(
            lambda row: (row['prior_close'] - row['prior_low']) / (row['prior_high'] - row['prior_low'])
            if pd.notna(row['prior_high']) and (row['prior_high'] - row['prior_low']) != 0
            else 0,
            axis=1
        )
        data["bpb_ce_percent"] = data.apply(
            lambda row: row["bpb_pri_percentr"] if row["bpb_pri_percentr"] < 0.50 else 1 - row["bpb_pri_percentr"],
            axis=1
        )
        data["bpb_epc"] = data["bpb_ce_percent"].apply(
        lambda x: None if pd.isna(x) or not isinstance(x, (int, float)) else max(1, min(5, math.ceil(float(x) / 0.1)))
        )
        data["bpb_epc_dir"] = np.where(data["bpb_pri_percentr"] >= 0.5, "U", "D")
        data["bpb_epc_hp_flag"] = np.where(data["bpb_ce_percent"] <= 0.25, "HP", "LP")
        data["bpb_e1_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] >= 0.5 else row["bpb_red_value"],
            axis=1
        )
        data["bpb_e2_value"] = data.apply(
            lambda row: row["bpb_reu_value"] if row["bpb_pri_percentr"] < 0.5 else row["bpb_red_value"],
            axis=1
        )
        data['bpb_e1_flag'] = data['bpb_e1_value'] > 0
        data['bpb_e2_flag'] = data['bpb_e2_value'] > 0

        # Calculate Range Expansions
        data['bpb_reu_value'] = data.apply(
            lambda row: max(row['high'] - row['prior_high'], 0) if  not pd.isna(row['prior_high']) else 0,
            axis=1
        )
        data['bpb_red_value'] = data.apply(
            lambda row: max(row['prior_low'] - row['low'], 0) if not pd.isna(row['prior_low']) else 0,
            axis=1
        )
        # Expansion Flags

        data['ip_open'] = data['pip_open']
        data['ip_high'] = data['high'] if data["sequence"] == 1 else max(data['high'], data['pip_high'])
        data['ip_low'] = data['low'] if data["sequence"] == 1 else min(data['low'], data['pip_low'])
        data['ip_close'] = data['close']

        data['ip_reu_flag'] = data['ip_reu_value'] > 0
        data['ip_red_flag'] = data['ip_red_value'] > 0
        data['ip_re_flag'] = data['ip_reu_flag'] | data['ip_red_flag']
        data['ip_twoway_flag'] = data['ip_reu_flag'] & data['ip_red_flag']

        data["ip_pri_percentr"] = data.apply(
            lambda row: (row['pip_close'] - row['pip_low']) / (row['pip_high'] - row['pip_low'])
            if pd.notna(row['pip_high']) and (row['pip_high'] - row['pip_low']) != 0
            else 0,
            axis=1
        )
        data["ip_ce_percent"] = data.apply(
            lambda row: row["ip_pri_percentr"] if row["ip_pri_percentr"] < 0.50 else 1 - row["ip_pri_percentr"],
            axis=1
        )
        data["ip_epc"] = data["ip_ce_percent"].apply(
        lambda x: None if pd.isna(x) or not isinstance(x, (int, float)) else max(1, min(5, math.ceil(float(x) / 0.1)))
        )
        data["ip_epc_dir"] = np.where(data["ip_pri_percentr"] >= 0.5, "U", "D")
        data["ip_epc_hp_flag"] = np.where(data["ip_ce_percent"] <= 0.25, "HP", "LP")
        data["ip_e1_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] >= 0.5 else row["ip_red_value"],
            axis=1
        )
        data["ip_e2_value"] = data.apply(
            lambda row: row["ip_reu_value"] if row["ip_pri_percentr"] < 0.5 else row["ip_red_value"],
            axis=1
        )
        data['ip_e1_flag'] = data['ip_e1_value'] > 0
        data['ip_e2_flag'] = data['ip_e2_value'] > 0


        # Export processed file
        output_path = os.path.join(self.output_dir, f"processed_{filename}")
        data.to_csv(output_path, columns=self.export_columns, index=False)
        logging.info(f"✅ Exported processed data to {output_path}")

    def process_all_files(self):
        files = [f for f in os.listdir(self.input_dir) if f.endswith('.csv')]
        if not files:
            logging.warning("⚠️ No CSV files found in input directory.")
            return

        for file in tqdm(files, desc="Processing Files"):
            try:
                self.process_file(file)
            except Exception as e:
                logging.error(f"❗ Error processing {file}: {e}")

# --- Main Execution ---
if __name__ == "__main__":
    processor = GelSetProcessor(input_dir="./input", output_dir="./output")
    processor.process_all_files()

Processing Files:   0%|          | 0/1 [00:00<?, ?it/s]ERROR:root:❗ Error processing BAC_D.csv: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Processing Files: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
