<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/iterate_bpb_20250114.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import pandas as pd
import numpy as np
import math
import time
import logging
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any
from tqdm import tqdm

# Constants for configuration
# NOTE(review): none of these three constants is referenced by the code visible
# in this cell; they may be used by other cells or be leftovers — confirm.

# Period codes allowed for the "child" timeframe.
# Presumably D=day, W=week, M=month, Q=quarter — TODO confirm.
VALID_CHILD_PERIODS = ['D', 'W', 'M', 'Q']
# Period codes allowed for the "parent" timeframe.
# Presumably W=week, M=month, Q=quarter, Y=year — TODO confirm.
VALID_PARENT_PERIODS = ['W', 'M', 'Q', 'Y']
# Default job name; looks like a "<name>_<YYYYMMDD>" label — verify against its user.
DEFAULT_JOBNAME = 'gelset_20250107'

class GelSetProcessor:
    """Batch-convert OHLC CSV files into per-bar range-expansion feature files.

    Every ``*.csv`` file found in ``input_dir`` is read, validated, enriched
    with prior-bar, range-expansion and EPC columns, and written to
    ``output_dir`` as ``processed_<filename>`` containing exactly the columns
    listed in ``export_columns`` (in that order).
    """

    def __init__(self, input_dir="/content/input", output_dir="/content/output"):
        """Store the directories and make sure the output directory exists.

        Args:
            input_dir: Directory scanned for input ``*.csv`` files.
            output_dir: Directory receiving the processed files; created
                (including parents) if it does not exist yet.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        # Columns written to each processed file, in export order.
        self.export_columns = [
            'ticker', 'date', 'open', 'high', 'low', 'close',
            'prior_high', 'prior_low', 'prior_close',
            'reu_value', 'red_value', 'reu_flag', 'red_flag', 're_flag', 'twoway_flag',
            'pri_percentr', 'ce_percent',
            'epc', 'epc_dir', 'epc_hp_flag',
            'e1_value', 'e2_value', 'e1_flag', 'e2_flag',
        ]
        os.makedirs(self.output_dir, exist_ok=True)

    def validate_input_data(self, df):
        """Check that ``df`` has the required columns and sane OHLC values.

        Args:
            df: DataFrame holding at least ``date``, ``open``, ``high``,
                ``low`` and ``close`` columns.

        Raises:
            ValueError: If a required column is missing, or if any row breaks
                one of the price rules (positive prices, ``low <= open/close
                <= high``, non-zero ``high - low``). Rows with missing prices
                fail the comparisons and are therefore reported as well.
        """
        required_columns = ['date', 'open', 'high', 'low', 'close']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        # Each entry pairs a per-row boolean mask (True = row is valid) with
        # the message reported when at least one row fails the rule.
        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero"),
        ]

        # Collect every violated rule so a single error describes all problems.
        failures = [
            f"{message} ({int((~passed).sum())} row(s))"
            for passed, message in conditions
            if not passed.all()
        ]
        if failures:
            raise ValueError("Invalid input data: " + "; ".join(failures))

    @staticmethod
    def _epc_bucket(x):
        """Map one ``ce_percent`` value to a bucket 1..5 in steps of 0.1 (None if missing)."""
        if pd.isna(x) or not isinstance(x, (int, float)):
            return None
        return max(1, min(5, math.ceil(float(x) / 0.1)))

    def _add_features(self, data):
        """Add the prior-bar, range-expansion and EPC columns to ``data`` in place.

        Args:
            data: Validated OHLC DataFrame, one row per bar in file order.

        Returns:
            The same DataFrame with the derived columns added.
        """
        # Prior-bar values; the first row has no prior bar and gets NaN.
        data['prior_high'] = data['high'].shift(1)
        data['prior_low'] = data['low'].shift(1)
        data['prior_close'] = data['close'].shift(1)

        has_prior_high = data['prior_high'].notna()
        has_prior_low = data['prior_low'].notna()

        # Range expansions: how far the bar exceeded the prior high (reu) or
        # undercut the prior low (red); 0 when it did not or there is no prior bar.
        data['reu_value'] = (
            (data['high'] - data['prior_high']).clip(lower=0).where(has_prior_high, 0)
        )
        data['red_value'] = (
            (data['prior_low'] - data['low']).clip(lower=0).where(has_prior_low, 0)
        )

        # Expansion flags
        data['reu_flag'] = data['reu_value'] > 0
        data['red_flag'] = data['red_value'] > 0
        data['re_flag'] = data['reu_flag'] | data['red_flag']
        data['twoway_flag'] = data['reu_flag'] & data['red_flag']

        # Position of the prior close inside the prior bar's range (0 = at the
        # low, 1 = at the high). Defaults to 0 when there is no prior bar or
        # the prior range is zero, which also avoids dividing by zero.
        prior_range = data['prior_high'] - data['prior_low']
        usable_range = has_prior_high & (prior_range != 0)
        data['pri_percentr'] = (
            (data['prior_close'] - data['prior_low']) / prior_range
        ).where(usable_range, 0)

        pri = data['pri_percentr']
        in_lower_half = pri < 0.5
        in_upper_half = pri >= 0.5

        # Distance of the prior close from the nearer end of the prior range.
        data['ce_percent'] = pri.where(in_lower_half, 1 - pri)
        data['epc'] = data['ce_percent'].apply(self._epc_bucket)
        data['epc_dir'] = np.where(in_upper_half, "U", "D")
        data['epc_hp_flag'] = np.where(data['ce_percent'] <= 0.25, "HP", "LP")

        # e1 is the expansion on the side the prior close was nearer to
        # (up when pri_percentr >= 0.5, otherwise down); e2 is the other side.
        data['e1_value'] = data['reu_value'].where(in_upper_half, data['red_value'])
        data['e2_value'] = data['reu_value'].where(in_lower_half, data['red_value'])
        data['e1_flag'] = data['e1_value'] > 0
        data['e2_flag'] = data['e2_value'] > 0
        return data

    def process_file(self, filename):
        """Read one CSV from ``input_dir``, derive the features and export it.

        The ticker is the part of the file name before the first underscore,
        with the file extension removed first so that a name without an
        underscore (``AAPL.csv``) still yields a clean ticker.

        Args:
            filename: Name of the CSV file inside ``input_dir``.

        Raises:
            ValueError: If the file fails ``validate_input_data``.
        """
        input_path = os.path.join(self.input_dir, filename)
        ticker = os.path.splitext(filename)[0].split('_')[0]
        data = pd.read_csv(input_path)

        self.validate_input_data(data)

        data['ticker'] = ticker
        self._add_features(data)

        # Export processed file
        output_path = os.path.join(self.output_dir, f"processed_{filename}")
        data.to_csv(output_path, columns=self.export_columns, index=False)
        logging.info("✅ Exported processed data to %s", output_path)

    def process_all_files(self):
        """Process every ``*.csv`` file in ``input_dir``.

        A failure in one file is logged and does not stop the remaining
        files. Logs a warning and returns if the directory has no CSV files.
        """
        files = [f for f in os.listdir(self.input_dir) if f.endswith('.csv')]
        if not files:
            logging.warning("⚠️ No CSV files found in input directory.")
            return

        for file in tqdm(files, desc="Processing Files"):
            try:
                self.process_file(file)
            except Exception as e:
                # Batch boundary: report the failure and continue with the next file.
                logging.error("❗ Error processing %s: %s", file, e)

# --- Main Execution ---
if __name__ == "__main__":
    # The processor reports successful exports with logging.info; without this
    # call the root logger stays at its default WARNING level and those
    # messages are dropped. basicConfig does nothing if the root logger
    # already has handlers configured.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    # Process every CSV in ./input and write the results to ./output.
    processor = GelSetProcessor(input_dir="./input", output_dir="./output")
    processor.process_all_files()



Processing Files: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
