<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/voodoo_gelset_20250222.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### formerly fivestarstunna

import os
import sys
import pandas as pd
import numpy as np
import math
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any

# Constants for configuration
# Period codes accepted for the child series; checked by validate_periods().
VALID_CHILD_PERIODS = ['D', 'W', 'M', 'Q']
# Period codes accepted for the parent series; checked by validate_periods(),
# which also requires the parent period to be larger than the child period.
VALID_PARENT_PERIODS = ['W', 'M', 'Q', 'Y']
# Default job name used by process_gelset_data(); the job name is written into
# the 'jobname' column of both the child and the summary output.
DEFAULT_JOBNAME = 'gelset_20250107'

class GelSetProcessor:
    def __init__(self, input_dir="/content/input",
                 output_child_dir="/content/output_child",
                 output_parent_dir="/content/output_parent",
                 child_period="D",
                 parent_period="M",
                 jobname="gelset_20250107"):

        self.input_dir = input_dir
        self.output_child_dir = output_child_dir
        self.output_parent_dir = output_parent_dir
        self.child_period = child_period
        self.parent_period = parent_period
        self.jobname = jobname

        # Create output directories if they don't exist
        os.makedirs(output_child_dir, exist_ok=True)
        os.makedirs(output_parent_dir, exist_ok=True)

    def validate_input_data(self, df):
        """Validate input data according to spec requirements"""
        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero")
        ]

        for condition, message in conditions:
            if not condition.all():
                raise ValueError(f"Data validation failed: {message}")

        return True

    def process_round_one(self, data):
        """First Round - Establish intra-group values"""
        # Initialize output data list
        output_data = []

        # Initialize group state tracker
        group_state = {
            "current_group": None,
            "intra_group_h": None,
            "intra_group_l": None,
            "prior_gelc": None,
            "prior_gelh": None,
            "prior_gell": None,
        }

        # Process each row
        for index, row in data.iterrows():
            if row["parent"] != group_state["current_group"]:
                # Reset group state for new group
                group_state["current_group"] = row["parent"]
                group_state["intra_group_h"] = row["high"]
                group_state["intra_group_l"] = row["low"]
                group_state["prior_gelc"] = None
                group_state["prior_gelh"] = None
                group_state["prior_gell"] = None

                range_expansion_up = 0
                range_expansion_down = 0
                prior_percent_r = None
            else:
                # Calculate range expansions
                if row["high"] > group_state["intra_group_h"]:
                    range_expansion_up = row["high"] - group_state["intra_group_h"]
                else:
                    range_expansion_up = 0

                if row["low"] < group_state["intra_group_l"]:
                    range_expansion_down = group_state["intra_group_l"] - row["low"]
                else:
                    range_expansion_down = 0

                # Calculate prior_percent_r
                if group_state["prior_gelc"] is not None:
                    prior_percent_r = (group_state["prior_gelc"] - group_state["prior_gell"]) / (
                        group_state["prior_gelh"] - group_state["prior_gell"]
                    ) if (group_state["prior_gelh"] - group_state["prior_gell"]) != 0 else None
                else:
                    prior_percent_r = None

            # Update intra-group high and low
            group_state["intra_group_h"] = max(group_state["intra_group_h"], row["high"])
            group_state["intra_group_l"] = min(group_state["intra_group_l"], row["low"])

            # Save current row values as prior for next row
            group_state["prior_gelc"] = row["close"]
            group_state["prior_gelh"] = group_state["intra_group_h"]
            group_state["prior_gell"] = group_state["intra_group_l"]

            # Create processed row
            processed_row = row.to_dict()
            processed_row.update({
                "gel_h": group_state["intra_group_h"],
                "gel_l": group_state["intra_group_l"],
                "reu_value": range_expansion_up,
                "red_value": range_expansion_down,
                "prior_percent_r": prior_percent_r
            })
            output_data.append(processed_row)

        return pd.DataFrame(output_data)

    def process_round_two(self, data):
        """Second Round - Range Expansion"""
        # Apply all the transformations from round 2
        data["reu_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["reu_value"] > 0 else False),
            axis=1
        )

        data["red_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["red_value"] > 0 else False),
            axis=1
        )

        data["ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["prior_percent_r"] if row["prior_percent_r"] >= 0.5 else row["prior_percent_r"]),
            axis=1
        )

        data["epc"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (math.ceil((1 - row["prior_percent_r"]) / 0.1) if row["prior_percent_r"] >= 0.5
                 else math.ceil(row["prior_percent_r"] / 0.1)),
            axis=1
        )

        data["epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["prior_percent_r"] >= 0.5 else "D"),
            axis=1
        )

        data["e1_value"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["reu_value"] if row["prior_percent_r"] >= 0.5 else row["red_value"]),
            axis=1
        )

        data["e2_value"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["reu_value"] if row["prior_percent_r"] < 0.5 else row["red_value"]),
            axis=1
        )

        data["re_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (True if row["reu_value"] + row["red_value"] != 0 else False),
            axis=1
        )

        data["twoway"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                ((True if row["red_flag"] == True else False) if row["reu_flag"] == True else False),
            axis=1
        )

        data["twoway_fre_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["epc_dir"] if row["twoway"] == True else "N"),
            axis=1
        )

        data["oneway_fre_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (("D" if row["red_flag"] == True else "N") if row["reu_flag"] == False else "U"),
            axis=1
        )

        data["last_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                ("D" if row["twoway"] and row["twoway_fre_dir"] == "U" else
                 "U" if row["twoway"] and row["twoway_fre_dir"] == "D" else
                 "U" if row["reu_flag"] else
                 "D" if row["red_flag"] else
                 "N"),
            axis=1
        )

        return data

    def process_round_three(self, data):
        """Third Round - RPC States"""
        state_tracker = {
            "prior_dir_state": None,
            "prior_last_dir": None,
            "prior_gel_rpc_total": 0,
        }

        for index, row in data.iterrows():
            sequence = row["sequence"]
            last_dir = row["last_dir"]
            twoway = row["twoway"]

            if sequence == 1:
                dir_state = None
                gel_rpc = None
                gel_rpc_total = None
            else:
                # Calculate dir_state
                if last_dir != "N":
                    dir_state = last_dir
                else:
                    dir_state = state_tracker["prior_dir_state"]

                # Calculate gel_rpc
                if sequence == 2 and twoway:
                    gel_rpc = 2
                elif last_dir == "N":
                    gel_rpc = 0
                elif last_dir == state_tracker["prior_last_dir"] and twoway:
                    gel_rpc = 2
                elif last_dir != state_tracker["prior_dir_state"] and last_dir != "N":
                    gel_rpc = 1
                else:
                    gel_rpc = 0

                # Calculate gel_rpc_total
                gel_rpc_total = (gel_rpc or 0) + (state_tracker["prior_gel_rpc_total"] or 0)

            # Update state tracker
            state_tracker["prior_dir_state"] = dir_state
            state_tracker["prior_last_dir"] = last_dir
            state_tracker["prior_gel_rpc_total"] = gel_rpc_total

            # Update DataFrame
            data.loc[index, "dir_state"] = dir_state
            data.loc[index, "gel_rpc"] = gel_rpc
            data.loc[index, "gel_rpc_total"] = gel_rpc_total

        return data

    def generate_summary(self, data, ticker):
        """Generate summary data for parent periods"""
        summary_data = []

        for parent, group in data.groupby("parent"):
            lookup_date = group["parent"].iloc[0]
            duration = len(group)
            parent_high = group["gel_h"].max()
            parent_low = group["gel_l"].min()
            bar_of_h = group.loc[group["gel_h"].idxmax(), "sequence"]
            bar_of_l = group.loc[group["gel_l"].idxmin(), "sequence"]
            reu_count = group["reu_flag"].sum()
            red_count = group["red_flag"].sum()

            reu_first = group.loc[group["reu_flag"] == True, "sequence"].min() if reu_count > 0 else None
            reu_last = group.loc[group["reu_flag"] == True, "sequence"].max() if reu_count > 0 else None
            red_first = group.loc[group["red_flag"] == True, "sequence"].min() if red_count > 0 else None
            red_last = group.loc[group["red_flag"] == True, "sequence"].max() if red_count > 0 else None

            rpc_total = group["gel_rpc"].sum()

            summary_data.append({
                "ticker": ticker,
                "parent": lookup_date,
                "duration": duration,
                "child_period": self.child_period,
                "parent_period": self.parent_period,
                "parent_high": parent_high,
                "parent_low": parent_low,
                "bar_of_h": bar_of_h,
                "bar_of_l": bar_of_l,
                "reu_count": reu_count,
                "red_count": red_count,
                "reu_first": reu_first,
                "reu_last": reu_last,
                "red_first": red_first,
                "red_last": red_last,
                "rpc_total": rpc_total,
                "create_date": datetime.now().date(),
                "create_time": datetime.now().time(),
                "jobname": self.jobname
            })

        return pd.DataFrame(summary_data)

    def process_file(self, filename):
        """Process a single input file through all rounds"""
        # Extract ticker from filename
        ticker = filename.split('_')[0]

        # Read input file
        input_path = os.path.join(self.input_dir, filename)
        data = pd.read_csv(input_path)

        # Validate input data
        self.validate_input_data(data)

        # Add required columns
        data['child_period'] = self.child_period
        data['parent_period'] = self.parent_period
        data['jobname'] = self.jobname

        # Process through all rounds
        data = self.process_round_one(data)
        data = self.process_round_two(data)
        data = self.process_round_three(data)

        # Generate summary
        summary_df = self.generate_summary(data, ticker)

        # Export child data
        child_filename = f"{ticker}_{self.child_period}_gel.csv"
        child_path = os.path.join(self.output_child_dir, child_filename)
        data.to_csv(child_path, index=False)

        # Export summary data
        summary_filename = f"{ticker}_{self.parent_period}_gel.csv"
        summary_path = os.path.join(self.output_parent_dir, summary_filename)
        summary_df.to_csv(summary_path, index=False)

        return data, summary_df

    def process_all_files(self):
        """Process all eligible files in the input directory"""
        if not os.path.exists(self.input_dir):
            raise FileNotFoundError(f"Input directory {self.input_dir} does not exist")

        results = {}
        processed_files = 0
        errors = []

        # Get list of all eligible files first
        eligible_files = [f for f in os.listdir(self.input_dir)
                         if f.endswith(f"_{self.child_period}.csv")]

        if not eligible_files:
            print(f"No eligible files found with pattern *_{self.child_period}.csv")
            return results

        print(f"Found {len(eligible_files)} files to process")

        # Process each eligible file
        for filename in eligible_files:
            ticker = filename.split('_')[0]
            try:
                print(f"Processing {filename}...")

                # Process the file
                child_df, summary_df = self.process_file(filename)

                # Store results
                results[filename] = {
                    'child_df': child_df,
                    'summary_df': summary_df,
                    'status': 'success',
                    'error': None
                }

                processed_files += 1
                print(f"Successfully processed {filename}")
                print(f"Generated {len(summary_df)} parent-level summaries for {ticker}")

            except Exception as e:
                error_msg = f"Error processing {filename}: {str(e)}"
                print(error_msg)
                errors.append(error_msg)

                # Store error in results
                results[filename] = {
                    'child_df': None,
                    'summary_df': None,
                    'status': 'error',
                    'error': str(e)
                }

        # Print final summary
        print("\nProcessing Complete:")
        print(f"Successfully processed: {processed_files}/{len(eligible_files)} files")

        if errors:
            print("\nErrors encountered:")
            for error in errors:
                print(f"- {error}")

        return results

def validate_periods(child_period: str, parent_period: str) -> bool:
    """Validate the child and parent period combination.

    Args:
        child_period: The child period code; must be in VALID_CHILD_PERIODS.
        parent_period: The parent period code; must be in VALID_PARENT_PERIODS.

    Returns:
        True if the combination is valid.

    Raises:
        ValueError: If either code is not allowed, or if the child period is
            not strictly smaller than the parent period in the order
            D < W < M < Q < Y.
    """
    # Each code must be allowed on its own; the child is checked first.
    if child_period not in VALID_CHILD_PERIODS:
        raise ValueError(f"Invalid child_period: {child_period}. Must be one of {VALID_CHILD_PERIODS}")
    if parent_period not in VALID_PARENT_PERIODS:
        raise ValueError(f"Invalid parent_period: {parent_period}. Must be one of {VALID_PARENT_PERIODS}")

    # Position in this list ranks the periods from smallest to largest.
    ladder = list("DWMQY")
    child_is_smaller = ladder.index(child_period) < ladder.index(parent_period)
    if not child_is_smaller:
        raise ValueError(f"Invalid period combination: {child_period} -> {parent_period}. Child period must be smaller than parent period.")

    return True

def setup_logging(log_dir: str = "logs") -> None:
    """Create ``log_dir`` and configure logging to a timestamped file and stdout.

    The log file is named ``gelset_processing_<YYYYmmdd_HHMMSS>.log`` inside
    ``log_dir``. Records are emitted at INFO level and above.
    """
    import logging

    os.makedirs(log_dir, exist_ok=True)

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = os.path.join(log_dir, f"gelset_processing_{stamp}.log")

    # Same records go to the file and to the console (basic setup - can be enhanced as needed).
    sinks = [
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=sinks,
    )

def process_gelset_data(
    input_dir: str = '/content/input',
    output_child_dir: str = '/content/output_child',
    output_parent_dir: str = '/content/output_parent',
    child_period: str = 'D',
    parent_period: str = 'M',
    jobname: str = DEFAULT_JOBNAME
) -> Dict[str, Any]:
    """
    Validate the period pair, build a GelSetProcessor and process every input file.

    Args:
        input_dir: Directory containing input CSV files
        output_child_dir: Directory for child output files
        output_parent_dir: Directory for parent summary files
        child_period: Child period code (D, W, M, Q)
        parent_period: Parent period code (W, M, Q, Y)
        jobname: Job name for tracking

    Returns:
        The per-file results dict from GelSetProcessor.process_all_files().
        If anything raises on the way (including an invalid period pair),
        the error is printed and ``{'error': <message>}`` is returned instead.
    """
    try:
        # Reject bad period combinations before touching the file system.
        validate_periods(child_period, parent_period)

        settings = dict(
            input_dir=input_dir,
            output_child_dir=output_child_dir,
            output_parent_dir=output_parent_dir,
            child_period=child_period,
            parent_period=parent_period,
            jobname=jobname,
        )
        results = GelSetProcessor(**settings).process_all_files()

        # Tally the files that went through cleanly.
        success_count = sum(outcome['status'] == 'success' for outcome in results.values())
        print(f"\nProcessing complete. Successfully processed {success_count}/{len(results)} files.")

        return results

    except Exception as e:
        print(f"Error during processing: {str(e)}")
        return {'error': str(e)}

# Example usage in Jupyter/Colab:
if __name__ == "__main__":
    # For notebooks, just run with default parameters: input is read from
    # /content/input and output goes to /content/output_child and
    # /content/output_parent (the defaults of process_gelset_data).
    # `results` maps each processed filename to its outcome, or is
    # {'error': message} if the run failed before processing any file.
    results = process_gelset_data()

Found 11 files to process
Processing C_D.csv...
Successfully processed C_D.csv
Generated 371 parent-level summaries for C
Processing CAT_D.csv...
Successfully processed CAT_D.csv
Generated 371 parent-level summaries for CAT
Processing BAC_D.csv...
Successfully processed BAC_D.csv
Generated 371 parent-level summaries for BAC
Processing BA_D.csv...
Successfully processed BA_D.csv
Generated 371 parent-level summaries for BA
Processing BABA_D.csv...
Successfully processed BABA_D.csv
Generated 123 parent-level summaries for BABA
Processing AMZN_D.csv...
Successfully processed AMZN_D.csv
Generated 331 parent-level summaries for AMZN
Processing BMY_D.csv...
Successfully processed BMY_D.csv
Generated 371 parent-level summaries for BMY
Processing BIIB_D.csv...
Successfully processed BIIB_D.csv
Generated 371 parent-level summaries for BIIB
Processing AXP_D.csv...
Successfully processed AXP_D.csv
Generated 371 parent-level summaries for AXP
Processing BK_D.csv...
Successfully processed BK_D.csv
G