<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/romulus_20250108.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Romulus

In [None]:
#!/usr/bin/env python3
"""
GelSet Data Processor with Prior Parent Range Expansion Logic
"""

import os
import sys
import pandas as pd
import numpy as np
import math
from datetime import datetime
from typing import Dict, Tuple, Optional, List, Any

# Constants for configuration
VALID_CHILD_PERIODS = ['D', 'W', 'M', 'Q']
VALID_PARENT_PERIODS = ['W', 'M', 'Q', 'Y']
DEFAULT_JOBNAME = 'gelset_20250107'

class GelSetProcessor:
    def __init__(self, input_dir="/content/input",
                 output_child_dir="/content/output_child",
                 output_parent_dir="/content/output_parent",
                 test_dir="/content/test",
                 child_period="D",
                 parent_period="M",
                 jobname="gelset_20250107"):

        self.input_dir = input_dir
        self.output_child_dir = output_child_dir
        self.output_parent_dir = output_parent_dir
        self.test_dir = test_dir
        self.child_period = child_period
        self.parent_period = parent_period
        self.jobname = jobname

        self.export_columns = [
            # Original input fields
            'ticker', 'date', 'parent', 'sequence', 'open', 'high', 'low', 'close', 'range', 'chg_h', 'chg_l', 'chg_h_percent', 'chg_l_percent',
            'gap', 'island_gap', 'bpb_reu', 'bpb_red', 'bpb_reu_flag', 'bpb_red_flag', 'bpb_pripr', 'bpb_ce_percent', 'bpb_epc', 'bpb_epc_dir',
            'bpb_e1_value', 'bpb_e2_value', 'bpb_re_flag', 'bpb_twoway',

            # Round one fields
            'gel_o', 'gel_h', 'gel_l',
            'gel_reu', 'gel_red',
            'gel_reu_flag', 'gel_red_flag',
            'gel_pripr', 'gel_ce_percent', 'gel_epc', 'gel_epc_dir', 'gel_prirange',
            'gel_e1_value', 'gel_e2_value',
            'gel_re_flag', 'gel_twoway',
            'gel_twoway_fre_dir', 'gel_oneway_fre_dir',
            'gel_last_dir',
            'gel_oco_flag', 'gel_noco_dir',

            # Round two fields
            'pph', 'ppl',
            'icph', 'icpl',
            'refh', 'refl',
            'pp_reu', 'pp_red',
            'pp_reu_flag', 'pp_red_flag',
            'pp_co_flag', 'pp_noco_dir',
        ]

        # Create output directories--one for child, one for parent, and one test (can remove eventually)
        for directory in [output_child_dir, output_parent_dir, test_dir]:
            os.makedirs(directory, exist_ok=True)

    # All validation for input data--maybe ensure parsed child_period matches setting.
    def validate_input_data(self, df):
        """Validate input data according to spec requirements"""
        conditions = [
            (df['open'] > 0, "open must be greater than 0"),
            (df['high'] > 0, "high must be greater than 0"),
            (df['low'] > 0, "low must be greater than 0"),
            (df['close'] > 0, "close must be greater than 0"),
            (df['low'] <= df['close'], "low must be less than or equal to close"),
            (df['close'] <= df['high'], "close must be less than or equal to high"),
            (df['low'] <= df['open'], "low must be less than or equal to open"),
            (df['open'] <= df['high'], "open must be less than or equal to high"),
            ((df['high'] - df['low']) != 0, "high minus low cannot be zero"),
            (df['pph'] > 0, "prior parent high must be greater than 0"),
            (df['ppl'] > 0, "prior parent low must be greater than 0"),
            (df['ppc'] > 0, "close must be greater than 0"),
            (df['ppl'] <= df['ppc'], "low must be less than or equal to close"),
            (df['ppc'] <= df['pph'], "close must be less than or equal to high"),
        ]

        for condition, message in conditions:
            if not condition.all():
                raise ValueError(f"Data validation failed: {message}")

        return True

    def process_round_one(self, data):
        """
        First Round - Establish intra-group values and prior parent expansions
        Implements the relationship between gel values, prior parent bounds, and reference values
        """
        # Initialize output data list
        output_data = []

        # Initialize group state tracker--these are memory fields, initiate with None.
        group_state = {
            "current_group": None,
            "intra_group_h": None,
            "intra_group_l": None,
            "prior_gelo": None,
            "prior_gelc": None,
            "prior_gelh": None,
            "prior_gell": None,
            "prior_icph": None,  # Track previous row's icph
            "prior_icpl": None,   # Track previous row's icpl
            "pbh": None,  # Track previous row's high
            "pbl": None,   # Track previous row's low
            "pbc": None,   # Track previous row's close
        }

        # Process each row
        for index, row in data.iterrows():
            # identify new parent group
            if row["parent"] != group_state["current_group"]:
                # Reset group state for new group
                group_state["current_group"] = row["parent"]
                group_state["intra_group_h"] = row["high"]
                group_state["intra_group_l"] = row["low"]
                group_state["gel_o"] = row["open"]
                group_state["prior_gelc"] = None
                group_state["prior_gelh"] = None
                group_state["prior_gell"] = None
                group_state["prior_icph"] = row["pph"]  # For sequence 1, use pph
                group_state["prior_icpl"] = row["ppl"]  # For sequence 1, use ppl
                group_state["pbc"] = None  # Add this line to initialize pbc

                range_expansion_up = 0
                range_expansion_down = 0
                prior_percent_r = None
            else:
                # Calculate regular range expansions
                if row["high"] > group_state["intra_group_h"]:
                    range_expansion_up = row["high"] - group_state["intra_group_h"]
                else:
                    range_expansion_up = 0

                if row["low"] < group_state["intra_group_l"]:
                    range_expansion_down = group_state["intra_group_l"] - row["low"]
                else:
                    range_expansion_down = 0

            range = row["high"] - row["low"]
            chg_h = row["high"] - row["open"]
            chg_l = row["open"] - row["low"]
            chg_h_percent = (row["high"] - row["open"])/row["open"]
            chg_l_percent = (row["open"] - row["low"])/row["open"]
            body = max(row["open"], row["close"]) - min(row["open"], row["close"])
            gap = abs(row["open"] - group_state["pbc"]) if group_state["pbc"] is not None else None
            island_gap = (
                0 if group_state["pbh"] is None or group_state["pbl"] is None
                else (row["open"] - group_state["pbh"]) if row["open"] > group_state["pbh"]
                else (row["low"] - row["open"]) if row["open"] < group_state["pbl"]
                else 0
            )

            # Update intra-group high and low: END OF ROW HIGH/LOW OF GEL
            group_state["intra_group_h"] = max(group_state["intra_group_h"], row["high"])
            group_state["intra_group_l"] = min(group_state["intra_group_l"], row["low"])

            # Calculate icph/icpl (using gel values AT OPEN against parent bounds)
            icph = max(group_state["intra_group_h"], row["pph"])
            icpl = min(group_state["intra_group_l"], row["ppl"])

            # Get refh/refl from prior state
            refh = group_state["prior_icph"]
            refl = group_state["prior_icpl"]

            # Calculate prior parent range expansions
            pp_reu = max(group_state["intra_group_h"] - refh, 0) if refh is not None else 0
            pp_red = max(refl - group_state["intra_group_l"], 0) if refl is not None else 0

            # Calculate prior parent range expansions
            bpb_reu = max(row["high"] - group_state["pbh"], 0) if group_state["pbh"] is not None else 0
            bpb_red = max(group_state["pbl"] - row["low"], 0) if group_state["pbl"] is not None else 0

            # Calculate prior_percent_r
            if group_state["prior_gelc"] is not None and (group_state["prior_gelh"] - group_state["prior_gell"]) != 0:
                prior_percent_r = (group_state["prior_gelc"] - group_state["prior_gell"]) / (
                    group_state["prior_gelh"] - group_state["prior_gell"]
                )
            else:
                prior_percent_r = None

            if group_state["prior_gelc"] is not None and group_state["prior_gelh"] - group_state["prior_gell"] != 0:
                gel_prior_range = group_state["prior_gelh"] - group_state["prior_gell"]
            else:
                gel_prior_range = None

            if group_state["pbc"] is not None and (group_state["pbh"] - group_state["pbl"]) != 0:
                bpb_pripr = (group_state["pbc"] - group_state["pbl"]) / (
                    group_state["pbh"] - group_state["pbl"]
                )
            else:
                bpb_pripr = None

            # Update prior values for next row
            #group_state["prior_gelo"] = row["gelo"]
            group_state["prior_gelc"] = row["close"]
            group_state["prior_gelh"] = group_state["intra_group_h"]
            group_state["prior_gell"] = group_state["intra_group_l"]
            group_state["prior_icph"] = icph
            group_state["prior_icpl"] = icpl
            group_state["pbc"] = row["close"]

            # Create processed row
            processed_row = row.to_dict()
            processed_row.update({
                "gel_o": group_state["gel_o"],
                "gel_h": group_state["intra_group_h"],
                "gel_l": group_state["intra_group_l"],
                "gel_reu": range_expansion_up,
                "gel_red": range_expansion_down,
                "icph": icph,
                "icpl": icpl,
                "refh": refh,
                "refl": refl,
                "pp_reu": pp_reu,
                "pp_red": pp_red,
                "gel_pripr": prior_percent_r,
                "gel_prirange": gel_prior_range,
                "bpb_reu": bpb_reu,
                "bpb_red": bpb_red,
                "pbh": group_state["pbh"],
                "pbl": group_state["pbl"],
                "pbc": group_state["pbc"],
                "bpb_pripr": bpb_pripr,
                "range": range,
                "chg_h": chg_h,
                "chg_l": chg_l,
                "chg_h_percent": chg_h_percent,
                "chg_l_percent": chg_l_percent,
                "body": body,
                "gap": gap,
                "island_gap": island_gap
            })
            output_data.append(processed_row)

            group_state["pbh"] = row["high"]
            group_state["pbl"] = row["low"]
            group_state["pbc"] = row["close"]

        return pd.DataFrame(output_data)

    def process_round_two(self, data):
        """Second Round - Range Expansion"""
        # Regular range expansions
        data["bpb_reu_flag"] = data.apply(
            lambda row: True if row["bpb_reu"] > 0 else False,
            axis=1
        )

        data["bpb_red_flag"] = data.apply(
            lambda row: True if row["bpb_red"] > 0 else False,
            axis=1
        )

        data["bpb_ce_percent"] = data.apply(
            lambda row: None if pd.isna(row["bpb_pripr"]) else
                (1 - row["bpb_pripr"] if row["bpb_pripr"] >= 0.5 else row["bpb_pripr"]),
            axis=1
        )

        data["bpb_epc"] = data.apply(
            lambda row: None if pd.isna(row["bpb_ce_percent"]) else
                math.ceil(row["bpb_ce_percent"] / 0.1),
            axis=1
        )

        data["bpb_epc"] = data.apply(
            lambda row: None if pd.isna(row["bpb_ce_percent"]) else
                max(1, min(5, math.ceil(row["bpb_ce_percent"] / 0.1))),
            axis=1
        )

        data["bpb_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["bpb_pripr"] >= 0.5 else "D"),
            axis=1
        )

        data["bpb_e1_value"] = data.apply(
            lambda row: row["bpb_reu"] if row["bpb_pripr"] >= 0.5 else row["bpb_red"],
            axis=1
        )

        data["bpb_e2_value"] = data.apply(
            lambda row: row["bpb_reu"] if row["bpb_pripr"] < 0.5 else row["bpb_red"],
            axis=1
        )

        data["bpb_re_flag"] = data.apply(
            lambda row: True if row["bpb_reu"] + row["bpb_red"] != 0 else False,
            axis=1
        )

        data["bpb_twoway"] = data.apply(
            lambda row: (True if row["bpb_red_flag"] == True else False) if row["bpb_reu_flag"] == True else False,
            axis=1
        )

        data["gel_reu_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["gel_reu"] > 0 else False),
            axis=1
        )

        data["gel_red_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["gel_red"] > 0 else False),
            axis=1
        )

        # Prior parent range expansions
        data["pp_reu_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["pp_reu"] > 0 else False),
            axis=1
        )

        data["pp_red_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else (True if row["pp_red"] > 0 else False),
            axis=1
        )

        data["gel_ce_percent"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (1 - row["gel_pripr"] if row["gel_pripr"] >= 0.5 else row["gel_pripr"]),
            axis=1
        )

        data["gel_epc"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (math.ceil((1 - row["gel_pripr"]) / 0.1) if row["gel_pripr"] >= 0.5
                 else math.ceil(row["gel_pripr"] / 0.1)),
            axis=1
        )

        data["gel_epc_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else ("U" if row["gel_pripr"] >= 0.5 else "D"),
            axis=1
        )

        data["gel_e1_value"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["gel_reu"] if row["gel_pripr"] >= 0.5 else row["gel_red"]),
            axis=1
        )

        data["gel_e2_value"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["gel_reu"] if row["gel_pripr"] < 0.5 else row["gel_red"]),
            axis=1
        )

        data["gel_re_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (True if row["gel_reu"] + row["gel_red"] != 0 else False),
            axis=1
        )

        data["gel_twoway"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                ((True if row["gel_red_flag"] == True else False) if row["gel_reu_flag"] == True else False),
            axis=1
        )

        data["gel_twoway_fre_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (row["gel_epc_dir"] if row["gel_twoway"] == True else "N"),
            axis=1
        )

        data["gel_oneway_fre_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (("D" if row["gel_red_flag"] == True else "N") if row["gel_reu_flag"] == False else "U"),
            axis=1
        )

        data["gel_last_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                ("D" if row["gel_twoway"] and row["gel_twoway_fre_dir"] == "U" else
                 "U" if row["gel_twoway"] and row["gel_twoway_fre_dir"] == "D" else
                 "U" if row["gel_reu_flag"] else
                 "D" if row["gel_red_flag"] else
                 "N"),
            axis=1
        )
        data["gel_oco_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (False if row["low"] > row["gel_o"] else False if row["high"] < row["gel_o"] else True),
            axis=1
        )
        data["gel_noco_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (None if row["gel_oco_flag"] == True else "A" if row["low"] > row["gel_o"] else "B"),
            axis=1
        )

        data["pp_co_flag"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (False if row["low"] > row["pph"] else False if row["high"] < row["ppl"] else True),
            axis=1
        )
        data["pp_noco_dir"] = data.apply(
            lambda row: None if row["sequence"] == 1 else
                (None if row["pp_co_flag"] == True else "A" if row["low"] > row["pph"] else "B"),
            axis=1
        )
        return data

    def process_round_three(self, data):
        """Third Round - RPC States"""
        state_tracker = {
            "prior_gel_dir_state": None,
            "prior_gel_last_dir": None,
            "prior_gel_rpc_total": 0,
        }

        for index, row in data.iterrows():
            sequence = row["sequence"]
            gel_last_dir = row["gel_last_dir"]
            gel_twoway = row["gel_twoway"]

            if sequence == 1:
                gel_dir_state = None
                gel_rpc = None
                gel_rpc_total = None
            else:
                # Calculate gel_dir_state
                if gel_last_dir != "N":
                    gel_dir_state = gel_last_dir
                else:
                    gel_dir_state = state_tracker["prior_gel_dir_state"]

                # Calculate gel_rpc
                if sequence == 2 and gel_twoway:
                    gel_rpc = 2
                elif gel_last_dir == "N":
                    gel_rpc = 0
                elif gel_last_dir == state_tracker["prior_gel_last_dir"] and gel_twoway:
                    gel_rpc = 2
                elif gel_last_dir != state_tracker["prior_gel_dir_state"] and gel_last_dir != "N":
                    gel_rpc = 1
                else:
                    gel_rpc = 0

                # Calculate gel_rpc_total
                gel_rpc_total = (gel_rpc or 0) + (state_tracker["prior_gel_rpc_total"] or 0)

            # Update state tracker
            state_tracker["prior_gel_dir_state"] = gel_dir_state
            state_tracker["prior_gel_last_dir"] = gel_last_dir
            state_tracker["prior_gel_rpc_total"] = gel_rpc_total

            # Update DataFrame
            data.loc[index, "gel_dir_state"] = gel_dir_state
            data.loc[index, "gel_rpc"] = gel_rpc
            #data.loc[index, "gel_rpc_total"] = gel_rpc_total

        return data

    def generate_summary(self, data, ticker):
        """Generate summary data for parent periods"""
        summary_data = []

        for parent, group in data.groupby("parent"):
            lookup_date = group["parent"].iloc[0]
            duration = len(group)
            parent_high = group["gel_h"].max()
            parent_low = group["gel_l"].min()
            bar_of_h = group.loc[group["gel_h"].idxmax(), "sequence"]
            bar_of_l = group.loc[group["gel_l"].idxmin(), "sequence"]
            bpb_reu_count = group["bpb_reu_flag"].sum()
            bpb_red_count = group["bpb_red_flag"].sum()
            bpb_reu_max = group["bpb_reu"].max()
            bpb_red_max = group["bpb_red"].max()
            bpb_re_count = group["bpb_re_flag"].sum()
            bpb_no_re_count = duration - group["bpb_re_flag"].sum()
            bpb_twoway_count = group["bpb_twoway"].sum()
            range_max = group["range"].max()
            range_avg = group["range"].mean()
            bpb_re_count = group["bpb_re_flag"].sum()
            bpb_no_re_count = duration - group["bpb_re_flag"].sum()
            bpb_twoway_count = group["bpb_twoway"].sum()
            A1 = min(bar_of_h, bar_of_l)
            A2 = max(bar_of_h, bar_of_l)
            gel_reu_count = group["gel_reu_flag"].sum()
            gel_red_count = group["gel_red_flag"].sum()
            gel_reu_first = group.loc[group["gel_reu_flag"] == True, "sequence"].min() if gel_reu_count > 0 else None
            gel_reu_last = group.loc[group["gel_reu_flag"] == True, "sequence"].max() if gel_reu_count > 0 else None
            gel_red_first = group.loc[group["gel_red_flag"] == True, "sequence"].min() if gel_red_count > 0 else None
            gel_red_last = group.loc[group["gel_red_flag"] == True, "sequence"].max() if gel_red_count > 0 else None

            gel_rpc_total = group["gel_rpc"].sum()

            pp_reu_count = group["pp_reu_flag"].sum()
            pp_red_count = group["pp_red_flag"].sum()
            pp_reu_first = group.loc[group["pp_reu_flag"] == True, "sequence"].min() if pp_reu_count > 0 else None
            pp_reu_last = group.loc[group["pp_reu_flag"] == True, "sequence"].max() if pp_reu_count > 0 else None
            pp_red_first = group.loc[group["pp_red_flag"] == True, "sequence"].min() if pp_red_count > 0 else None
            pp_red_last = group.loc[group["pp_red_flag"] == True, "sequence"].max() if pp_red_count > 0 else None

            summary_data.append({
                "ticker": ticker,
                "parent": lookup_date,
                "duration": duration,
                "child_period": self.child_period,
                "parent_period": self.parent_period,
                "range_max": range_max,
                "range_avg": range_avg,
                "bpb_reu_count": bpb_reu_count,
                "bpb_red_count": bpb_red_count,
                "bpb_reu_max": bpb_reu_max,
                "bpb_red_max": bpb_red_max,
                "bpb_re_count": bpb_re_count,
                "bpb_no_re_count": bpb_no_re_count,
                "bpb_twoway_count": bpb_twoway_count,
                "parent_high": parent_high,
                "parent_low": parent_low,
                "bar_of_h": bar_of_h,
                "bar_of_l": bar_of_l,
                "A1": min(bar_of_h, bar_of_l),
                "A2": max(bar_of_h, bar_of_l),
                "gel_reu_count": gel_reu_count,
                "gel_red_count": gel_red_count,
                "gel_reu_first": gel_reu_first,
                "gel_reu_last": gel_reu_last,
                "gel_red_first": gel_red_first,
                "gel_red_last": gel_red_last,
                "pp_reu_count": pp_reu_count,
                "pp_red_count": pp_red_count,
                "pp_reu_first": pp_reu_first,
                "pp_reu_last": pp_reu_last,
                "pp_red_first": pp_red_first,
                "pp_red_last": pp_red_last,
                "gel_rpc_total": gel_rpc_total,
                "create_date": datetime.now().date(),
                "create_time": datetime.now().time(),
                "jobname": self.jobname
            })

        return pd.DataFrame(summary_data)

    def export_test_file(self, data, ticker):
        """Export all fields after round two processing to a test file"""
        # All fields present at this point
        data['ticker'] = ticker
        columns = [
            # Original input fields
            'ticker', 'date', 'parent', 'sequence', 'open', 'high', 'low', 'close', 'range', 'chg_h', 'chg_l', 'chg_h_percent', 'chg_l_percent',
            'gap', 'island_gap', 'bpb_reu', 'bpb_red', 'bpb_reu_flag', 'bpb_red_flag', 'bpb_pripr', 'bpb_ce_percent', 'bpb_epc', 'bpb_epc_dir',
            'bpb_e1_value', 'bpb_e2_value', 'bpb_re_flag', 'bpb_twoway',

            # Round one fields
            'gel_o', 'gel_h', 'gel_l',
            'gel_reu', 'gel_red',
            'gel_reu_flag', 'gel_red_flag',
            'gel_pripr', 'gel_ce_percent', 'gel_epc', 'gel_epc_dir', 'gel_prirange',
            'gel_e1_value', 'gel_e2_value',
            'gel_re_flag', 'gel_twoway',
            'gel_twoway_fre_dir', 'gel_oneway_fre_dir',
            'gel_last_dir',
            'gel_oco_flag', 'gel_noco_dir',

            # Round two fields
            'pph', 'ppl',
            'icph', 'icpl',
            'refh', 'refl',
            'pp_reu', 'pp_red',
            'pp_reu_flag', 'pp_red_flag',
            'pp_co_flag', 'pp_noco_dir',
        ]

        # Export to CSV
        output_file = os.path.join(self.test_dir, f"{ticker}_test1.csv")
        data.to_csv(output_file, columns=self.export_columns, index=False)
        print(f"Exported test file to: {output_file}")

        return data

    def process_file(self, filename):
        """Process a single input file through all rounds"""
        # Extract ticker from filename
        ticker = filename.split('_')[0]

        # Read input file
        input_path = os.path.join(self.input_dir, filename)
        data = pd.read_csv(input_path)

        # Validate input data
        self.validate_input_data(data)

        # Add required columns
        data['child_period'] = self.child_period
        data['parent_period'] = self.parent_period
        data['jobname'] = self.jobname

        # Process through rounds
        data = self.process_round_one(data)
        data = self.process_round_two(data)
        data = self.process_round_three(data)

        # Generate summary
        summary_df = self.generate_summary(data, ticker)

        # Export test file after round two
        data = self.export_test_file(data, ticker)

        # Export child data
        child_filename = f"{ticker}_{self.child_period}_gel.csv"
        child_path = os.path.join(self.output_child_dir, child_filename)
        data.to_csv(child_path, columns=self.export_columns, index=False)

        # Export summary data
        summary_filename = f"{ticker}_{self.parent_period}_gel.csv"
        summary_path = os.path.join(self.output_parent_dir, summary_filename)
        summary_df.to_csv(summary_path, index=False)

        return data, summary_df

    def process_all_files(self):
        """Process all eligible files in the input directory"""
        if not os.path.exists(self.input_dir):
            raise FileNotFoundError(f"Input directory {self.input_dir} does not exist")

        results = {}
        processed_files = 0
        errors = []

        # Get list of eligible files
        eligible_files = [f for f in os.listdir(self.input_dir)
                         if f.endswith(f"_{self.child_period}.csv")]

        if not eligible_files:
            print(f"No eligible files found with pattern *_{self.child_period}.csv")
            return results

        print(f"Found {len(eligible_files)} files to process")

        for filename in eligible_files:
            try:
                print(f"Processing {filename}...")
                data = self.process_file(filename)
                results[filename] = {
                    'status': 'success',
                    'data': data,
                    'error': None
                }
                processed_files += 1
                print(f"Successfully processed {filename}")

            except Exception as e:
                error_msg = f"Error processing {filename}: {str(e)}"
                print(error_msg)
                errors.append(error_msg)
                results[filename] = {
                    'status': 'error',
                    'data': None,
                    'error': str(e)
                }

        print(f"\nProcessing Complete: {processed_files}/{len(eligible_files)} files")
        if errors:
            print("\nErrors encountered:")
            for error in errors:
                print(f"- {error}")

        return results

def validate_periods(child_period: str, parent_period: str) -> bool:
    """Check that a child/parent period pair is an allowed combination.

    Args:
        child_period: Child period code; must be in ``VALID_CHILD_PERIODS``.
        parent_period: Parent period code; must be in
            ``VALID_PARENT_PERIODS``.

    Returns:
        True when both codes are known and the child period comes strictly
        before the parent period in the order D, W, M, Q, Y.

    Raises:
        ValueError: If either code is unknown, or the child period is not
            smaller than the parent period.
    """
    # Guard clauses: reject unknown codes first
    if child_period not in VALID_CHILD_PERIODS:
        raise ValueError(f"Invalid child_period: {child_period}. Must be one of {VALID_CHILD_PERIODS}")

    if parent_period not in VALID_PARENT_PERIODS:
        raise ValueError(f"Invalid parent_period: {parent_period}. Must be one of {VALID_PARENT_PERIODS}")

    # Smallest to largest; a valid child sits strictly left of its parent
    ordering = ('D', 'W', 'M', 'Q', 'Y')
    child_rank = ordering.index(child_period)
    parent_rank = ordering.index(parent_period)

    if child_rank < parent_rank:
        return True

    raise ValueError(f"Invalid period combination: {child_period} -> {parent_period}. Child period must be smaller than parent period.")

def setup_logging(log_dir: str = "logs") -> None:
    """Configure root logging to a timestamped file and to stdout.

    Creates ``log_dir`` if needed and opens
    ``gelset_processing_<YYYYmmdd_HHMMSS>.log`` inside it. Records at INFO
    level and above are formatted as ``time - level - message``.

    Args:
        log_dir: Folder that receives the log file.
    """
    import logging

    os.makedirs(log_dir, exist_ok=True)

    # One log file per call, named after the moment of the call
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = os.path.join(log_dir, f"gelset_processing_{stamp}.log")

    # Basic setup - can be enhanced as needed
    file_handler = logging.FileHandler(log_file)
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )

def process_gelset_data(
    input_dir: str = '/content/input',
    output_child_dir: str = '/content/output_child',
    output_parent_dir: str = '/content/output_parent',
    child_period: str = 'D',
    parent_period: str = 'M',
    jobname: str = DEFAULT_JOBNAME,
    test_dir: str = '/content/test',
) -> Dict[str, Any]:
    """
    Process GelSet data with the given parameters.

    Validates the period pair, builds a ``GelSetProcessor`` and runs it over
    every eligible file in ``input_dir``.

    Args:
        input_dir: Directory containing input CSV files
        output_child_dir: Directory for child output files
        output_parent_dir: Directory for parent summary files
        child_period: Child period code (D, W, M, Q)
        parent_period: Parent period code (W, M, Q, Y)
        jobname: Job name for tracking
        test_dir: Directory for the per-ticker test dumps. Added last so
            existing positional calls keep working; the default matches the
            processor's own default.

    Returns:
        On success, the per-file result mapping returned by
        ``GelSetProcessor.process_all_files``. If anything raises (including
        an invalid period pair), the error is printed and a dict of the form
        ``{'error': <message>}`` is returned instead -- note that this has a
        different shape from the success mapping.
    """
    try:
        # Validate period combination
        validate_periods(child_period, parent_period)

        # Initialize processor
        processor = GelSetProcessor(
            input_dir=input_dir,
            output_child_dir=output_child_dir,
            output_parent_dir=output_parent_dir,
            test_dir=test_dir,
            child_period=child_period,
            parent_period=parent_period,
            jobname=jobname
        )

        # Process files
        results = processor.process_all_files()

        # Report results
        success_count = sum(1 for r in results.values() if r['status'] == 'success')
        print(f"\nProcessing complete. Successfully processed {success_count}/{len(results)} files.")

        return results

    except Exception as e:
        # Top-level boundary for notebook use: report instead of raising
        print(f"Error during processing: {str(e)}")
        return {'error': str(e)}

# Example usage in Jupyter/Colab: when this cell is executed directly, run the
# whole pipeline with the default directories, periods and job name.
if __name__ == "__main__":
    # Keep the per-file result mapping in `results` for later inspection
    results = process_gelset_data()

Found 11 files to process
Processing MMM_D.csv...
Exported test file to: /content/test/MMM_test1.csv
Successfully processed MMM_D.csv
Processing JPM_D.csv...
Exported test file to: /content/test/JPM_test1.csv
Successfully processed JPM_D.csv
Processing GOLD_D.csv...
Exported test file to: /content/test/GOLD_test1.csv
Successfully processed GOLD_D.csv
Processing COST_D.csv...
Exported test file to: /content/test/COST_test1.csv
Successfully processed COST_D.csv
Processing IBM_D.csv...
Exported test file to: /content/test/IBM_test1.csv
Successfully processed IBM_D.csv
Processing BAC_D.csv...
Exported test file to: /content/test/BAC_test1.csv
Successfully processed BAC_D.csv
Processing AXP_D.csv...
Exported test file to: /content/test/AXP_test1.csv
Successfully processed AXP_D.csv
Processing JNJ_D.csv...
Exported test file to: /content/test/JNJ_test1.csv
Successfully processed JNJ_D.csv
Processing BA_D.csv...
Exported test file to: /content/test/BA_test1.csv
Successfully processed BA_D.csv

In [None]:
from google.colab import files
import os
import zipfile

def zip_and_download(source_dir, period):
    """Zip every regular file in ``source_dir`` and trigger a Colab download.

    The archive is written to the current working directory as
    ``{period}_files.zip``. Sub-directories of ``source_dir`` are ignored and
    each entry is stored flat under its bare file name.

    Args:
        source_dir: Directory whose files are archived.
        period: Label used in the progress messages and in the archive name.
    """
    zip_filename = f'{period}_files.zip'
    zip_abspath = os.path.abspath(zip_filename)

    # Scan the directory once so the reported count always matches what is
    # archived, and never let the archive pick itself up when source_dir
    # happens to be the working directory.
    members = []
    for name in sorted(os.listdir(source_dir)):
        path = os.path.join(source_dir, name)
        if os.path.isfile(path) and os.path.abspath(path) != zip_abspath:
            members.append((path, name))
    print(f"Found {len(members)} {period} files")

    # Deflate instead of the default "stored" mode to keep the download small.
    with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for path, name in members:
            zipf.write(path, arcname=name)

    # Hand the finished archive to the browser.
    files.download(zip_filename)
    print(f"Downloaded {zip_filename}")

# Bundle each output directory into its own archive and download it
print("\nCreating and downloading zip files...\n")
for _source_dir, _label in (
    ('/content/output_child', 'child'),
    ('/content/output_parent', 'parent'),
    # ('/content/testone', 'test'),  # test export currently disabled
):
    zip_and_download(_source_dir, _label)


Creating and downloading zip files...

Found 73 child files


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded child_files.zip
Found 73 parent files


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded parent_files.zip


In [None]:
import pandas as pd
import os

# Use same tickers as cell 1
tickers = [
    "SPY", "TQQQ", "QQQ", "SQQQ", "EEM", "XLF", "GLD", "XLE", "EFA", "GDX", "XLK",
    "TLT", "XLV", "FXI", "XLY",
    "XLI", "XLU", "XLP", "XLB",
]

# Make sure every export target exists before anything is written to it
for _export_dir in (
    '/content/export_daily',
    '/content/export_weekly',
    '/content/export_monthly',
):
    os.makedirs(_export_dir, exist_ok=True)

def validate_data(df, ticker):
    """Return a cleaned copy of ``df`` containing only plausible OHLC rows.

    Prices are rounded to 4 decimal places, then a row is kept only when all
    four prices are strictly positive, ``Low`` is no higher than ``Open`` and
    ``Close``, ``High`` is no lower than ``Open`` and ``Close``, and
    ``High - Low`` is strictly positive. Rows containing NaN prices fail these
    comparisons and are therefore dropped as well.

    The input frame is left untouched.

    Args:
        df: DataFrame with ``Open``, ``High``, ``Low`` and ``Close`` columns.
        ticker: Label used in the "removed rows" message.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept).
    """
    original_count = len(df)

    # Work on a private copy: rounding the caller's frame in place is a hidden
    # side effect and raises SettingWithCopyWarning when df is a slice.
    rounded = df.copy()
    for col in ['Open', 'High', 'Low', 'Close']:
        rounded[col] = rounded[col].round(4)

    opens, highs = rounded['Open'], rounded['High']
    lows, closes = rounded['Low'], rounded['Close']

    # One boolean mask per sanity rule
    non_zero = (opens > 0) & (highs > 0) & (lows > 0) & (closes > 0)
    valid_low = (lows <= closes) & (lows <= opens)
    valid_high = (highs >= closes) & (highs >= opens)
    valid_range = (highs - lows) > 0

    df_clean = rounded[non_zero & valid_low & valid_high & valid_range].copy()

    # Report only when something was actually discarded
    removed_count = original_count - len(df_clean)
    if removed_count > 0:
        print(f"{ticker}: Removed {removed_count} invalid rows out of {original_count}")

    return df_clean

def load_data(ticker):
    """Load ``{ticker}_OHLC.csv`` from the working directory and clean it.

    The ``Date`` column is parsed to datetimes, the frame is reduced to the
    ``Date, Open, High, Low, Close`` columns in that order, and the result is
    passed through ``validate_data``.

    Args:
        ticker: Symbol used to build the input file name and to label
            validation messages.

    Returns:
        Whatever ``validate_data`` returns for the selected columns.
    """
    file_path = f'{ticker}_OHLC.csv'
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])

    # Ensure column order for daily data. The explicit copy detaches the
    # selection from the raw frame so that downstream column assignments act
    # on an independent frame rather than on a slice (avoids pandas'
    # SettingWithCopyWarning when the CSV carries extra or reordered columns).
    column_order = ['Date', 'Open', 'High', 'Low', 'Close']
    df = df[column_order].copy()

    # Validate and clean data before handing it back
    return validate_data(df, ticker)

def aggregate_data(df):
    """Resample daily OHLC rows into weekly and monthly bars.

    Each bar takes the first ``Open``, the highest ``High``, the lowest ``Low``
    and the last ``Close`` of its period. Both results are passed through
    ``validate_data`` before being returned.

    Args:
        df: DataFrame with ``Date`` (datetime), ``Open``, ``High``, ``Low``
            and ``Close`` columns. An optional ``name`` attribute set by the
            caller is used to label validation messages.

    Returns:
        Tuple ``(weekly_data, monthly_data)``.
    """
    agg_rules = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last'
    }
    column_order = ['Date', 'Open', 'High', 'Low', 'Close']

    # The label comes from an ad-hoc attribute the caller attaches to the
    # frame; fall back to a generic label instead of raising AttributeError
    # when the frame was not tagged.
    label = getattr(df, 'name', 'data')

    # Build the Date-indexed view once and reuse it for both frequencies.
    indexed = df.set_index('Date')
    weekly_data = indexed.resample('W').agg(agg_rules).reset_index()
    # NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
    # 'ME'; 'M' is kept here so older pandas versions keep working — confirm
    # against the pinned pandas version.
    monthly_data = indexed.resample('M').agg(agg_rules).reset_index()

    weekly_data = weekly_data[column_order]
    monthly_data = monthly_data[column_order]

    # Validate aggregated data
    weekly_data = validate_data(weekly_data, f"{label}_weekly")
    monthly_data = validate_data(monthly_data, f"{label}_monthly")

    return weekly_data, monthly_data

def save_to_csv(data, ticker, period):
    """Write ``data`` to the export folder matching ``period``.

    ``'daily'`` and ``'weekly'`` go to their own folders; any other value is
    treated as monthly. The destination path is printed after the write.
    """
    destinations = {
        'daily': f'/content/export_daily/{ticker}_daily.csv',
        'weekly': f'/content/export_weekly/{ticker}_weekly.csv',
    }
    monthly_path = f'/content/export_monthly/{ticker}_monthly.csv'
    file_path = destinations.get(period, monthly_path)

    data.to_csv(file_path, index=False)
    print(f"Saved {file_path}")

def process_ticker(ticker):
    """Run the load -> export daily -> aggregate -> export pipeline for one ticker.

    Any exception raised along the way is caught and reported with a printed
    message, so the function itself never raises for a bad ticker.
    """
    print(f"Processing {ticker}...")
    try:
        daily = load_data(ticker)
        daily.name = ticker  # tag the frame so aggregation can label its messages

        # Nothing survived validation: report and stop early
        if len(daily) == 0:
            print(f"No valid data for {ticker}")
            return

        save_to_csv(daily, ticker, 'daily')

        # Derive the coarser bars and export whichever of them is non-empty
        weekly, monthly = aggregate_data(daily)
        for frame, period in ((weekly, 'weekly'), (monthly, 'monthly')):
            if len(frame) > 0:
                save_to_csv(frame, ticker, period)

        print(f"Successfully processed {ticker}")

    except Exception as e:
        print(f"Error processing {ticker}: {e!s}")

# Process all tickers. process_ticker() catches and prints its own errors,
# so a failure on one ticker does not stop the remaining ones.
for ticker in tickers:
    process_ticker(ticker)

In [None]:
import pandas as pd
import os

# Use same tickers as cell 1
tickers = [
    "SPY", "TQQQ", "QQQ", "SQQQ", "EEM", "XLF", "GLD", "XLE", "EFA", "GDX", "XLK",
    "TLT", "XLV", "FXI", "XLY",
    "XLI", "XLU", "XLP", "XLB",
]

# Make sure every export target exists before anything is written to it
for _export_dir in (
    '/content/export_daily',
    '/content/export_weekly',
    '/content/export_monthly',
):
    os.makedirs(_export_dir, exist_ok=True)

def validate_data(df, ticker):
    """Return a cleaned copy of ``df`` containing only plausible OHLC rows.

    Prices are rounded to 4 decimal places, then a row is kept only when all
    four prices are strictly positive, ``Low`` is no higher than ``Open`` and
    ``Close``, ``High`` is no lower than ``Open`` and ``Close``, and
    ``High - Low`` is strictly positive. Rows containing NaN prices fail these
    comparisons and are therefore dropped as well.

    The input frame is left untouched.

    Args:
        df: DataFrame with ``Open``, ``High``, ``Low`` and ``Close`` columns.
        ticker: Label used in the "removed rows" message.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept).
    """
    original_count = len(df)

    # Work on a private copy: rounding the caller's frame in place is a hidden
    # side effect and raises SettingWithCopyWarning when df is a slice.
    rounded = df.copy()
    for col in ['Open', 'High', 'Low', 'Close']:
        rounded[col] = rounded[col].round(4)

    opens, highs = rounded['Open'], rounded['High']
    lows, closes = rounded['Low'], rounded['Close']

    # One boolean mask per sanity rule
    non_zero = (opens > 0) & (highs > 0) & (lows > 0) & (closes > 0)
    valid_low = (lows <= closes) & (lows <= opens)
    valid_high = (highs >= closes) & (highs >= opens)
    valid_range = (highs - lows) > 0

    df_clean = rounded[non_zero & valid_low & valid_high & valid_range].copy()

    # Report only when something was actually discarded
    removed_count = original_count - len(df_clean)
    if removed_count > 0:
        print(f"{ticker}: Removed {removed_count} invalid rows out of {original_count}")

    return df_clean

def load_data(ticker):
    """Load ``{ticker}_OHLC.csv`` from the working directory and clean it.

    The ``Date`` column is parsed to datetimes, the frame is reduced to the
    ``Date, Open, High, Low, Close`` columns in that order, and the result is
    passed through ``validate_data``.

    Args:
        ticker: Symbol used to build the input file name and to label
            validation messages.

    Returns:
        Whatever ``validate_data`` returns for the selected columns.
    """
    file_path = f'{ticker}_OHLC.csv'
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])

    # Ensure column order for daily data. The explicit copy detaches the
    # selection from the raw frame so that downstream column assignments act
    # on an independent frame rather than on a slice (avoids pandas'
    # SettingWithCopyWarning when the CSV carries extra or reordered columns).
    column_order = ['Date', 'Open', 'High', 'Low', 'Close']
    df = df[column_order].copy()

    # Validate and clean data before handing it back
    return validate_data(df, ticker)

def aggregate_data(df):
    """Resample daily OHLC rows into weekly and monthly bars.

    Each bar takes the first ``Open``, the highest ``High``, the lowest ``Low``
    and the last ``Close`` of its period. Both results are passed through
    ``validate_data`` before being returned.

    Args:
        df: DataFrame with ``Date`` (datetime), ``Open``, ``High``, ``Low``
            and ``Close`` columns. An optional ``name`` attribute set by the
            caller is used to label validation messages.

    Returns:
        Tuple ``(weekly_data, monthly_data)``.
    """
    agg_rules = {
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last'
    }
    column_order = ['Date', 'Open', 'High', 'Low', 'Close']

    # The label comes from an ad-hoc attribute the caller attaches to the
    # frame; fall back to a generic label instead of raising AttributeError
    # when the frame was not tagged.
    label = getattr(df, 'name', 'data')

    # Build the Date-indexed view once and reuse it for both frequencies.
    indexed = df.set_index('Date')
    weekly_data = indexed.resample('W').agg(agg_rules).reset_index()
    # NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
    # 'ME'; 'M' is kept here so older pandas versions keep working — confirm
    # against the pinned pandas version.
    monthly_data = indexed.resample('M').agg(agg_rules).reset_index()

    weekly_data = weekly_data[column_order]
    monthly_data = monthly_data[column_order]

    # Validate aggregated data
    weekly_data = validate_data(weekly_data, f"{label}_weekly")
    monthly_data = validate_data(monthly_data, f"{label}_monthly")

    return weekly_data, monthly_data

def save_to_csv(data, ticker, period):
    """Write ``data`` to the export folder matching ``period``.

    ``'daily'`` and ``'weekly'`` go to their own folders; any other value is
    treated as monthly. The destination path is printed after the write.
    """
    destinations = {
        'daily': f'/content/export_daily/{ticker}_daily.csv',
        'weekly': f'/content/export_weekly/{ticker}_weekly.csv',
    }
    monthly_path = f'/content/export_monthly/{ticker}_monthly.csv'
    file_path = destinations.get(period, monthly_path)

    data.to_csv(file_path, index=False)
    print(f"Saved {file_path}")

def process_ticker(ticker):
    """Run the load -> export daily -> aggregate -> export pipeline for one ticker.

    Any exception raised along the way is caught and reported with a printed
    message, so the function itself never raises for a bad ticker.
    """
    print(f"Processing {ticker}...")
    try:
        daily = load_data(ticker)
        daily.name = ticker  # tag the frame so aggregation can label its messages

        # Nothing survived validation: report and stop early
        if len(daily) == 0:
            print(f"No valid data for {ticker}")
            return

        save_to_csv(daily, ticker, 'daily')

        # Derive the coarser bars and export whichever of them is non-empty
        weekly, monthly = aggregate_data(daily)
        for frame, period in ((weekly, 'weekly'), (monthly, 'monthly')):
            if len(frame) > 0:
                save_to_csv(frame, ticker, period)

        print(f"Successfully processed {ticker}")

    except Exception as e:
        print(f"Error processing {ticker}: {e!s}")

# Process all tickers. process_ticker() catches and prints its own errors,
# so a failure on one ticker does not stop the remaining ones.
for ticker in tickers:
    process_ticker(ticker)

In [None]:
from google.colab import files
import os

def download_directory_files(directory):
    """Send every regular file directly inside ``directory`` to the browser.

    Sub-directories are skipped; one ``files.download`` call is issued per
    file, in the order ``os.listdir`` reports the entries.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for path in candidates:
        if not os.path.isfile(path):
            continue
        files.download(path)

# Download files from each export directory, announcing each batch first.
# NOTE(review): the processor's default test_dir is /content/test, while this
# cell reads /content/testone — confirm which directory is intended.
for _banner, _directory in (
    ("Downloading child files...", '/content/output_child'),
    ("Downloading parent files...", '/content/output_parent'),
    ("Downloading test files...", '/content/testone'),
):
    print(_banner)
    download_directory_files(_directory)

Downloading child files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading parent files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading test files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>