<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/gel_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Optional
import pandas as pd
import os
import logging
from datetime import datetime

@dataclass
class ProcessorConfig:
    """Settings that select which bar periods a GelProcessor works with."""
    # Period code of the input (child) bars; it is embedded in the input and
    # output file names.
    child_period: str
    # Period code of the aggregated (parent) bars. 'W' and 'M' are handled
    # explicitly by the grouping code; any other value is grouped by year.
    parent_period: str
    # The three *_or_dur values are not read anywhere in the visible code.
    # NOTE(review): presumably per-period durations (week/month/year) - confirm.
    w_or_dur: int = 1
    m_or_dur: int = 5
    y_or_dur: int = 3

class GelProcessor:
    def __init__(self, config: ProcessorConfig):
        """Keep the given configuration and set up logging.

        Args:
            config: Period settings read by the processing methods.
        """
        self.config = config
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO,
                          format='%(asctime)s - %(levelname)s - %(message)s')

    def generate_serial_id(self) -> int:
        now = datetime.now()
        return int(f"{now.year}{now.month:02d}{now.day:02d}{now.hour:02d}{now.minute:02d}{now.second:02d}")

    def create_parent_bars(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        if self.config.parent_period == 'W':
            grouper = pd.Grouper(key='date', freq='W-FRI')
        elif self.config.parent_period == 'M':
            grouper = pd.Grouper(key='date', freq='ME')
        else:
            grouper = pd.Grouper(key='date', freq='YE')

        groups = df.groupby(grouper)
        parent_data = []

        for name, group in groups:
            group = group.reset_index(drop=True)
            if len(group) == 0:
                continue

            parent_row = {
                'date': group['date'].iloc[0],
                'open': group['open'].iloc[0],
                'high': group['high'].max(),
                'low': group['low'].min(),
                'close': group['close'].iloc[-1],
                'duration': len(group),
                'trading_bop': group.index.size,
                'bar_of_h': group[group['high'] == group['high'].max()].index[0] + 1,
                'bar_of_l': group[group['low'] == group['low'].min()].index[0] + 1
            }

            if self.config.parent_period == 'M':
                parent_row['lookup_date'] = parent_row['date'].strftime('%Y/%m')
            else:
                parent_row['lookup_date'] = f"{parent_row['date'].year}_{parent_row['date'].isocalendar()[1]:02d}"

            parent_data.append(parent_row)

        parent_df = pd.DataFrame(parent_data)
        return parent_df

    def process_child_prior_bar(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        # Basic metrics
        df['range'] = df['high'] - df['low']
        df['percent_r'] = (df['close'] - df['low']) / df['range']

        # Edge bias first
        df['bias_dir'] = (df['percent_r'].shift(1) >= 0.5).astype(int)

        # Expansions
        df['reu_value'] = (df['high'] - df['high'].shift(1)).clip(lower=0)
        df['red_value'] = (df['low'].shift(1) - df['low']).clip(lower=0)
        df['re_value'] = df['reu_value'] + df['red_value']

        # Flags
        df['reu_flag'] = (df['reu_value'] > 0).astype(int)
        df['red_flag'] = (df['red_value'] > 0).astype(int)
        df['re_flag'] = ((df['reu_flag'] | df['red_flag']) > 0).astype(int)
        df['re_twoway_flag'] = ((df['reu_flag'] & df['red_flag']) > 0).astype(int)
        df['re_twoway_fre_dir'] = df.apply(lambda x: 1 if x['re_twoway_flag'] and x['bias_dir'] else 0, axis=1)

        df['ce_percent'] = df.apply(lambda x: 1 - x['percent_r'] if x['bias_dir'] == 1 else x['percent_r'], axis=1)
        df['epc'] = (df['ce_percent'] / 0.1).apply(lambda x: int(x + 0.99))
        df['epc_hp'] = ((df['percent_r'].shift(1) > 0.25) & (df['percent_r'].shift(1) < 0.75)).astype(int)

        # Reference values
        df['ref_o'] = df['open'].shift(1)
        df['ref_h'] = df['high'].shift(1)
        df['ref_l'] = df['low'].shift(1)
        df['ref_c'] = df['close'].shift(1)

        # E1/E2 values
        df['e1_value'] = df.apply(lambda x: x['reu_value'] if x['bias_dir'] == 1 else x['red_value'], axis=1)
        df['e2_value'] = df.apply(lambda x: x['red_value'] if x['bias_dir'] == 1 else x['reu_value'], axis=1)
        df['e1_ere_flag'] = (df['e1_value'] > 0).astype(int)
        df['e2_ere_flag'] = (df['e2_value'] > 0).astype(int)

        return df

    def process_gel_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            gel_data = {'gel_o': period_children['open'].iloc[0]}
            gel_h = gel_data['gel_o']
            gel_l = gel_data['gel_o']

            for idx, child_row in period_children.iterrows():
                gel_h = max(gel_h, child_row['high'])
                gel_l = min(gel_l, child_row['low'])

                row_data = {
                    **child_row,
                    'gel_h': gel_h,
                    'gel_l': gel_l,
                    'gel_c': child_row['close'],
                    'gel_range': gel_h - gel_l,
                }

                if row_data['gel_range'] != 0:
                    row_data['gel_percent_r'] = (row_data['gel_c'] - row_data['gel_l']) / row_data['gel_range']
                else:
                    row_data['gel_percent_r'] = 0.5

                results.append(row_data)

        result_df = pd.DataFrame(results)
        if len(result_df) > 0:
            result_df['gel_bias_dir'] = (result_df['gel_percent_r'] >= 0.5).astype(int)
            result_df['gel_ce_percent'] = result_df.apply(
                lambda x: 1 - x['gel_percent_r'] if x['gel_bias_dir'] == 1 else x['gel_percent_r'], axis=1)
            result_df['gel_hp_flag'] = (result_df['gel_ce_percent'] <= 0.25).astype(int)

        return result_df

    def process_gel_prior_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            if i == 0:
                continue

            prior_parent = parent_df.iloc[i-1]
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            pp_data = {
                'pp_o': prior_parent['open'],
                'pp_h': prior_parent['high'],
                'pp_l': prior_parent['low'],
                'pp_c': prior_parent['close'],
                'pp_range': prior_parent['high'] - prior_parent['low']
            }

            if pp_data['pp_range'] != 0:
                pp_data['pp_percent_r'] = (pp_data['pp_c'] - pp_data['pp_l']) / pp_data['pp_range']
            else:
                pp_data['pp_percent_r'] = 0.5

            ic_h = period_children['high'].iloc[0]
            ic_l = period_children['low'].iloc[0]

            for idx, child_row in period_children.iterrows():
                ic_h = max(ic_h, child_row['high'])
                ic_l = min(ic_l, child_row['low'])

                result = {
                    **child_row,
                    **pp_data,
                    'ic_h': ic_h,
                    'ic_l': ic_l,
                    'ic_c': child_row['close'],
                    'ref_h': max(pp_data['pp_h'], ic_h),
                    'ref_l': min(pp_data['pp_l'], ic_l)
                }
                results.append(result)

        return pd.DataFrame(results)

    def process_ticker(self, ticker: str):
        """Run every processing step for one ticker and write four CSV files.

        Reads '/content/input/{ticker}_{child_period}_1.csv', builds the
        parent bars and the three child-level tables, stamps each table with
        run metadata and writes it to its own folder (created on demand)
        relative to the current working directory.

        Args:
            ticker: Symbol used to build the input and output file names.

        Raises:
            Exception: Any failure is logged and then re-raised unchanged.
        """
        input_path = f'/content/input/{ticker}_{self.config.child_period}_1.csv'

        try:
            bars = pd.read_csv(input_path)
            bars['date'] = pd.to_datetime(bars['date'])

            parent_df = self.create_parent_bars(bars)
            child_prior_bar_df = self.process_child_prior_bar(bars)
            gel_parent_df = self.process_gel_parent(bars, parent_df)
            gel_prior_parent_df = self.process_gel_prior_parent(bars, parent_df)

            child_code = self.config.child_period
            parent_code = self.config.parent_period

            # (folder, file name, table) in the order the files are written.
            outputs = [
                ('gel_child', f'{ticker}_child{child_code}.csv', child_prior_bar_df),
                ('gel_parent', f'{ticker}_parent{parent_code}.csv', parent_df),
                ('gel_childgel', f'{ticker}_childgel{child_code}.csv', gel_parent_df),
                ('gel_priorparent', f'{ticker}_gelparent{parent_code}.csv', gel_prior_parent_df),
            ]

            for folder, filename, output_df in outputs:
                os.makedirs(folder, exist_ok=True)

                # Metadata columns shared by all four outputs.
                output_df['serial_id'] = self.generate_serial_id()
                output_df['create_date'] = datetime.now().strftime('%Y-%m-%d')
                output_df['create_time'] = datetime.now().strftime('%H:%M:%S')
                output_df['jobname'] = 'GetSet_20150101'
                output_df['ticker'] = ticker
                output_df['child_period'] = self.config.child_period
                output_df['parent_period'] = self.config.parent_period

                output_df.to_csv(f'{folder}/{filename}', index=False)

            logging.info(f"Successfully processed {ticker}")

        except Exception as err:
            logging.error(f"Error processing {ticker}: {err}")
            raise

def process_data(child_period='D', parent_period='M'):
    """Process every ticker that has an input file for ``child_period``.

    Input files are looked up in '/content/input' and must be named
    '{ticker}_{child_period}_1.csv', which is the path ``process_ticker``
    opens. Other CSV files in the folder are ignored; deriving a ticker from
    them would make ``process_ticker`` open a file that may not exist.

    Args:
        child_period: Period code of the input bars (part of the file name).
        parent_period: Period code the child bars are aggregated into.
    """
    config = ProcessorConfig(
        child_period=child_period,
        parent_period=parent_period
    )
    processor = GelProcessor(config)

    suffix = f'_{child_period}_1.csv'
    # Strip the whole suffix instead of splitting on '_' so tickers that
    # contain an underscore survive; sort for a reproducible run order.
    tickers = sorted({
        name[:-len(suffix)]
        for name in os.listdir('/content/input')
        if name.endswith(suffix) and len(name) > len(suffix)
    })

    for ticker in tickers:
        processor.process_ticker(ticker)

# Run the processor: daily ('D') child bars aggregated into monthly ('M') parent bars.
process_data(child_period='D', parent_period='M')

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Optional
import pandas as pd
import os
import logging
from datetime import datetime

@dataclass
class ProcessorConfig:
    """Settings that select which bar periods a GelProcessor works with."""
    # Period code of the input (child) bars; it is embedded in the input and
    # output file names.
    child_period: str
    # Period code of the aggregated (parent) bars. 'W' and 'M' are handled
    # explicitly by the grouping code; any other value is grouped by year.
    parent_period: str
    # The three *_or_dur values are not read anywhere in the visible code.
    # NOTE(review): presumably per-period durations (week/month/year) - confirm.
    w_or_dur: int = 1
    m_or_dur: int = 5
    y_or_dur: int = 3

class GelProcessor:
    def __init__(self, config: ProcessorConfig):
        """Keep the given configuration and set up logging.

        Args:
            config: Period settings read by the processing methods.
        """
        self.config = config
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO,
                          format='%(asctime)s - %(levelname)s - %(message)s')

    def generate_serial_id(self) -> int:
        now = datetime.now()
        return int(f"{now.year}{now.month:02d}{now.day:02d}{now.hour:02d}{now.minute:02d}{now.second:02d}{now.microsecond:06d}")

    def create_parent_bars(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        if self.config.parent_period == 'W':
            grouper = pd.Grouper(key='date', freq='W-FRI')
        elif self.config.parent_period == 'M':
            grouper = pd.Grouper(key='date', freq='ME')
        else:
            grouper = pd.Grouper(key='date', freq='YE')

        groups = df.groupby(grouper)
        parent_data = []

        for name, group in groups:
            group = group.reset_index(drop=True)
            if len(group) == 0:
                continue

            parent_row = {
                'date': group['date'].iloc[0],
                'open': group['open'].iloc[0],
                'high': group['high'].max(),
                'low': group['low'].min(),
                'close': group['close'].iloc[-1],
                'duration': len(group),
                'trading_bop': group.index.size,
                'bar_of_h': group[group['high'] == group['high'].max()].index[0] + 1,
                'bar_of_l': group[group['low'] == group['low'].min()].index[0] + 1,
                'serial_id': self.generate_serial_id()
            }

            if self.config.parent_period == 'M':
                parent_row['lookup_date'] = parent_row['date'].strftime('%Y/%m')
            else:
                parent_row['lookup_date'] = f"{parent_row['date'].year}_{parent_row['date'].isocalendar()[1]:02d}"

            parent_data.append(parent_row)

        parent_df = pd.DataFrame(parent_data)
        return parent_df

    def process_child_prior_bar(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        # Basic metrics
        df['range'] = df['high'] - df['low']
        df['percent_r'] = (df['close'] - df['low']) / df['range']

        # Edge bias first
        df['bias_dir'] = (df['percent_r'].shift(1) >= 0.5).astype(int)

        # Expansions
        df['reu_value'] = (df['high'] - df['high'].shift(1)).clip(lower=0)
        df['red_value'] = (df['low'].shift(1) - df['low']).clip(lower=0)
        df['re_value'] = df['reu_value'] + df['red_value']

        # Flags
        df['reu_flag'] = (df['reu_value'] > 0).astype(int)
        df['red_flag'] = (df['red_value'] > 0).astype(int)
        df['re_flag'] = ((df['reu_flag'] | df['red_flag']) > 0).astype(int)
        df['re_twoway_flag'] = ((df['reu_flag'] & df['red_flag']) > 0).astype(int)
        df['re_twoway_fre_dir'] = df.apply(lambda x: 1 if x['re_twoway_flag'] and x['bias_dir'] else 0, axis=1)

        df['ce_percent'] = df.apply(lambda x: 1 - x['percent_r'] if x['bias_dir'] == 1 else x['percent_r'], axis=1)
        df['epc'] = (df['ce_percent'] / 0.1).apply(lambda x: int(x + 0.99))
        df['epc_hp'] = ((df['percent_r'].shift(1) > 0.25) & (df['percent_r'].shift(1) < 0.75)).astype(int)

        # Reference values
        df['ref_o'] = df['open'].shift(1)
        df['ref_h'] = df['high'].shift(1)
        df['ref_l'] = df['low'].shift(1)
        df['ref_c'] = df['close'].shift(1)

        # E1/E2 values
        df['e1_value'] = df.apply(lambda x: x['reu_value'] if x['bias_dir'] == 1 else x['red_value'], axis=1)
        df['e2_value'] = df.apply(lambda x: x['red_value'] if x['bias_dir'] == 1 else x['reu_value'], axis=1)
        df['e1_ere_flag'] = (df['e1_value'] > 0).astype(int)
        df['e2_ere_flag'] = (df['e2_value'] > 0).astype(int)

        # Add serial_id
        df['serial_id'] = df.apply(lambda _: self.generate_serial_id(), axis=1)

        return df

    def process_gel_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            gel_data = {'gel_o': period_children['open'].iloc[0]}
            gel_h = gel_data['gel_o']
            gel_l = gel_data['gel_o']
            previous_percent_r = 0.5  # Initialize for first bar

            for idx, child_row in period_children.iterrows():
                gel_h = max(gel_h, child_row['high'])
                gel_l = min(gel_l, child_row['low'])
                current_close = child_row['close']

                row_data = {
                    **child_row,
                    'gel_h': gel_h,
                    'gel_l': gel_l,
                    'gel_c': current_close,
                    'gel_range': gel_h - gel_l,
                    'serial_id': self.generate_serial_id()
                }

                if row_data['gel_range'] != 0:
                    row_data['gel_percent_r'] = (current_close - gel_l) / row_data['gel_range']
                else:
                    row_data['gel_percent_r'] = 0.5

                row_data['gel_ce_percent'] = 1 - previous_percent_r if previous_percent_r >= 0.5 else previous_percent_r
                row_data['gel_bias_dir'] = 1 if previous_percent_r >= 0.5 else 0
                row_data['gel_hp_flag'] = 1 if row_data['gel_ce_percent'] <= 0.25 else 0

                previous_percent_r = row_data['gel_percent_r']
                results.append(row_data)

        return pd.DataFrame(results)

    def process_gel_prior_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            if i == 0:
                continue

            prior_parent = parent_df.iloc[i-1]
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            pp_data = {
                'pp_o': prior_parent['open'],
                'pp_h': prior_parent['high'],
                'pp_l': prior_parent['low'],
                'pp_c': prior_parent['close'],
                'pp_range': prior_parent['high'] - prior_parent['low']
            }

            if pp_data['pp_range'] != 0:
                pp_data['pp_percent_r'] = (pp_data['pp_c'] - pp_data['pp_l']) / pp_data['pp_range']
            else:
                pp_data['pp_percent_r'] = 0.5

            ic_h = period_children['high'].iloc[0]
            ic_l = period_children['low'].iloc[0]
            previous_ref_h = pp_data['pp_h']
            previous_ref_l = pp_data['pp_l']

            for idx, child_row in period_children.iterrows():
                ic_h = max(ic_h, child_row['high'])
                ic_l = min(ic_l, child_row['low'])

                # Calculate expansions
                reu_value = max(child_row['high'] - previous_ref_h, 0)
                red_value = max(previous_ref_l - child_row['low'], 0)

                # Edge bias calculations
                if pp_data['pp_range'] != 0:
                    bias_dir = 1 if pp_data['pp_percent_r'] >= 0.5 else 0
                else:
                    bias_dir = 0

                # Map expansions to E1/E2
                e1_value = reu_value if bias_dir == 1 else red_value
                e2_value = red_value if bias_dir == 1 else reu_value

                result = {
                    **child_row,
                    **pp_data,
                    'ic_h': ic_h,
                    'ic_l': ic_l,
                    'ic_c': child_row['close'],
                    'ref_h': max(pp_data['pp_h'], ic_h),
                    'ref_l': min(pp_data['pp_l'], ic_l),
                    'reu_value': reu_value,
                    'red_value': red_value,
                    'bias_dir': bias_dir,
                    'e1_value': e1_value,
                    'e2_value': e2_value,
                    'e1_flag': 1 if e1_value > 0 else 0,
                    'e2_flag': 1 if e2_value > 0 else 0,
                    'serial_id': self.generate_serial_id()
                }

                previous_ref_h = result['ref_h']
                previous_ref_l = result['ref_l']
                results.append(result)

        return pd.DataFrame(results)

    def process_ticker(self, ticker: str):
        """Run every processing step for one ticker and write four CSV files.

        Reads '/content/input/{ticker}_{child_period}_1.csv', prints the date
        range and bar count, builds the parent bars and the three child-level
        tables, stamps each table with run metadata and writes it to its own
        folder (created on demand) relative to the current working directory.

        Args:
            ticker: Symbol used to build the input and output file names.

        Raises:
            Exception: Any failure is logged and then re-raised unchanged.
        """
        print(f"\nProcessing {ticker}:")
        input_path = f'/content/input/{ticker}_{self.config.child_period}_1.csv'

        try:
            bars = pd.read_csv(input_path)
            bars['date'] = pd.to_datetime(bars['date'])
            print(f"Date Range: {bars['date'].min().strftime('%Y-%m-%d')} to {bars['date'].max().strftime('%Y-%m-%d')}")
            print(f"Child Bars: {len(bars)}")

            parent_df = self.create_parent_bars(bars)
            child_prior_bar_df = self.process_child_prior_bar(bars)
            gel_parent_df = self.process_gel_parent(bars, parent_df)
            gel_prior_parent_df = self.process_gel_prior_parent(bars, parent_df)

            child_code = self.config.child_period
            parent_code = self.config.parent_period

            # (folder, file name, table) in the order the files are written.
            outputs = [
                ('gel_child', f'{ticker}_child{child_code}.csv', child_prior_bar_df),
                ('gel_parent', f'{ticker}_parent{parent_code}.csv', parent_df),
                ('gel_childgel', f'{ticker}_childgel{child_code}.csv', gel_parent_df),
                ('gel_priorparent', f'{ticker}_gelparent{parent_code}.csv', gel_prior_parent_df),
            ]

            for folder, filename, output_df in outputs:
                os.makedirs(folder, exist_ok=True)

                # Metadata columns shared by all four outputs.
                output_df['create_date'] = datetime.now().strftime('%Y-%m-%d')
                output_df['create_time'] = datetime.now().strftime('%H:%M:%S')
                output_df['jobname'] = 'GetSet_20150101'
                output_df['ticker'] = ticker
                output_df['child_period'] = self.config.child_period
                output_df['parent_period'] = self.config.parent_period

                output_df.to_csv(f'{folder}/{filename}', index=False)

            logging.info(f"Successfully processed {ticker}")

        except Exception as err:
            logging.error(f"Error processing {ticker}: {err}")
            raise

def process_data(child_period='D', parent_period='M'):
    """Process every ticker that has an input file for ``child_period``.

    Input files are looked up in '/content/input' and must be named
    '{ticker}_{child_period}_1.csv', which is the path ``process_ticker``
    opens. Other CSV files in the folder are ignored; deriving a ticker from
    them would make ``process_ticker`` open a file that may not exist.

    Args:
        child_period: Period code of the input bars (part of the file name).
        parent_period: Period code the child bars are aggregated into.
    """
    print(f"\nProcessing with {child_period} child bars and {parent_period} parent bars")
    config = ProcessorConfig(
        child_period=child_period,
        parent_period=parent_period
    )
    processor = GelProcessor(config)

    suffix = f'_{child_period}_1.csv'
    # Strip the whole suffix instead of splitting on '_' so tickers that
    # contain an underscore survive; sort for a reproducible run order.
    tickers = sorted({
        name[:-len(suffix)]
        for name in os.listdir('/content/input')
        if name.endswith(suffix) and len(name) > len(suffix)
    })

    for ticker in tickers:
        processor.process_ticker(ticker)

# Run the processor: daily ('D') child bars aggregated into monthly ('M') parent bars.
process_data(child_period='D', parent_period='M')


Processing with D child bars and M parent bars

Processing MMM:
Date Range: 2016-01-04 to 2019-12-30
Child Bars: 1005


In [None]:
from dataclasses import dataclass
from typing import List, Dict, Optional
import pandas as pd
import os
import logging
from datetime import datetime

@dataclass
class ProcessorConfig:
    """Settings that select which bar periods a GelProcessor works with."""
    # Period code of the input (child) bars; it is embedded in the input and
    # output file names.
    child_period: str
    # Period code of the aggregated (parent) bars. 'W' and 'M' are handled
    # explicitly by the grouping code; any other value is grouped by year.
    parent_period: str
    # The three *_or_dur values are not read anywhere in the visible code.
    # NOTE(review): presumably per-period durations (week/month/year) - confirm.
    w_or_dur: int = 1
    m_or_dur: int = 5
    y_or_dur: int = 3

class GelProcessor:
    def __init__(self, config: ProcessorConfig):
        """Keep the given configuration and set up logging.

        Args:
            config: Period settings read by the processing methods.
        """
        self.config = config
        self.setup_logging()

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO,
                          format='%(asctime)s - %(levelname)s - %(message)s')

    def generate_serial_id(self) -> int:
        now = datetime.now()
        return int(f"{now.year}{now.month:02d}{now.day:02d}{now.hour:02d}{now.minute:02d}{now.second:02d}{now.microsecond:06d}")

    def create_parent_bars(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        if self.config.parent_period == 'W':
            grouper = pd.Grouper(key='date', freq='W-FRI')
        elif self.config.parent_period == 'M':
            grouper = pd.Grouper(key='date', freq='ME')
        else:
            grouper = pd.Grouper(key='date', freq='YE')

        groups = df.groupby(grouper)
        parent_data = []

        for name, group in groups:
            group = group.reset_index(drop=True)
            if len(group) == 0:
                continue

            parent_row = {
                'date': group['date'].iloc[0],
                'open': group['open'].iloc[0],
                'high': group['high'].max(),
                'low': group['low'].min(),
                'close': group['close'].iloc[-1],
                'duration': len(group),
                'trading_bop': group.index.size,
                'bar_of_h': group[group['high'] == group['high'].max()].index[0] + 1,
                'bar_of_l': group[group['low'] == group['low'].min()].index[0] + 1,
                'serial_id': self.generate_serial_id()
            }

            if self.config.parent_period == 'M':
                parent_row['lookup_date'] = parent_row['date'].strftime('%Y/%m')
            else:
                parent_row['lookup_date'] = f"{parent_row['date'].year}_{parent_row['date'].isocalendar()[1]:02d}"

            parent_data.append(parent_row)

        parent_df = pd.DataFrame(parent_data)
        return parent_df

    def process_child_prior_bar(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy().reset_index(drop=True)

        # Basic metrics
        df['cpc_range'] = df['high'] - df['low']
        df['cpc_percent_r'] = (df['close'] - df['low']) / df['cpc_range']

        # Edge bias first
        df['cpc_bias_dir'] = (df['cpc_percent_r'].shift(1) >= 0.5).astype(int)

        # Expansions
        df['cpc_reu_value'] = (df['high'] - df['high'].shift(1)).clip(lower=0)
        df['cpc_red_value'] = (df['low'].shift(1) - df['low']).clip(lower=0)
        df['cpc_re_value'] = df['cpc_reu_value'] + df['cpc_red_value']

        # Flags
        df['cpc_reu_flag'] = (df['cpc_reu_value'] > 0).astype(int)
        df['cpc_red_flag'] = (df['cpc_red_value'] > 0).astype(int)
        df['cpc_re_flag'] = ((df['cpc_reu_flag'] | df['cpc_red_flag']) > 0).astype(int)
        df['cpc_re_twoway_flag'] = ((df['cpc_reu_flag'] & df['cpc_red_flag']) > 0).astype(int)
        df['cpc_re_twoway_fre_dir'] = df.apply(lambda x: 1 if x['cpc_re_twoway_flag'] and x['cpc_bias_dir'] else 0, axis=1)

        df['cpc_ce_percent'] = df.apply(lambda x: 1 - x['cpc_percent_r'] if x['cpc_bias_dir'] == 1 else x['cpc_percent_r'], axis=1)
        df['cpc_epc'] = df.apply(lambda x: 1 if x['cpc_percent_r'] == 0 else int((x['cpc_ce_percent'] / 0.1) + 0.99), axis=1)
        df['cpc_epc_hp'] = ((df['cpc_percent_r'].shift(1) > 0.25) & (df['cpc_percent_r'].shift(1) < 0.75)).astype(int)

        # Reference values
        df['cpc_ref_o'] = df['open'].shift(1)
        df['cpc_ref_h'] = df['high'].shift(1)
        df['cpc_ref_l'] = df['low'].shift(1)
        df['cpc_ref_c'] = df['close'].shift(1)

        # E1/E2 values
        df['cpc_e1_value'] = df.apply(lambda x: x['cpc_reu_value'] if x['cpc_bias_dir'] == 1 else x['cpc_red_value'], axis=1)
        df['cpc_e2_value'] = df.apply(lambda x: x['cpc_red_value'] if x['cpc_bias_dir'] == 1 else x['cpc_reu_value'], axis=1)
        df['cpc_e1_ere_flag'] = (df['cpc_e1_value'] > 0).astype(int)
        df['cpc_e2_ere_flag'] = (df['cpc_e2_value'] > 0).astype(int)

        # Add serial_id
        df['cpc_serial_id'] = df.apply(lambda _: self.generate_serial_id(), axis=1)

        return df

    def process_gel_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            gel_data = {'gel_o': period_children['open'].iloc[0]}
            gel_h = gel_data['gel_o']
            gel_l = gel_data['gel_o']
            previous_percent_r = 0.5  # Initialize for first bar

            for idx, child_row in period_children.iterrows():
                gel_h = max(gel_h, child_row['high'])
                gel_l = min(gel_l, child_row['low'])
                current_close = child_row['close']

                row_data = {
                    **child_row,
                    'gel_h': gel_h,
                    'gel_l': gel_l,
                    'gel_c': current_close,
                    'gel_range': gel_h - gel_l,
                    'serial_id': self.generate_serial_id()
                }

                if row_data['gel_range'] != 0:
                    row_data['gel_percent_r'] = (current_close - gel_l) / row_data['gel_range']
                else:
                    row_data['gel_percent_r'] = 0.5

                row_data['gel_ce_percent'] = 1 - previous_percent_r if previous_percent_r >= 0.5 else previous_percent_r
                row_data['gel_bias_dir'] = 1 if previous_percent_r >= 0.5 else 0
                row_data['gel_hp_flag'] = 1 if row_data['gel_ce_percent'] <= 0.25 else 0

                previous_percent_r = row_data['gel_percent_r']
                results.append(row_data)

        return pd.DataFrame(results)

    def process_gel_prior_parent(self, child_df: pd.DataFrame, parent_df: pd.DataFrame) -> pd.DataFrame:
        child_df = child_df.copy().reset_index(drop=True)
        parent_df = parent_df.copy().reset_index(drop=True)
        results = []

        for i, parent_row in parent_df.iterrows():
            if i == 0:
                continue

            prior_parent = parent_df.iloc[i-1]
            next_date = parent_df['date'].iloc[i+1] if i < len(parent_df)-1 else child_df['date'].max()
            mask = (child_df['date'] >= parent_row['date']) & (child_df['date'] < next_date)
            period_children = child_df[mask].copy().reset_index(drop=True)

            if len(period_children) == 0:
                continue

            pp_data = {
                'pp_o': prior_parent['open'],
                'pp_h': prior_parent['high'],
                'pp_l': prior_parent['low'],
                'pp_c': prior_parent['close'],
                'pp_range': prior_parent['high'] - prior_parent['low']
            }

            if pp_data['pp_range'] != 0:
                pp_data['pp_percent_r'] = (pp_data['pp_c'] - pp_data['pp_l']) / pp_data['pp_range']
            else:
                pp_data['pp_percent_r'] = 0.5

            ic_h = period_children['high'].iloc[0]
            ic_l = period_children['low'].iloc[0]
            previous_ref_h = pp_data['pp_h']
            previous_ref_l = pp_data['pp_l']

            for idx, child_row in period_children.iterrows():
                ic_h = max(ic_h, child_row['high'])
                ic_l = min(ic_l, child_row['low'])

                # Calculate expansions
                reu_value = max(child_row['high'] - previous_ref_h, 0)
                red_value = max(previous_ref_l - child_row['low'], 0)

                # Edge bias calculations
                if pp_data['pp_range'] != 0:
                    bias_dir = 1 if pp_data['pp_percent_r'] >= 0.5 else 0
                else:
                    bias_dir = 0

                # Map expansions to E1/E2
                e1_value = reu_value if bias_dir == 1 else red_value
                e2_value = red_value if bias_dir == 1 else reu_value

                result = {
                    **child_row,
                    **pp_data,
                    'cpp_ic_h': ic_h,
                    'cpp_ic_l': ic_l,
                    'cpp_ic_c': child_row['close'],
                    'cpp_ref_h': max(pp_data['pp_h'], ic_h),
                    'cpp_ref_l': min(pp_data['pp_l'], ic_l),
                    'cpp_reu_value': reu_value,
                    'cpp_red_value': red_value,
                    'cpp_bias_dir': bias_dir,
                    'cpp_e1_value': e1_value,
                    'cpp_e2_value': e2_value,
                    'cpp_e1_flag': 1 if e1_value > 0 else 0,
                    'cpp_e2_flag': 1 if e2_value > 0 else 0,
                    'cpp_serial_id': self.generate_serial_id()
                }

                previous_ref_h = result['cpp_ref_h']
                previous_ref_l = result['cpp_ref_l']
                results.append(result)

        return pd.DataFrame(results)

    def process_ticker(self, ticker: str):
        """Load one ticker's child bars, drop unusable rows and write all outputs.

        Reads '/content/input/{ticker}_{child_period}_1.csv', removes rows
        whose open, high, low or close is zero or missing, runs the four
        processing steps and writes each result to its own folder (created on
        demand) relative to the current working directory. Progress is
        printed to stdout.

        The ticker is skipped, with a logged message, when a required price
        column is missing or when no usable rows remain after filtering.

        Args:
            ticker: Symbol used to build the input and output file names.

        Raises:
            Exception: Any error raised while reading, processing or writing
                is logged and then re-raised unchanged.
        """
        print(f"\nProcessing {ticker}:")
        input_path = f'/content/input/{ticker}_{self.config.child_period}_1.csv'

        try:
            df = pd.read_csv(input_path)
            df['date'] = pd.to_datetime(df['date'])
            # min()/max() of an empty column is NaT, which has no strftime.
            if len(df) > 0:
                print(f"Initial Date Range: {df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}")
            print(f"Initial Child Bars: {len(df)}")

            # Check that the required columns exist
            required_columns = ['open', 'high', 'low', 'close']
            if not all(col in df.columns for col in required_columns):
                logging.error(f"Missing required columns in {ticker}. Skipping processing.")
                return

            # Keep only rows where open, high, low and close are all present
            # and non-zero.
            prices = df[required_columns]
            df = df[(prices.notna() & (prices != 0)).all(axis=1)]

            # Test for "nothing left" before printing the filtered date range;
            # formatting the range of an empty frame raises and would make
            # this guard unreachable.
            if len(df) == 0:
                logging.warning(f"No valid data after filtering for {ticker}, skipping.")
                return

            print(f"Filtered Date Range: {df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}")
            print(f"Filtered Child Bars: {len(df)}")

            parent_df = self.create_parent_bars(df)
            child_prior_bar_df = self.process_child_prior_bar(df)
            gel_parent_df = self.process_gel_parent(df, parent_df)
            gel_prior_parent_df = self.process_gel_prior_parent(df, parent_df)

            child_code = self.config.child_period
            parent_code = self.config.parent_period

            # (folder, file name, table) in the order the files are written.
            outputs = [
                ('gel_child', f'{ticker}_child{child_code}.csv', child_prior_bar_df),
                ('gel_parent', f'{ticker}_parent{parent_code}.csv', parent_df),
                ('gel_childgel', f'{ticker}_childgel{child_code}.csv', gel_parent_df),
                ('gel_priorparent', f'{ticker}_gelparent{parent_code}.csv', gel_prior_parent_df),
            ]

            # One timestamp for the whole ticker, so create_date and
            # create_time always describe the same instant.
            created_at = datetime.now()

            for folder, filename, output_df in outputs:
                os.makedirs(folder, exist_ok=True)

                # Add common fields
                output_df['create_date'] = created_at.strftime('%Y-%m-%d')
                output_df['create_time'] = created_at.strftime('%H:%M:%S')
                output_df['jobname'] = 'GetSet_20150101'
                output_df['ticker'] = ticker
                output_df['child_period'] = child_code
                output_df['parent_period'] = parent_code

                output_df.to_csv(f'{folder}/{filename}', index=False)

            logging.info(f"Successfully processed {ticker}")

        except Exception as e:
            logging.error(f"Error processing {ticker}: {str(e)}")
            raise

def process_data(child_period='D', parent_period='M'):
    """Run the GEL processor over every ticker found in ``/content/input``.

    A ticker is the part of each ``.csv`` filename before the first
    underscore. Each distinct ticker is handed once to
    ``GelProcessor.process_ticker``.

    Args:
        child_period: Child bar period code stored on the ``ProcessorConfig``.
        parent_period: Parent bar period code stored on the ``ProcessorConfig``.
    """
    print(f"\nProcessing with {child_period} child bars and {parent_period} parent bars")
    config = ProcessorConfig(
        child_period=child_period,
        parent_period=parent_period
    )

    processor = GelProcessor(config)

    input_files = [f for f in os.listdir('/content/input') if f.endswith('.csv')]
    # Sorted so tickers are processed (and logged) in the same order on every
    # run; iterating a bare set of strings varies with hash randomization.
    tickers = sorted({f.split('_')[0] for f in input_files})

    for ticker in tickers:
        processor.process_ticker(ticker)

# Run the processor for this notebook cell.
# 'M' parent bars are month-end groups (see create_parent_bars); 'D' child
# bars are presumably daily -- TODO confirm. Both values equal process_data's
# defaults and are spelled out so the run configuration is visible here.
process_data(child_period='D', parent_period='M')


Processing with D child bars and M parent bars

Processing WTI:
Initial Date Range: 2009-12-31 to 2022-08-26
Initial Child Bars: 3292
Filtered Date Range: 2009-12-31 to 2022-08-26
Filtered Child Bars: 3200

Processing XLB:
Initial Date Range: 1998-12-22 to 2024-12-18
Initial Child Bars: 6540
Filtered Date Range: 1998-12-22 to 2024-12-18
Filtered Child Bars: 6540

Processing AAPL:
Initial Date Range: 2016-01-04 to 2019-12-30
Initial Child Bars: 1005
Filtered Date Range: 2016-01-04 to 2019-12-30
Filtered Child Bars: 1005

Processing MMM:
Initial Date Range: 2016-01-04 to 2019-12-30
Initial Child Bars: 1005
Filtered Date Range: 2016-01-04 to 2019-12-30
Filtered Child Bars: 1005

Processing AFL:
Initial Date Range: 2016-01-04 to 2019-12-30
Initial Child Bars: 1005
Filtered Date Range: 2016-01-04 to 2019-12-30
Filtered Child Bars: 1005


In [None]:
import pandas as pd
import numpy as np

def audit_csv_file(file_path, required_columns=None):
    """Audit a CSV file for zero or NaN values in the given columns.

    Prints a load summary, then one line per audited column stating how many
    zero and NaN values it contains. For each column with problems, every
    offending row is listed with its index and full contents.

    Args:
        file_path: Path of the CSV file, read with ``pd.read_csv``.
        required_columns: Column names to audit. Defaults to
            ``['open', 'high', 'low', 'close']``.

    Returns:
        None. All findings are printed. A missing file or any other error is
        reported with ``print`` rather than raised.
    """
    if required_columns is None:
        required_columns = ['open', 'high', 'low', 'close']

    try:
        df = pd.read_csv(file_path)
        print(f"File loaded successfully: {file_path}")
        print(f"Number of rows: {len(df)}")

        # Report every missing column in one pass so the caller does not have
        # to fix the file and re-run once per absent column.
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            for col in missing:
                print(f"Error: Column '{col}' not found.")
            return

        for col in required_columns:
            # Build each mask once; they feed both the counts and the row listing.
            is_zero = df[col] == 0
            is_nan = df[col].isna()
            zero_count = is_zero.sum()
            nan_count = is_nan.sum()

            if zero_count == 0 and nan_count == 0:
                print(f"Column '{col}' has no zero or NaN values.")
                continue

            print(f"Column: '{col}' has {zero_count} zero values and {nan_count} NaN values.")

            # Identify rows with zeros or NaNs
            print(f"  Rows with zero or NaN in '{col}':")
            for index, row in df[is_zero | is_nan].iterrows():
                print(f"    Index: {index}, Values: {row.to_dict()}")

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        # Interactive audit helper: report the problem instead of aborting the cell.
        print(f"An error occurred: {e}")

# Example usage: audit a single input file (replace with the actual path to
# the problematic file). file_path stays a notebook-level variable, so it
# remains visible to any cell run afterwards.
file_path = '/content/input/XLB_D_1.csv'
audit_csv_file(file_path)

File loaded successfully: /content/input/XLB_D_1.csv
Number of rows: 6540
Column 'open' has no zero or NaN values.
Column 'high' has no zero or NaN values.
Column 'low' has no zero or NaN values.
Column 'close' has no zero or NaN values.


In [None]:
import os
import pandas as pd
import pyarrow.parquet as pq

# Define folders
folders = ["/content/gel_child", "/content/gel_childgel", "/content/gel_parent"]

# Output Parquet file
output_parquet = "/content/parquet_child/all_stocks.parquet"

# Combine all data into a single DataFrame
all_data = []

for folder in folders:
    # Sorted so the row order of the combined file is reproducible;
    # os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(folder)):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder, file_name)
            stock_ticker = file_name.split('_')[0]  # Extract stock ticker
            df = pd.read_csv(file_path)
            df['Ticker'] = stock_ticker  # Add a Ticker column
            all_data.append(df)

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when no CSV files were found.
if not all_data:
    raise ValueError(f"No CSV files found in any of: {folders}")

# Combine all DataFrames
combined_df = pd.concat(all_data, ignore_index=True)

# Make sure the destination directory exists; to_parquet does not create it.
os.makedirs(os.path.dirname(output_parquet), exist_ok=True)

# Save as a single Parquet file with all stock data
combined_df.to_parquet(output_parquet, engine='pyarrow')

print(f"Data saved to {output_parquet}")


Data saved to /content/parquet_child/all_stocks.parquet
