<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/gel_set_test_20250105.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Job configuration
# Period codes: `parent_period` picks the parent lookup column ("M" ->
# lookup_month, otherwise lookup_week); both codes are also used as filename
# filters when collecting the input files.
child_period, parent_period = "D", "M"  # Defaults
jobname = "BPB_20250104"  # Stamped onto every exported row

# Where the input CSV files are read from
input_folder_parent = "/content/input_parent"
input_folder_child = "/content/input_child"

# Where the exported CSV files are written (created on demand)
output_folder_gel_parent = "output_gel_parent"
output_folder_gel_child = "output_gel_child"

# Utility functions
def validate_child_row(row):
    """Return True when a child OHLC row passes every sanity check.

    A row is valid when ``open`` and ``close`` both lie within
    ``[low, high]``, ``high`` is strictly greater than ``low``, and no entry
    of the row is NaN.

    Args:
        row: A pandas Series with at least ``open``, ``high``, ``low`` and
            ``close`` entries.

    Returns:
        bool: True if the row is valid, False otherwise.

    Raises:
        KeyError: If one of the four price entries is missing from ``row``.
    """
    # Explicit checks rather than `assert`: assert statements are removed when
    # Python runs with -O, which would make every row validate successfully.
    low, high = row['low'], row['high']
    if not (low <= row['open'] <= high):
        return False
    if not (low <= row['close'] <= high):
        return False
    if not ((high - low) > 0):
        return False
    # Any NaN anywhere in the row (not only in the price columns) rejects it.
    return not bool(pd.isna(row).any())

def validate_parent_row(row, parent_period):
    """Return True when a parent OHLC row passes every sanity check.

    A row is valid when ``open`` and ``close`` both lie within
    ``[low, high]``, ``high`` is strictly greater than ``low``, and the lookup
    key required by ``parent_period`` is present in the row.

    Args:
        row: A mapping-like row (e.g. a pandas Series) with at least ``open``,
            ``high``, ``low`` and ``close`` entries.
        parent_period: ``"M"`` requires a ``lookup_month`` entry, ``"W"``
            requires a ``lookup_week`` entry; any other value adds no
            lookup-key requirement.

    Returns:
        bool: True if the row is valid, False otherwise.

    Raises:
        KeyError: If one of the four price entries is missing from ``row``.
    """
    # Explicit checks rather than `assert`: assert statements are removed when
    # Python runs with -O, which would make every row validate successfully.
    low, high = row['low'], row['high']
    if not (low <= row['open'] <= high):
        return False
    if not (low <= row['close'] <= high):
        return False
    if not ((high - low) > 0):
        return False

    # Only the presence of the lookup key is checked, not its value.
    if parent_period == "M":
        return 'lookup_month' in row
    if parent_period == "W":
        return 'lookup_week' in row
    return True

def fetch_files(folder, file_type):
    """Return the CSV files in ``folder`` whose name contains ``file_type``.

    Args:
        folder: Directory to scan (not recursive).
        file_type: Substring that must occur somewhere in the file name.

    Returns:
        list[str]: Matching paths (``folder`` joined with the file name),
        sorted so that processing order is reproducible between runs;
        ``os.listdir`` itself returns entries in arbitrary order.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
    """
    # NOTE(review): `file_type in name` is a plain substring test, so a short
    # code such as "D" also matches any file whose name merely contains that
    # letter. Tighten once the file-naming convention is confirmed.
    return sorted(
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if name.endswith('.csv') and file_type in name
    )

# Processing functions
def process_parent_file(file_path):
    """Load one parent CSV, drop invalid rows and add reference columns.

    The ticker is taken from the first underscore-separated token of the file
    name. Rows are kept only if ``validate_parent_row`` accepts them for the
    module-level ``parent_period``.

    Args:
        file_path: Path to a parent CSV file whose name contains at least one
            underscore.

    Returns:
        pandas.DataFrame: The valid rows plus the columns ``serial_id``,
        ``row_number``, ``ticker``, ``parent_lookup_date``, ``trading_bop``,
        ``create_date``, ``create_time`` and ``jobname``.

    Raises:
        ValueError: If the file name has no underscore to split on.
        KeyError: If the file lacks the price columns, the lookup column for
            ``parent_period``, or ``dur``.
    """
    file_name = os.path.basename(file_path)
    # The second token (the period code) is not used here; unpacking two
    # values still rejects file names that contain no underscore.
    ticker, _period = file_name.split('_')[:2]
    df = pd.read_csv(file_path)

    # Validate rows. The mask is built explicitly so that it is always a
    # boolean Series, including for a file that contains no data rows.
    valid = pd.Series(
        [validate_parent_row(row, parent_period) for _, row in df.iterrows()],
        index=df.index,
        dtype=bool,
    )
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the frame that was read from disk.
    df = df.loc[valid].copy()

    # One timestamp for the whole file keeps serial_id, create_date and
    # create_time consistent with each other.
    now = datetime.now()
    stamp = now.strftime('%Y%m%d%H%M%S')

    # Add reference fields
    df['serial_id'] = [f"{stamp}{i}" for i in range(len(df))]
    df['row_number'] = range(1, len(df) + 1)
    df['ticker'] = ticker
    df['parent_lookup_date'] = df['lookup_month'] if parent_period == "M" else df['lookup_week']
    df['trading_bop'] = df['dur']  # Assuming duration corresponds to trading_bop for parent
    df['create_date'] = now.date()
    df['create_time'] = now.time()
    df['jobname'] = jobname

    return df

def process_child_file(file_path, parent_df):
    """Load one child CSV, join it to the parent rows and derive gel_* fields.

    Steps: read the CSV, normalise column names, rename the OHLC columns to
    ``*_fixed``, drop rows with inconsistent OHLC values, inner-join to
    ``parent_df`` on the lookup column selected by the module-level
    ``parent_period``, create any expected column that is still missing, and
    compute the derived ``gel_*`` columns and a per-parent summary.

    Args:
        file_path: Path to a child CSV file whose name contains at least one
            underscore.
        parent_df: Parent rows as returned by ``process_parent_file``; must
            contain ``parent_lookup_date``.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: The merged child rows with
        derived columns, and one summary row per ``parent_lookup_date``.

    Raises:
        ValueError: If the file name has no underscore to split on.
        KeyError: If the file lacks the OHLC columns or the lookup column.
    """
    file_name = os.path.basename(file_path)
    # Neither token is used below; unpacking two values still rejects file
    # names that contain no underscore.
    _ticker, _period = file_name.split('_')[:2]

    # Read file
    df = pd.read_csv(file_path)

    # Debug column names
    print("Columns in the DataFrame before normalization:")
    for col in df.columns:
        print(f"'{col}' (length: {len(col)})")

    # Normalize column names
    df.columns = df.columns.str.strip().str.lower()

    # Rename columns to avoid conflicts with the parent's OHLC columns
    df.rename(columns={
        'open': 'open_fixed',
        'high': 'high_fixed',
        'low': 'low_fixed',
        'close': 'close_fixed'
    }, inplace=True)

    # Verify renamed columns
    print("Columns after renaming:", df.columns.tolist())

    # Validate rows: open/close within [low, high] and high strictly above low
    df = df[
        (df['low_fixed'] <= df['open_fixed']) &
        (df['open_fixed'] <= df['high_fixed']) &
        (df['low_fixed'] <= df['close_fixed']) &
        (df['close_fixed'] <= df['high_fixed']) &
        ((df['high_fixed'] - df['low_fixed']) > 0)
    ]

    # Align with parent file
    # NOTE(review): the join key is the lookup date only. If parent_df holds
    # rows for several tickers, each child row is matched to every ticker's
    # parent row for that date. Confirm whether a ticker filter is needed.
    parent_lookup_field = 'lookup_month' if parent_period == "M" else 'lookup_week'
    df = df.merge(parent_df, left_on=parent_lookup_field, right_on='parent_lookup_date', suffixes=('_child', '_parent'))

    # Debug columns after merge
    print("Columns after merge:", df.columns.tolist())

    # Define all expected fields
    expected_fields = [
        'date', 'open_fixed', 'high_fixed', 'low_fixed', 'close_fixed', 'month', 'year', 'week', 'weekday',
        'lookup_month_child', 'lookup_week', 'dur', 'open', 'high', 'low', 'close', 'serial_id',
        'row_number', 'ticker', 'parent_lookup_date', 'trading_bop', 'create_date', 'create_time',
        'jobname', 'gel_reu_value', 'gel_red_value', 'gel_reu_flag', 'gel_red_flag', 'gel_re_flag',
        'gel_twoway', 'gel_new_rpc', 'gel_total_rpc', 'gel_dir_state', 'gel_eob_dir', 'gel_fre_dir',
        'gel_epc_dir'
    ]

    # Initialize missing fields: flags start as False, everything else as NaN.
    # After this loop every expected field exists, so no further
    # missing-field check is required.
    for field in expected_fields:
        if field not in df.columns:
            df[field] = False if field.endswith('_flag') else np.nan

    print("Columns after initialization:", df.columns.tolist())

    # Compute derived fields: how far the high rose above / the low fell below
    # the previous row's value (0 when it did not).
    # NOTE(review): shift(1) runs over the whole merged frame, so the first
    # row of each parent group is compared with the last row of the previous
    # group. Confirm this is intended.
    df['gel_reu_value'] = np.where(
        df['high_fixed'] > df['high_fixed'].shift(1),
        df['high_fixed'] - df['high_fixed'].shift(1),
        0
    )
    df['gel_red_value'] = np.where(
        df['low_fixed'] < df['low_fixed'].shift(1),
        df['low_fixed'].shift(1) - df['low_fixed'],
        0
    )
    df['gel_reu_flag'] = df['gel_reu_value'] > 0
    df['gel_red_flag'] = df['gel_red_value'] > 0
    df['gel_re_flag'] = df['gel_reu_flag'] | df['gel_red_flag']

    # Compute gel_eob_dir
    # NOTE(review): gel_twoway, gel_epc_dir and gel_fre_dir are never computed
    # in this function. Unless the inputs supply them they are the NaN
    # placeholders created above, and np.where treats NaN as true, so the
    # "two-way" branch is taken for every row. Needs the real definitions.
    df['gel_eob_dir'] = np.where(
        df['gel_twoway'],
        np.where(df['gel_epc_dir'] == "U", "D", "U"),
        df['gel_fre_dir']
    )

    # Compute gel_dir_state
    # NOTE(review): for rows where trading_bop != 1 this forward-fills the
    # existing gel_dir_state column, which is a NaN placeholder unless the
    # inputs supply it.
    df['gel_dir_state'] = np.where(
        df['trading_bop'] == 1,
        df['gel_eob_dir'],  # Reset state for the first bar in a parent
        df['gel_dir_state'].ffill()  # Forward fill for subsequent bars
    )

    # Compute gel_new_rpc and gel_total_rpc (running total within each parent)
    df['gel_new_rpc'] = np.where(
        df['trading_bop'] == 1,
        np.where(df['gel_re_flag'], np.where(df['gel_twoway'], 2, 1), 0),
        np.where(df['gel_twoway'], np.where(df['gel_dir_state'] == df['gel_epc_dir'], 2, 1), 0)
    )
    df['gel_total_rpc'] = df.groupby('parent_lookup_date')['gel_new_rpc'].cumsum()

    print("Sample gel_total_rpc values:", df['gel_total_rpc'].head())

    # Summarize parent-child data
    # NOTE(review): reu_count / red_count sum the gel_*_value magnitudes, not
    # the gel_*_flag columns, so they are totals rather than counts. Confirm.
    summary = df.groupby('parent_lookup_date').agg(
        start_date=('date', 'min'),
        end_date=('date', 'max'),
        child_count=('row_number', 'count'),
        reu_count=('gel_reu_value', 'sum'),
        red_count=('gel_red_value', 'sum'),
        total_rpc=('gel_total_rpc', 'max')
    ).reset_index()
    # One timestamp so create_date and create_time describe the same instant.
    now = datetime.now()
    summary['create_date'] = now.date()
    summary['create_time'] = now.time()
    summary['jobname'] = jobname

    return df, summary

# Main function
def main():
    """Run the full pipeline: load parents, process children, export CSVs.

    Reads every matching parent file, processes every matching child file
    against the combined parent data, and writes the combined child rows and
    parent summaries to the configured output folders.

    Raises:
        ValueError: If no parent files or no child files match the configured
            period in their input folder.
        FileNotFoundError: If an input folder does not exist.
    """
    # Process parent files
    parent_files = fetch_files(input_folder_parent, parent_period)
    if not parent_files:
        # pd.concat would raise an uninformative "No objects to concatenate".
        raise ValueError(
            f"No parent CSV files matching {parent_period!r} found in {input_folder_parent!r}"
        )
    all_parent_data = pd.concat([process_parent_file(f) for f in parent_files], ignore_index=True)

    # Process child files
    child_files = fetch_files(input_folder_child, child_period)
    if not child_files:
        raise ValueError(
            f"No child CSV files matching {child_period!r} found in {input_folder_child!r}"
        )
    all_child_data = []
    all_summaries = []
    for f in child_files:
        child_data, summary = process_child_file(f, all_parent_data)
        all_child_data.append(child_data)
        all_summaries.append(summary)

    all_child_data = pd.concat(all_child_data, ignore_index=True)
    all_summaries = pd.concat(all_summaries, ignore_index=True)

    # Export files
    os.makedirs(output_folder_gel_child, exist_ok=True)
    os.makedirs(output_folder_gel_parent, exist_ok=True)

    all_child_data.to_csv(os.path.join(output_folder_gel_child, f"all_child_data_{child_period}.csv"), index=False)
    all_summaries.to_csv(os.path.join(output_folder_gel_parent, f"all_parent_summary_{parent_period}.csv"), index=False)

    print("Processing complete. Child and Parent data have been exported.")

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Columns in the DataFrame before normalization:
'date' (length: 4)
'open' (length: 4)
'high' (length: 4)
'low' (length: 3)
'close' (length: 5)
'month' (length: 5)
'year' (length: 4)
'week' (length: 4)
'weekday' (length: 7)
'lookup_month' (length: 12)
'month_sequence' (length: 14)
'week_sequence' (length: 13)
'lookup_week' (length: 11)
Columns after renaming: ['date', 'open_fixed', 'high_fixed', 'low_fixed', 'close_fixed', 'month', 'year', 'week', 'weekday', 'lookup_month', 'month_sequence', 'week_sequence', 'lookup_week']
Columns after merge: ['date', 'open_fixed', 'high_fixed', 'low_fixed', 'close_fixed', 'month', 'year', 'week', 'weekday', 'lookup_month_child', 'month_sequence', 'week_sequence', 'lookup_week', 'lookup_month_parent', 'dur', 'open', 'high', 'low', 'close', 'serial_id', 'row_number', 'ticker', 'parent_lookup_date', 'trading_bop', 'create_date', 'create_time', 'jobname']
Columns after initialization: ['date', 'open_fixed', 'high_fixed', 'low_fixed', 'close_fixed', 'month'