<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/bpb_basic_20250105.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# Constants
# Directory scanned for input `.csv` files by load_and_validate_files().
INPUT_DIR = '/content/input'
# Directory that receives one exported CSV per ticker.
OUTPUT_CHILD_DIR = 'output_bpb'
# Directory that receives the combined BPB_Summary.csv.
OUTPUT_SUMMARY_DIR = 'output_bpb_summary'
# Job label written to the `jobname` column of every loaded row.
JOBNAME = 'BPB_20250104'

# Helper function to generate serial_id
def generate_serial_id(date, time, row_number):
    """Build a serial id from a date string, a time string and a row number.

    Dashes are removed from ``date``, colons are removed from ``time``, and
    ``row_number`` is left-padded with zeros to at least four characters.
    The three pieces are concatenated in that order.
    """
    date_part = date.replace('-', '')
    time_part = time.replace(':', '')
    sequence_part = str(row_number).zfill(4)
    return "{}{}{}".format(date_part, time_part, sequence_part)

# Load and validate input files
def load_and_validate_files():
    """Load every CSV in INPUT_DIR, drop invalid bars and tag rows with metadata.

    File names must have the form
    ``<ticker>_<temporal_period>_<rolling_range_dur>.csv`` where
    ``rolling_range_dur`` is an integer.

    A row is kept only when open/high/low/close are all present, open and
    close both lie within [low, high], and high - low is strictly positive.

    Returns:
        A single DataFrame with the surviving rows of all files, re-indexed
        from 0, carrying the added columns serial_id, row_number, ticker,
        rolling_range_dur, lookup_date, create_date, create_time, jobname and
        temporal_period.

    Raises:
        ValueError: if INPUT_DIR contains no ``.csv`` file, if a file name
            does not split into exactly three underscore-separated parts, or
            if its last part is not an integer.
    """
    # os.listdir returns entries in arbitrary order; sort so the combined
    # row order is the same on every run.
    input_files = sorted(f for f in os.listdir(INPUT_DIR) if f.endswith('.csv'))
    if not input_files:
        # Fail with a clear message instead of pd.concat's
        # "No objects to concatenate".
        raise ValueError(f"No .csv input files found in {INPUT_DIR!r}")

    all_data = []

    for file in input_files:
        parts = file.split('_')
        if len(parts) != 3:
            raise ValueError(
                f"Unexpected input file name {file!r}: expected "
                "'<ticker>_<temporal_period>_<rolling_range_dur>.csv'"
            )
        ticker, temporal_period, rolling_range_dur = parts
        rolling_range_dur = int(rolling_range_dur.replace('.csv', ''))

        df = pd.read_csv(os.path.join(INPUT_DIR, file))

        # Validation
        df = df.dropna(subset=['open', 'high', 'low', 'close'])
        is_valid = (
            (df['low'] <= df['open']) & (df['open'] <= df['high'])
            & (df['low'] <= df['close']) & (df['close'] <= df['high'])
            & ((df['high'] - df['low']) > 0)
        )
        # Explicit copy so the column assignments below act on an
        # independent frame rather than on a filtered selection.
        df = df[is_valid].copy()

        # Add reference fields. Row numbers are derived from the index the
        # row had when the file was read, so gaps mark dropped rows.
        row_numbers = df.index + 1
        df['serial_id'] = [
            generate_serial_id(date, '000000', number)
            for date, number in zip(df['date'], row_numbers)
        ]
        df['row_number'] = row_numbers
        df['ticker'] = ticker
        df['rolling_range_dur'] = rolling_range_dur
        df['lookup_date'] = pd.to_datetime(df['date']).dt.strftime('%Y/%m/%d')
        # Take one timestamp so create_date and create_time always agree,
        # even when the run crosses midnight.
        created = datetime.now()
        df['create_date'] = created.strftime('%Y-%m-%d')
        df['create_time'] = created.strftime('%H:%M:%S')
        df['jobname'] = JOBNAME
        df['temporal_period'] = temporal_period

        all_data.append(df)

    return pd.concat(all_data, ignore_index=True)

# Calculate fields
def calculate_fields(df):
    """Add prior-bar, range-extension and finishing-flag columns to ``df``.

    ``df`` must contain ``open``, ``high``, ``low`` and ``close`` columns.
    Prior-bar values (``ro``, ``rh``, ``rl``, ``rc``) are the previous row's
    prices. When any of the ``ticker``, ``temporal_period`` or
    ``rolling_range_dur`` columns are present, the previous row is taken
    within each combination of those columns, so the first bar of one series
    never inherits the last bar of another. Without those columns the
    previous row of the whole frame is used.

    The frame is modified in place and also returned.

    Returns:
        The same DataFrame with the calculated columns added.
    """
    price_cols = ['open', 'high', 'low', 'close']
    group_cols = [
        col for col in ('ticker', 'temporal_period', 'rolling_range_dur')
        if col in df.columns
    ]
    if group_cols:
        # Shift inside each series; the first row of every series gets NaN.
        prior = df.groupby(group_cols, sort=False)[price_cols].shift(1)
    else:
        prior = df[price_cols].shift(1)

    df['ro'] = prior['open']
    df['rh'] = prior['high']
    df['rl'] = prior['low']
    df['rc'] = prior['close']

    df['range'] = df['high'] - df['low']
    # Position of the prior close inside the prior bar's range (0 = low, 1 = high).
    df['prior_percent_r'] = (df['rc'] - df['rl']) / (df['rh'] - df['rl'])
    # Distance of the prior close from the nearer end of the prior range.
    df['ce_percent'] = np.where(df['prior_percent_r'] >= 0.5, 1 - df['prior_percent_r'], df['prior_percent_r'])
    df['epc_dir'] = np.where(df['prior_percent_r'] >= 0.5, "U", "D")
    df['epc_hp'] = df['ce_percent'] >= 0.25
    df['epc'] = (df['ce_percent'] / 0.1).round(0)
    # Rows without a prior bar have NaN here; they become 0.
    df['epc'] = pd.to_numeric(df['epc'], errors='coerce').fillna(0).astype(int)

    # Bar/Prior Bar Calculations
    df['reu_value'] = np.where(df['high'] > df['rh'], df['high'] - df['rh'], 0)
    df['red_value'] = np.where(df['low'] < df['rl'], df['rl'] - df['low'], 0)
    df['reu_flag'] = df['reu_value'] > 0
    df['red_flag'] = df['red_value'] > 0
    df['re_flag'] = df['reu_flag'] | df['red_flag']
    df['re_value'] = np.where(df['re_flag'], df['reu_value'] + df['red_value'], 0)
    df['twoway'] = df['reu_flag'] & df['red_flag']

    # fre_dir and rpc
    # Mixing string scalars with np.nan in np.where promotes the result to a
    # string array and turns the missing marker into the text 'nan'. An
    # object-dtype filler keeps a real NaN for bars with no extension.
    no_direction = np.full(len(df), np.nan, dtype=object)
    one_way_dir = np.where(
        df['reu_flag'], "U", np.where(df['red_flag'], "D", no_direction)
    )
    df['fre_dir'] = np.where(df['twoway'], df['epc_dir'], one_way_dir)
    df['rpc'] = np.where(df['twoway'], 2, 1)

    # EPC Follow-Through
    df['e1_value'] = np.where(df['epc_dir'] == "U", df['reu_value'], df['red_value'])
    df['e2_value'] = np.where(df['epc_dir'] == "U", df['red_value'], df['reu_value'])
    df['e1_flag'] = df['e1_value'] > 0
    df['e2_flag'] = df['e2_value'] > 0
    df['e1_fre_flag'] = df['fre_dir'] == df['epc_dir']

    # Finishing Flags
    # Both flags are NaN where they do not apply, so they are stored as
    # floats (1.0 / 0.0 / NaN) rather than booleans.
    df['cos_flag'] = np.where(df['re_flag'], (df['close'] > df['rh']) | (df['close'] < df['rl']), np.nan)
    df['mm_flag'] = np.where(
        df['re_flag'] & ~df['twoway'],
        np.where(df['reu_flag'], df['prior_percent_r'] < 0.5, df['prior_percent_r'] > 0.5),
        np.nan
    )

    return df

# Summarize and Export
def _date_bounds(dates):
    """Return the (earliest, latest) entries of ``dates`` in calendar order."""
    # Compare parsed dates: string comparison would rank '1/10/2017' before
    # '1/3/2017'. The original text of the winning entries is returned.
    parsed = pd.to_datetime(dates, errors='coerce').reset_index(drop=True)
    if parsed.notna().any():
        return dates.iloc[parsed.idxmin()], dates.iloc[parsed.idxmax()]
    # Nothing could be parsed as a date; fall back to plain ordering.
    return dates.min(), dates.max()


def summarize_and_export(df):
    """Print per-ticker statistics and write the per-ticker and summary CSVs.

    ``df`` must contain the ``ticker``, ``date``, ``temporal_period`` and
    ``re_flag`` columns. For every ticker the function prints the number of
    records, the earliest and latest date, the number of rows with
    ``re_flag`` set ("Valid Rows") and the remainder ("Rejected Rows"), then
    writes that ticker's rows to
    ``OUTPUT_CHILD_DIR/<ticker>_BPB_<temporal_period>.csv`` using the
    ticker's first ``temporal_period`` value. The same figures are written,
    indexed and sorted by ticker, to ``OUTPUT_SUMMARY_DIR/BPB_Summary.csv``,
    so the file and the printed report agree.

    Returns:
        None.
    """
    # Create the target folders here too, so the function also works when it
    # is called without main().
    os.makedirs(OUTPUT_CHILD_DIR, exist_ok=True)
    os.makedirs(OUTPUT_SUMMARY_DIR, exist_ok=True)

    summary_rows = []

    print("\nSummary Statistics by Ticker:")
    # sort=False keeps tickers in order of first appearance.
    for ticker, ticker_df in df.groupby('ticker', sort=False):
        start_date, end_date = _date_bounds(ticker_df['date'])
        valid_rows = int(ticker_df['re_flag'].astype(bool).sum())
        rejected_rows = len(ticker_df) - valid_rows

        print(f"\nTicker: {ticker}")
        print(f" - Records Processed: {len(ticker_df)}")
        print(f" - Start Date: {start_date}")
        print(f" - End Date: {end_date}")
        print(f" - Valid Rows: {valid_rows}")
        print(f" - Rejected Rows: {rejected_rows}")

        ticker_df.to_csv(os.path.join(OUTPUT_CHILD_DIR, f"{ticker}_BPB_{ticker_df['temporal_period'].iloc[0]}.csv"), index=False)

        summary_rows.append({
            'ticker': ticker,
            'valid_rows': valid_rows,
            'start_date': start_date,
            'end_date': end_date,
            'rejected_rows': rejected_rows,
        })

    summary = pd.DataFrame(
        summary_rows,
        columns=['ticker', 'valid_rows', 'start_date', 'end_date', 'rejected_rows'],
    ).set_index('ticker').sort_index()

    summary.to_csv(os.path.join(OUTPUT_SUMMARY_DIR, "BPB_Summary.csv"), index=True)

# Main Workflow
def main():
    """Run the whole pipeline: load the inputs, derive the fields, export."""
    # Both export folders must exist before anything is written.
    for out_dir in (OUTPUT_CHILD_DIR, OUTPUT_SUMMARY_DIR):
        os.makedirs(out_dir, exist_ok=True)

    loaded = load_and_validate_files()
    enriched = calculate_fields(loaded)
    summarize_and_export(enriched)
    print("Processing complete. Files exported.")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()



Summary Statistics by Ticker:

Ticker: AAPL
 - Records Processed: 1005
 - Start Date: 1/10/2017
 - End Date: 9/9/2019
 - Valid Rows: 879
 - Rejected Rows: 126

Ticker: MMM
 - Records Processed: 1005
 - Start Date: 1/10/2017
 - End Date: 9/9/2019
 - Valid Rows: 880
 - Rejected Rows: 125

Ticker: XLB
 - Records Processed: 6540
 - Start Date: 1/10/2000
 - End Date: 9/9/2024
 - Valid Rows: 5817
 - Rejected Rows: 723

Ticker: AFL
 - Records Processed: 1005
 - Start Date: 1/10/2017
 - End Date: 9/9/2019
 - Valid Rows: 874
 - Rejected Rows: 131
Processing complete. Files exported.
