<a href="https://colab.research.google.com/github/BaronVonBussin/NewTransit/blob/main/voodoo_romulus_preprocessing_20250222.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess data for Romulus gel sets

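1.   Upload input files to `/content/daily`, one `<TICKER>_daily.csv` per ticker, each with `Date`, `Open`, `High`, `Low` and `Close` columns.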

In [7]:
### Stopgap program that preloads the prior parent period's fields to work around an issue in Romulus.
### Only the W (weekly) and ME (month-end) parent periods are supported.
### The export does not label which parent period was used -- an easy fix, but it requires changes elsewhere, so it is deferred.
### The second cell exports the results as a zip.

import pandas as pd
import os
from datetime import datetime

# Configuration
class Config:
    """Run settings for the preprocessing pass.

    Attributes:
        child_period: Period of the input rows; always 'D'.
        parent_period: Period the daily rows are grouped into, 'W' or 'ME'.
    """

    # Parent periods the rest of the pipeline knows how to label.
    SUPPORTED_PARENT_PERIODS = ('W', 'ME')

    def __init__(self, parent_period='ME'):
        """Build a configuration.

        Args:
            parent_period: 'W' or 'ME'. Defaults to 'ME', the value that was
                previously hard-coded.

        Raises:
            ValueError: If ``parent_period`` is not a supported value.
        """
        # Validate before assigning so a rejected value never ends up on the instance.
        if parent_period not in self.SUPPORTED_PARENT_PERIODS:
            raise ValueError("Parent period must be W or ME")
        self.child_period = 'D'  # Fixed for now
        self.parent_period = parent_period

# Create the output directory up front; exist_ok=True means no error is
# raised when the directory already exists (e.g. when this cell is re-run).
os.makedirs('/content/gelset_prep', exist_ok=True)

def load_daily_data(ticker, data_dir='/content/daily'):
    """Load one ticker's daily bars, oldest row first.

    Reads ``{data_dir}/{ticker}_daily.csv``, which must contain the columns
    ``Date``, ``Open``, ``High``, ``Low`` and ``Close``.

    Args:
        ticker: Symbol used to build the file name.
        data_dir: Directory holding the input CSVs. Defaults to the Colab
            upload location.

    Returns:
        A new DataFrame with columns ``date`` (parsed from ``Date``),
        ``Open``, ``High``, ``Low``, ``Close``, sorted by ``date`` ascending
        with a fresh 0..n-1 index.
    """
    file_path = f'{data_dir}/{ticker}_daily.csv'
    raw = pd.read_csv(file_path)
    raw['date'] = pd.to_datetime(raw['Date'])
    bars = raw[['date', 'Open', 'High', 'Low', 'Close']]
    # Order-dependent steps (per-period row numbering, "last close of a
    # period") are only meaningful on chronologically ordered rows, and some
    # data exports are newest-first. A stable sort keeps file order for any
    # duplicate dates. sort_values/reset_index also return a fresh frame, so
    # callers can add columns without touching a slice of `raw`.
    return bars.sort_values('date', kind='stable').reset_index(drop=True)

def create_parent_labels(df, parent_period):
    """Add a ``parent`` column naming the period each row belongs to.

    Args:
        df: DataFrame with a datetime ``date`` column. It is modified in
            place (a ``parent`` column is added) and also returned.
        parent_period: ``'ME'`` for monthly labels; any other value is
            treated as weekly.

    Returns:
        The same DataFrame, with ``parent`` set to ``YYYY/MM`` (monthly) or
        ``YYYY/WW`` (weekly), where ``WW`` is the ``%U`` (Sunday-based) week
        number of the Sunday that starts the row's week.
    """
    if parent_period == 'ME':
        # Monthly format YYYY/MM
        df['parent'] = df['date'].dt.strftime('%Y/%m')
    else:  # Weekly
        # Label every row by the Sunday that opens its week. Formatting the
        # row's own date with %Y/%U would split a week that straddles New
        # Year into "YYYY/52" and "YYYY+1/00"; anchoring on the week start
        # keeps those days under one label. Labels for all other weeks are
        # unchanged.
        days_past_sunday = (df['date'].dt.weekday + 1) % 7  # Mon=0 .. Sun=6 -> Sun=0
        week_start = df['date'] - pd.to_timedelta(days_past_sunday, unit='D')
        df['parent'] = week_start.dt.strftime('%Y/%U')
    return df

def add_sequence(df):
    """Number the rows inside each parent period, counting from 1.

    Writes a ``sequence`` column onto ``df`` in place and returns ``df``.
    """
    zero_based_position = df.groupby('parent').cumcount()
    df['sequence'] = zero_based_position + 1
    return df

def calculate_parent_values(df):
    """Summarise each parent period's high, low and close.

    Returns one row per ``parent`` label with:
      * ``pph`` - maximum ``High`` in the period
      * ``ppl`` - minimum ``Low`` in the period
      * ``ppc`` - last ``Close`` in the period (in row order)
      * ``next_parent`` - label of the following row's period, missing on
        the final row
    """
    per_parent = (
        df.groupby('parent')
        .agg(
            pph=('High', 'max'),
            ppl=('Low', 'min'),
            ppc=('Close', 'last'),
        )
        .reset_index()
    )
    # Pair every summary with the label of the period that comes after it.
    return per_parent.assign(next_parent=lambda table: table['parent'].shift(-1))

def process_ticker(ticker, config):
    """Build and save the prepared CSV for one ticker.

    Loads the ticker's daily rows, labels them with a parent period and a
    within-period sequence number, attaches the pph/ppl/ppc values whose
    ``next_parent`` equals the row's ``parent``, drops rows with any missing
    value, and writes the result to ``/content/gelset_prep/{ticker}_D.csv``.

    Any exception is caught and reported with a printed message rather than
    raised.
    """
    try:
        # Daily rows -> parent label -> sequence number within the parent.
        labelled = add_sequence(
            create_parent_labels(load_daily_data(ticker), config.parent_period)
        )

        # Per-parent summary, keyed by the period that follows each summary.
        prior_values = calculate_parent_values(labelled)[
            ['next_parent', 'pph', 'ppl', 'ppc']
        ]
        merged = labelled.merge(
            prior_values, how='left', left_on='parent', right_on='next_parent'
        )

        # Keep the export columns, with lowercase price names.
        export_columns = [
            'date', 'Open', 'High', 'Low', 'Close',
            'parent', 'sequence', 'pph', 'ppl', 'ppc'
        ]
        lowercase_names = {
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close'
        }
        # Rows with any missing value are removed before export.
        prepared = merged[export_columns].rename(columns=lowercase_names).dropna()

        output_path = f'/content/gelset_prep/{ticker}_D.csv'
        prepared.to_csv(output_path, index=False)
        print(f"Processed {ticker} successfully")

    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")

def main():
    """Process every ``*_daily.csv`` file found in ``/content/daily``.

    The ticker is the file name with the ``_daily.csv`` suffix removed, so
    symbols that themselves contain an underscore are kept intact. Files are
    processed in sorted order so the run log is deterministic.
    """
    # Initialize configuration
    config = Config()

    # Strip the suffix instead of splitting on '_': splitting would cut a
    # name such as "BRK_B_daily.csv" down to "BRK" and the load would fail.
    suffix = '_daily.csv'
    tickers = sorted(
        name[:-len(suffix)]
        for name in os.listdir('/content/daily')
        if name.endswith(suffix)
    )

    # Process each ticker
    for ticker in tickers:
        process_ticker(ticker, config)

    print("\nProcessing complete!")

# Run the pipeline only when this code executes as the main module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Processed XLY successfully
Processed XLB successfully
Processed VOO successfully
Processed XLU successfully
Processed VTI successfully
Processed XLV successfully
Processed XLF successfully
Processed UVXY successfully
Processed XLP successfully
Processed XLI successfully
Processed XLE successfully
Processed USO successfully
Processed XLK successfully

Processing complete!


In [5]:
from google.colab import files
import os
import zipfile

def zip_and_download(source_dir, period):
    """Zip every regular file in ``source_dir`` and trigger a Colab download.

    Args:
        source_dir: Directory whose files (non-recursive) are archived.
        period: Currently unused; kept so existing calls keep working.

    The archive is written to ``gelset_prep_files.zip`` in the current
    working directory, with each file stored under its bare file name.
    """
    # List the directory once so the reported count and the archive contents
    # cannot disagree; sort for a reproducible member order.
    file_names = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, name))
    )
    print(f"Found {len(file_names)} gelset prep files")

    # Create zip file. ZIP_DEFLATED compresses the members; the default
    # (ZIP_STORED) would only store them uncompressed.
    zip_filename = 'gelset_prep_files.zip'
    with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for name in file_names:
            zipf.write(os.path.join(source_dir, name), arcname=name)

    # Download zip file
    files.download(zip_filename)
    print(f"Downloaded {zip_filename}")

# Create and download zip
# NOTE(review): the second argument fills the `period` parameter, which
# zip_and_download does not read -- confirm whether it was meant to be used.
print("\nCreating and downloading zip file...\n")
zip_and_download('/content/gelset_prep', 'gelset_prep')


Creating and downloading zip file...

Found 13 gelset prep files


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded gelset_prep_files.zip
