In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Cut-off applied to the `range_strength` score: add_mean_reversion_indicators
# flags a bar as a range market when its score is strictly above this value.
# RANGE_THRESHOLD = 0.6
RANGE_THRESHOLD = 0.3

def load_data_from_csv(daily_file, hourly_file, minute_file):
    """Read the three timeframe CSVs into DataFrames indexed by their 'Time' column.

    Args:
        daily_file: Path of the daily-bar CSV.
        hourly_file: Path of the hourly-bar CSV.
        minute_file: Path of the minute-bar CSV.

    Returns:
        dict with keys "daily", "hourly" and "minute" (in that order), each
        mapping to a DataFrame whose index is the parsed 'Time' column.
    """
    sources = {
        "daily": daily_file,
        "hourly": hourly_file,
        "minute": minute_file,
    }

    # Stage 1: read every file before touching any of them.
    frames = {name: pd.read_csv(path) for name, path in sources.items()}

    # Stage 2: parse the 'Time' column of each frame into datetimes.
    for frame in frames.values():
        frame['Time'] = pd.to_datetime(frame['Time'])

    # Stage 3: promote 'Time' to the index.
    return {name: frame.set_index('Time') for name, frame in frames.items()}

def preprocess_timeframe(df, timeframe):
    """Run the full indicator pipeline for one timeframe.

    Applies add_common_indicators first, then feeds its result to
    add_mean_reversion_indicators together with the timeframe name.

    Args:
        df: Price DataFrame for a single timeframe.
        timeframe: Timeframe label forwarded to add_mean_reversion_indicators.

    Returns:
        The DataFrame returned by add_mean_reversion_indicators.
    """
    with_common = add_common_indicators(df)
    return add_mean_reversion_indicators(with_common, timeframe)

def add_common_indicators(df):
    """Add moving averages, Bollinger Bands, RSI and ATR columns.

    Rows containing NaN are dropped first. Window lengths shrink for short
    inputs (they are derived from the row count), so the column names
    'sma20'/'sma50' describe the *maximum* window, not necessarily the one used.

    Args:
        df: DataFrame with at least 'High', 'Low' and 'Close' columns.

    Returns:
        A new DataFrame (the input is not modified) with the added columns
        'sma20', 'sma50', 'bollinger_mid', 'bollinger_std', 'bollinger_upper',
        'bollinger_lower', 'rsi' and 'atr'. Remaining NaNs are forward- then
        back-filled.
    """
    # Explicit copy: we are about to add columns, and assigning into the
    # object returned by dropna() can trigger SettingWithCopyWarning.
    df = df.dropna().copy()
    data_length = len(df)
    sma_short = min(20, max(5, data_length // 4))
    sma_medium = min(50, max(10, data_length // 2))

    # Moving averages
    df['sma20'] = df['Close'].rolling(window=sma_short).mean()
    df['sma50'] = df['Close'].rolling(window=sma_medium).mean()

    # Bollinger Bands (mid +/- 2 standard deviations over the short window)
    df['bollinger_mid'] = df['sma20']
    df['bollinger_std'] = df['Close'].rolling(window=sma_short).std()
    df['bollinger_upper'] = df['bollinger_mid'] + 2 * df['bollinger_std']
    df['bollinger_lower'] = df['bollinger_mid'] - 2 * df['bollinger_std']

    # RSI (simple-moving-average variant)
    rsi_window = min(14, max(5, data_length // 3))
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=rsi_window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=rsi_window).mean()
    rs = gain / loss
    # A window with gains but no losses gives gain / 0 = +inf (or -inf when
    # the zero is -0.0). Keep it as +inf so the RSI formula yields 100 instead
    # of collapsing to the neutral 50.
    rs = rs.replace(-np.inf, np.inf)
    # 0 / 0 (completely flat window) and warm-up rows stay neutral: RS = 1 -> RSI 50.
    rs = rs.fillna(1)
    df['rsi'] = 100 - (100 / (1 + rs))

    # ATR (Average True Range)
    atr_window = min(14, max(5, data_length // 3))
    prev_close = df['Close'].shift()
    tr1 = abs(df['High'] - df['Low'])
    tr2 = abs(df['High'] - prev_close)
    tr3 = abs(df['Low'] - prev_close)
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    df['atr'] = tr.rolling(window=atr_window).mean()

    # fillna(method=...) is deprecated; ffill()/bfill() are the replacements.
    # NOTE(review): the back-fill copies the first valid indicator value into
    # the warm-up rows, i.e. those rows see information from later bars.
    df = df.ffill().bfill()
    return df

def detect_range_boundaries(df, lookback):
    """Return the rolling highest high and lowest low over `lookback` bars.

    Args:
        df: DataFrame with 'High' and 'Low' columns.
        lookback: Rolling window length in rows.

    Returns:
        Tuple (upper_boundary, lower_boundary) of Series aligned with `df`.
    """
    rolling_highs = df['High'].rolling(window=lookback)
    rolling_lows = df['Low'].rolling(window=lookback)
    return rolling_highs.max(), rolling_lows.min()

def add_mean_reversion_indicators(df, timeframe):
    """Add range / mean-reversion columns to `df` (the frame is modified in place).

    Columns written, in order: 'mean', 'upper_range', 'lower_range',
    'distance_from_mean', 'distance_from_upper', 'distance_from_lower',
    'range_strength', 'mean_reversion_probability', 'is_range_market'.

    Args:
        df: DataFrame holding at least 'High', 'Low', 'Close' plus the columns
            calculate_reversion_probability reads ('rsi', Bollinger columns).
        timeframe: "daily", "hourly", or anything else (treated as minute).

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # (cap, floor, divisor) used to derive the lookback from the row count.
    lookback_rules = {
        "daily": (20, 3, 2),
        "hourly": (48, 4, 3),
    }
    cap, floor, divisor = lookback_rules.get(timeframe, (60, 5, 4))  # default: minute
    lookback = min(cap, max(floor, len(df) // divisor))

    close = df['Close']

    # Adaptive mean and rolling range boundaries
    df['mean'] = close.rolling(window=lookback).mean()
    upper, lower = detect_range_boundaries(df, lookback)
    df['upper_range'] = upper
    df['lower_range'] = lower

    # Percentage distance from the mean
    df['distance_from_mean'] = (close - df['mean']) / df['mean'] * 100

    # Distance from each boundary as a percentage of the range width;
    # rows without a positive width (warm-up or flat range) get 50.
    width = upper - lower
    has_width = width > 0
    df['distance_from_upper'] = np.where(has_width, (upper - close) / width * 100, 50)
    df['distance_from_lower'] = np.where(has_width, (close - lower) / width * 100, 50)

    # Derived scores and the range-market flag
    df['range_strength'] = calculate_range_strength(df, lookback)
    df['mean_reversion_probability'] = calculate_reversion_probability(df)
    df['is_range_market'] = df['range_strength'] > RANGE_THRESHOLD

    return df

def calculate_range_strength(df, lookback):
    """Score in [0, 1] describing how range-bound the recent price action is.

    The score is a weighted blend of three normalised components:
      * 0.4 - how often the close-to-close direction flips within the window,
      * 0.3 - how narrow the range is relative to the mean (0.2 or wider -> 0),
      * 0.3 - how much of the range the closes cover (rolling std / range height).

    Args:
        df: DataFrame with 'Close', 'upper_range', 'lower_range' and 'mean'.
        lookback: Rolling window length in rows.

    Returns:
        Series of scores aligned with `df` (NaN while the window is warming up
        or when the range has zero height).
    """
    price_direction = df['Close'].diff()
    direction_change = (
        (price_direction > 0) != (price_direction.shift() > 0)
    ).rolling(window=lookback).sum()

    range_height = df['upper_range'] - df['lower_range']  # price units
    range_width = range_height / df['mean']               # fraction of the mean

    # Both numerator and denominator are in price units, so the ratio does not
    # depend on the instrument's price level. Dividing by the mean-relative
    # width instead inflates the ratio by the price level and saturates at 1.
    traversal = df['Close'].rolling(window=lookback).std() / range_height

    # At least 1 so a one-bar lookback cannot divide by zero.
    max_direction_changes = max(lookback - 1, 1)
    norm_direction_change = (direction_change / max_direction_changes).clip(0, 1)
    norm_range_width = (1 - range_width.clip(0, 0.2) / 0.2).clip(0, 1)
    norm_traversal = traversal.clip(0, 1)

    range_strength = (
        norm_direction_change * 0.4 +  # Frequency of oscillation
        norm_range_width * 0.3 +       # Narrowness of range
        norm_traversal * 0.3           # Coverage of range
    ).clip(0, 1)
    return range_strength

def calculate_reversion_probability(df):
    """Blend four heuristics into a mean-reversion score clipped to [0, 1].

    Weights: distance from mean 0.3, RSI extremity 0.2, Bollinger-band
    overshoot 0.2, range strength 0.3.

    Args:
        df: DataFrame with 'distance_from_mean', 'rsi', 'Close',
            'bollinger_upper', 'bollinger_lower', 'bollinger_std' and
            'range_strength' columns.

    Returns:
        Series of scores aligned with `df`.
    """
    # Stretch from the mean: |distance| of 50% or more maps to 1.
    stretch = (df['distance_from_mean'] / 100).abs().clip(0, 0.5) * 2

    # RSI extremity: grows linearly below 30 and above 70, zero in between.
    rsi = df['rsi'].fillna(50)
    rsi_score = np.select(
        [rsi < 30, rsi > 70],
        [(30 - rsi) / 30, (rsi - 70) / 30],
        default=0,
    )

    # Band overshoot, measured in standard deviations beyond either band.
    band_std = df['bollinger_std']
    above_upper = (df['Close'] - df['bollinger_upper']) / band_std
    below_lower = (df['bollinger_lower'] - df['Close']) / band_std
    band_score = np.select(
        [above_upper > 0, below_lower > 0],
        [above_upper, below_lower],
        default=0,
    ).clip(0, 1)

    # Range strength is used as-is.
    range_score = df['range_strength']

    weighted = (
        stretch * 0.3 +
        rsi_score * 0.2 +
        band_score * 0.2 +
        range_score * 0.3
    )
    return weighted.clip(0, 1)

def temporal_split(data, split_date):
    """Split every timeframe into rows before and rows at/after `split_date`.

    A timezone-naive `split_date` is interpreted as UTC when compared with a
    timezone-aware index, and used as-is against a naive index. A
    timezone-aware `split_date` is converted to naive UTC for naive indexes.

    Args:
        data: dict mapping timeframe name to a DataFrame with a datetime index.
        split_date: Anything pd.to_datetime accepts (string, datetime,
            Timestamp), with or without timezone.

    Returns:
        Tuple (train_data, test_data) of dicts with the same keys as `data`;
        each value is an independent copy of the corresponding slice.
    """
    split_ts = pd.to_datetime(split_date)
    if split_ts.tzinfo is None:
        split_naive = split_ts
        split_aware = split_ts.tz_localize('UTC')
    else:
        # tz_localize would raise on an already-aware timestamp.
        split_aware = split_ts
        split_naive = split_ts.tz_convert('UTC').tz_localize(None)

    train_data = {}
    test_data = {}

    for timeframe, df in data.items():
        # Comparing naive with aware datetimes raises, so pick the cutoff
        # that matches this frame's index.
        index_is_aware = getattr(df.index, 'tz', None) is not None
        cutoff = split_aware if index_is_aware else split_naive

        train_data[timeframe] = df[df.index < cutoff].copy()
        test_data[timeframe] = df[df.index >= cutoff].copy()

    return train_data, test_data

def get_data_at_timestamp(multi_timeframe_data, timestamp):
    """Pick, for every timeframe, the last row whose index is <= `timestamp`.

    Args:
        multi_timeframe_data: dict mapping timeframe name to a DataFrame.
        timestamp: Value comparable with each DataFrame's index.

    Returns:
        dict with the same keys; each value is the selected row (a Series) or
        None when the timeframe has no row at or before `timestamp`.
    """
    snapshot = {}
    for timeframe, frame in multi_timeframe_data.items():
        history = frame.loc[frame.index <= timestamp]
        snapshot[timeframe] = None if history.empty else history.iloc[-1]
    return snapshot

def prepare_multi_timeframe_data(daily_file, hourly_file, minute_file, split_date="2022-01-01"):
    """Load, enrich and split the three timeframes, printing progress along the way.

    Args:
        daily_file: Path of the daily-bar CSV.
        hourly_file: Path of the hourly-bar CSV.
        minute_file: Path of the minute-bar CSV.
        split_date: Boundary handed to temporal_split; rows before it go to
            the training set, the rest to the test set.

    Returns:
        Tuple (train_data, test_data) as produced by temporal_split.
    """
    print(f"Loading data from CSV files...")
    frames = load_data_from_csv(daily_file, hourly_file, minute_file)

    # Enrich every timeframe with its indicators.
    for timeframe, raw_frame in frames.items():
        print(f"Processing {timeframe} data...")
        frames[timeframe] = preprocess_timeframe(raw_frame, timeframe)

    # Chronological train/test split.
    print(f"Splitting data at {split_date}")
    train_data, test_data = temporal_split(frames, split_date)

    # Row-count summary per timeframe.
    for timeframe, train_frame in train_data.items():
        print(f"Training {timeframe} data: {train_frame.shape[0]} rows")
        print(f"Testing {timeframe} data: {test_data[timeframe].shape[0]} rows")

    return train_data, test_data

# Example usage
# NOTE(review): the three paths below are absolute and machine-specific, so
# this cell only runs on the original author's machine; consider deriving
# them from a configurable data directory.
if __name__ == "__main__":
    daily_file = "/Users/newuser/Projects/robust_algo_trader/data/gen_alpaca_data/CRM_D1_raw_data.csv"
    hourly_file = "/Users/newuser/Projects/robust_algo_trader/data/gen_alpaca_data/CRM_H1_raw_data.csv"
    minute_file = "/Users/newuser/Projects/robust_algo_trader/data/gen_alpaca_data/CRM_M1_raw_data.csv"
    
    # Prepare data: load, add indicators, then split at split_date.
    # train_data / test_data are module-level names that later cells rely on.
    train_data, test_data = prepare_multi_timeframe_data(
        # daily_file, hourly_file, minute_file, split_date="2022-01-01"
        daily_file, hourly_file, minute_file, split_date="2017-01-01"
    )

In [None]:
# Inspect the processed hourly training frame (requires the data-preparation cell to have run).
train_data['hourly']

In [None]:
import uuid
import pandas as pd

# Simple class for positions
class Position:
    """Plain record describing one open position tracked by the strategy."""

    def __init__(self, id, timestamp, action, price, size, stop_loss, target, entry_reason):
        # Identity and entry time
        self.id, self.timestamp = id, timestamp
        # Trade side ('BUY' or 'SELL'), entry price and size
        self.action, self.price, self.size = action, price, size
        # Protective stop and target price (usually the mean)
        self.stop_loss, self.target = stop_loss, target
        # Why the position was opened
        self.entry_reason = entry_reason

# Simple class for signals
class Signal:
    """Plain record describing one trading signal ('BUY', 'SELL' or 'EXIT')."""

    def __init__(self, timestamp, action, price, size, position_id=None, stop_loss=None, target=None, reason=''):
        # When, what and how much
        self.timestamp, self.action = timestamp, action
        self.price, self.size = price, size
        # Optional link to a tracked position and its risk levels
        self.position_id = position_id
        self.stop_loss, self.target = stop_loss, target
        # Free-text explanation of why the signal fired
        self.reason = reason

    def to_dict(self):
        """Return the signal's fields as a dict, in declaration order."""
        field_names = (
            'timestamp', 'action', 'price', 'size',
            'position_id', 'stop_loss', 'target', 'reason',
        )
        return {name: getattr(self, name) for name in field_names}


class MeanReversionStrategy:
    """Multi-timeframe mean-reversion signal generator.

    The strategy walks the 'primary' timeframe bar by bar. For every bar it
    looks up the matching 'higher' and 'middle' bars (falling back to the
    previous bar when the current one is still forming), manages exits for
    open positions, and - while the higher timeframe is flagged as a range
    market - opens a position when price is close to a range boundary and the
    middle timeframe's mean-reversion probability is high enough.

    Recognised `params` keys (all optional):
        entry_threshold_upper / entry_threshold_lower (default 15):
            maximum 'distance_from_upper' / 'distance_from_lower' for an entry.
        long_probability_threshold (default 0.7) /
        short_probability_threshold (default 0.5):
            minimum mean-reversion probability for a long / short entry.
        exit_threshold_pct (default 3): exit once price is within this
            percentage of the primary 'mean'.
        use_atr_stops (default True), stop_loss_atr_multiplier (default 2.5),
        stop_loss_range_factor (default 0.15): stop-loss placement.
        position_sizing_factor (default 1.0), max_position_size (default 1.0).
        max_positions (default 3).
        timeframe_hierarchy (default ['higher', 'middle', 'primary']).
    """

    def __init__(self, params):
        # Entry parameters
        self.entry_threshold_upper = params.get('entry_threshold_upper', 15)
        self.entry_threshold_lower = params.get('entry_threshold_lower', 15)
        # Minimum conviction per side; the defaults keep the asymmetric
        # 0.7 (long) / 0.5 (short) levels the strategy has always used.
        self.long_probability_threshold = params.get('long_probability_threshold', 0.7)
        self.short_probability_threshold = params.get('short_probability_threshold', 0.5)

        # Exit parameters
        self.exit_threshold_pct = params.get('exit_threshold_pct', 3)

        # Stop loss parameters
        self.use_atr_stops = params.get('use_atr_stops', True)
        self.stop_loss_atr_multiplier = params.get('stop_loss_atr_multiplier', 2.5)
        self.stop_loss_range_factor = params.get('stop_loss_range_factor', 0.15)

        # Position sizing parameters
        self.position_sizing_factor = params.get('position_sizing_factor', 1.0)
        self.max_position_size = params.get('max_position_size', 1.0)

        # Risk management
        self.max_positions = params.get('max_positions', 3)

        # Timeframe hierarchy for lookahead prevention (last entry = primary)
        self.timeframe_hierarchy = params.get('timeframe_hierarchy', ['higher', 'middle', 'primary'])

        # State tracking
        self.positions = []

    def generate_signals(self, multi_timeframe_data):
        """Walk the primary timeframe and emit entry / exit signals.

        Args:
            multi_timeframe_data: dict with 'higher', 'middle' and 'primary'
                DataFrames carrying the indicator columns the checks read.

        Returns:
            List of Signal objects in chronological order. Open positions are
            tracked on `self.positions` as a side effect.
        """
        signals = []

        # Only the index is needed; the row itself is looked up per timeframe.
        for timestamp in multi_timeframe_data['primary'].index:
            current_data = self._get_data_at_timestamp(multi_timeframe_data, timestamp)
            if any(x is None for x in current_data.values()):
                continue

            # Swap still-forming higher/middle bars for the last completed one.
            current_data = self._prevent_lookahead_bias(multi_timeframe_data, current_data, timestamp)
            if any(x is None for x in current_data.values()):
                continue

            # Exits (stop loss first) are evaluated on every bar, regardless
            # of market regime: a position must remain stoppable after the
            # range it was opened in has broken down.
            exit_signal = self._check_exit_conditions(current_data)
            if exit_signal is not None:
                signals.append(exit_signal)
                continue  # Skip entry check after exit

            # New entries are only allowed while the higher timeframe ranges.
            if not self._is_in_range_market(current_data['higher']):
                continue

            # NOTE(review): this gate blocks entries whenever ANY position is
            # open, so max_positions > 1 currently has no effect.
            if (len(self.positions) < self.max_positions and
                    not self._has_position_in_direction('BUY') and
                    not self._has_position_in_direction('SELL')):

                entry_signal = self._check_entry_conditions(current_data)
                if entry_signal is not None:
                    signals.append(entry_signal)
                    self._add_position(entry_signal)

        return signals

    def _prevent_lookahead_bias(self, multi_timeframe_data, current_data, timestamp):
        """Replace still-forming 'higher'/'middle' bars with the previous bar.

        Args:
            multi_timeframe_data: dict of full DataFrames per timeframe.
            current_data: dict of the rows selected for `timestamp`.
            timestamp: Timestamp of the primary bar being processed.

        Returns:
            A shallow copy of `current_data`; a timeframe is set to None when
            its bar is forming and no earlier bar exists.
        """
        adjusted_data = current_data.copy()

        # Every timeframe except the last (primary) one.
        for timeframe in self.timeframe_hierarchy[:-1]:
            bar = adjusted_data.get(timeframe)
            if bar is None:
                continue

            tf_timestamp = bar.name

            if timeframe == 'higher':
                # Same calendar day and before hour 16: the daily bar is
                # treated as still forming.
                # NOTE(review): presumably 16 is the exchange-local close; if
                # the timestamps are UTC this cutoff is off - confirm.
                bar_is_forming = (tf_timestamp.date() == timestamp.date() and
                                  timestamp.hour < 16)
                cutoff_freq = 'D'
            elif timeframe == 'middle':
                # Same clock hour: the hourly bar is still forming.
                bar_is_forming = (tf_timestamp.year == timestamp.year and
                                  tf_timestamp.month == timestamp.month and
                                  tf_timestamp.day == timestamp.day and
                                  tf_timestamp.hour == timestamp.hour)
                cutoff_freq = 'h'  # lowercase alias; 'H' is deprecated in pandas
            else:
                # Other timeframe names are never considered forming.
                continue

            if not bar_is_forming:
                continue

            # Last bar that started strictly before the forming bar's period.
            tf_df = multi_timeframe_data[timeframe]
            prev_bars = tf_df[tf_df.index < tf_timestamp.floor(cutoff_freq)]
            adjusted_data[timeframe] = None if prev_bars.empty else prev_bars.iloc[-1]

        return adjusted_data

    def _get_data_at_timestamp(self, multi_timeframe_data, timestamp):
        """Return, per timeframe, the last row with index <= `timestamp` (or None)."""
        result = {}

        for timeframe, df in multi_timeframe_data.items():
            index = df.index
            if index.is_monotonic_increasing:
                # Sorted index: binary search instead of scanning every row.
                # side='right' lands just past the last entry <= timestamp.
                pos = index.searchsorted(timestamp, side='right')
                result[timeframe] = df.iloc[pos - 1] if pos > 0 else None
            else:
                # Unsorted index: keep the positional "last matching row" rule.
                df_before = df[index <= timestamp]
                result[timeframe] = None if df_before.empty else df_before.iloc[-1]

        return result

    def _is_in_range_market(self, higher_data):
        """Return the higher-timeframe 'is_range_market' flag as a plain bool.

        A missing (NaN) flag counts as "not a range market".
        """
        flag = higher_data['is_range_market']
        # Duplicate column labels make the lookup return a Series; use its
        # first value.
        if isinstance(flag, pd.Series):
            flag = flag.iloc[0]
        if np.ndim(flag) == 0 and pd.isna(flag):
            return False
        return bool(flag)

    def _has_position_in_direction(self, direction):
        """Check if we already have a position in the given direction."""
        return any(position.action == direction for position in self.positions)

    def _check_entry_conditions(self, current_data):
        """Return an entry Signal for the current bar, or None.

        Long entries are checked first (price near the lower boundary), then
        short entries (price near the upper boundary).
        """
        primary = current_data['primary']
        middle = current_data['middle']

        distance_from_lower = float(primary['distance_from_lower'])
        distance_from_upper = float(primary['distance_from_upper'])

        try:
            mean_reversion_probability = float(middle['mean_reversion_probability'])
        except (KeyError, TypeError, ValueError):
            print("Could not extract mean_reversion_probability")
            return None
        if pd.isna(mean_reversion_probability):
            return None

        # Used for the range-based stop loss.
        range_width = float(primary['upper_range'] - primary['lower_range'])

        if (distance_from_lower < self.entry_threshold_lower and
                mean_reversion_probability > self.long_probability_threshold):
            return self._build_entry_signal(
                primary, 'BUY', distance_from_lower, mean_reversion_probability, range_width
            )

        if (distance_from_upper < self.entry_threshold_upper and
                mean_reversion_probability > self.short_probability_threshold):
            return self._build_entry_signal(
                primary, 'SELL', distance_from_upper, mean_reversion_probability, range_width
            )

        return None

    def _build_entry_signal(self, primary, action, distance_from_boundary,
                            mean_reversion_probability, range_width):
        """Create the BUY/SELL entry Signal, including stop loss and size."""
        is_long = action == 'BUY'
        # The stop sits below a long entry and above a short entry.
        direction = -1.0 if is_long else 1.0
        close_price = float(primary['Close'])

        if self.use_atr_stops:
            stop_loss = close_price + direction * (float(primary['atr']) * self.stop_loss_atr_multiplier)
        else:
            boundary = primary['lower_range'] if is_long else primary['upper_range']
            stop_loss = float(boundary) + direction * (range_width * self.stop_loss_range_factor)

        position_size = self._calculate_position_size(
            distance_from_boundary,
            mean_reversion_probability
        )

        return Signal(
            timestamp=primary.name,
            action=action,
            price=close_price,
            size=position_size,
            stop_loss=stop_loss,
            target=float(primary['mean']),
            reason='long_mean_reversion' if is_long else 'short_mean_reversion'
        )

    def _check_exit_conditions(self, current_data):
        """Return an exit Signal for the first position that must be closed, or None.

        For each open position the stop loss is checked first; otherwise the
        position is closed once price is within `exit_threshold_pct` percent
        of the primary 'mean'. The closed position is removed from tracking.
        """
        if not self.positions:
            return None

        primary = current_data['primary']
        current_price = float(primary['Close'])
        mean_price = float(primary['mean'])

        for position in self.positions:
            # Stop loss first (risk management priority)
            stop_hit = (
                (position.action == 'BUY' and current_price <= position.stop_loss) or
                (position.action == 'SELL' and current_price >= position.stop_loss)
            )
            if stop_hit:
                return self._close_position(position, primary.name, current_price, 'stop_loss_hit')

            # Target: price has come back close enough to the mean
            pct_from_mean = abs((current_price - mean_price) / mean_price) * 100
            if pct_from_mean <= self.exit_threshold_pct:
                return self._close_position(position, primary.name, current_price, 'target_reached')

        return None

    def _close_position(self, position, timestamp, price, reason):
        """Stop tracking `position` and return the offsetting exit Signal."""
        exit_action = 'SELL' if position.action == 'BUY' else 'BUY'
        # Rebinds self.positions to a new list; callers return right after.
        self._remove_position(position.id)
        return Signal(
            timestamp=timestamp,
            action=exit_action,
            price=price,
            size=position.size,
            position_id=position.id,
            reason=reason
        )

    def _calculate_position_size(self, distance_from_boundary, reversion_probability):
        """Size = sizing factor / distance * probability, capped at max_position_size."""
        # Distances below 1.0 are floored to avoid division by zero / huge sizes.
        distance = max(distance_from_boundary, 1.0)
        base_size = self.position_sizing_factor / distance

        # Scale by conviction (reversion probability)
        position_size = base_size * reversion_probability

        return min(position_size, self.max_position_size)

    def _add_position(self, signal):
        """Add a new position to tracking and stamp its id onto `signal`."""
        position = Position(
            id=str(uuid.uuid4()),
            timestamp=signal.timestamp,
            action=signal.action,
            price=signal.price,
            size=signal.size,
            stop_loss=signal.stop_loss,
            target=signal.target,
            entry_reason=signal.reason
        )

        self.positions.append(position)

        # Update the signal with the position ID
        signal.position_id = position.id

    def _remove_position(self, position_id):
        """Remove a position from tracking by ID."""
        self.positions = [p for p in self.positions if p.id != position_id]

    def reset(self):
        """Reset the strategy state (clear positions)."""
        self.positions = []


# Example of usage function - import numpy to handle boolean types
import numpy as np

# Define strategy parameters
# Keys not listed here fall back to the defaults in MeanReversionStrategy.__init__.
strategy_params = {
    # Entry thresholds - can be adjusted based on your preference
    'entry_threshold_upper': 25,
    'entry_threshold_lower': 25,
    
    # Exit threshold
    'exit_threshold_pct': 5,
    
    # Stop loss settings
    'use_atr_stops': True,
    'stop_loss_atr_multiplier': 2.0,
    
    # Position sizing
    'position_sizing_factor': 1.0,
    'max_position_size': 1.0,
    
    # Limit to only 1 position at a time
    'max_positions': 1,
    
    # Timeframe hierarchy
    'timeframe_hierarchy': ['higher', 'middle', 'primary']
}

# Initialize strategy
strategy = MeanReversionStrategy(strategy_params)

# Setup multi-timeframe data
# Requires train_data from the data-preparation cell; the strategy's role
# names map onto the daily / hourly / minute frames.
multi_timeframe_data = {
    'higher': train_data['daily'],
    'middle': train_data['hourly'],
    'primary': train_data['minute']
}

# Generate signals
# Iterates every primary (minute) bar; the strategy instance keeps its open
# positions between calls, so call strategy.reset() before re-running.
signals = strategy.generate_signals(multi_timeframe_data)

print(f"Generated {len(signals)} signals")
    

In [None]:
import pandas as pd

# Convert signals to dictionaries
# Requires `signals` from the strategy cell.
signal_dicts = [signal.to_dict() for signal in signals]

# Create DataFrame (one row per signal, columns = Signal.to_dict() keys)
signals_df = pd.DataFrame(signal_dicts)

# Display first few signals
print(signals_df.head())


In [None]:
# Last available training row per timeframe at or before the given date.
# NOTE(review): the timestamp is passed as a plain string and compared
# directly against each frame's index - confirm this behaves as intended for
# the index's timezone.
m_data = get_data_at_timestamp(train_data, "2022-01-01")
m_data