In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [2]:
def compute_stats(series, compute_all_stats):
    """Compute statistics for a given series."""
    if series.empty:
        return {
            'mean': 0,
            'min': 0,
            'max': 0,
            'std': 0
        }
    
    # Extract scalar values directly
    return {
        'mean': float(series.mean()),
        'min': float(series.min()),
        'max': float(series.max()),
        'std': float(series.std())
    }

def compute_stats(series, compute_all_stats=True):
    """
    Compute basic statistics for a given pandas Series.

    Parameters:
    - series: pandas Series containing the data to analyze.
    - compute_all_stats: boolean flag to determine if all statistics should be computed.

    Returns:
    - A dictionary containing computed statistics.
    """
    stats = {
        'mean': series.mean(),
        'median': series.median(),
        'std': series.std()
    }

    if compute_all_stats:
        stats['min'] = series.min()
        stats['max'] = series.max()
        stats['count'] = series.count()

    return stats

### OLD

In [3]:
def compute_regime_stats(market_df, value_df, value_column, other = "yes", compute_all_stats=True):    
    markets = market_df['value'].unique()
    results_list = []
    
    for market in markets:
        bear_values = []
        for index, row in market_df[market_df['value'] == market].iterrows():
            start_date, end_date = pd.to_datetime([row['start_date'], row['end_date']])
            value_df.index = pd.DatetimeIndex(value_df.index)
            bear_period_values = value_df[(value_df.index.normalize() >= pd.Timestamp(start_date.date())) & (value_df.index.normalize() <= pd.Timestamp(end_date.date()))]
            bear_values.append({'end_date': end_date, 'values': bear_period_values})
    
        bear_values.sort(key=lambda x: x['end_date'])
        bear_series = pd.concat([x['values'] for x in bear_values])[value_column].sort_index(ascending=True)
    
        stats = compute_stats(bear_series, compute_all_stats)
        results_df = pd.DataFrame({'Regime': [market], **stats})
        results_list.append(results_df.round(1))
    
    if other.lower() == "yes":
        value_df.index = pd.DatetimeIndex(value_df.index)
        non_bear_values = value_df[~((value_df.index >= market_df['start_date'].min()) & (value_df.index <= market_df['end_date'].max()))]
        non_bear_series = non_bear_values[value_column]
        non_bear_stats = compute_stats(non_bear_series, compute_all_stats)
        non_bear_results_df = pd.DataFrame({'Regime': ['Bull'], **non_bear_stats})
        results_list.append(non_bear_results_df.round(1))
    
    return pd.concat(results_list, ignore_index=True)

### Data

In [6]:
# Recreate market_df
market_df = pd.DataFrame({
    'value': ['Bear', 'Bear', 'Bear', 'Bear'],
    'start_date': ['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01'],
    'end_date': ['2024-01-10', '2024-02-10', '2024-03-10', '2024-04-10']
})

# Convert start_date and end_date to datetime
market_df['start_date'] = pd.to_datetime(market_df['start_date'])
market_df['end_date'] = pd.to_datetime(market_df['end_date'])

# Assuming date_range is already defined
date_range = pd.date_range(start='2024-01-01', end='2024-05-01')

# Create value_df with 'market_type' randomly assigned as 0 or 1
value_df = pd.DataFrame({
    'recession': np.random.choice([0, 1], size=len(date_range))
}, index=date_range)

# Define a date range
date_range = pd.date_range(start='2024-01-01', end='2024-05-01')

# Create a fake time series for volatility data with random values
vol_data = pd.DataFrame({
    'volatility': np.random.normal(loc=0.5, scale=0.1, size=len(date_range))
}, index=date_range)



### DEV

In [5]:
def analyze_market_periods(market_df, value_df, value_column, compute_stats, other="no", compute_all_stats=True):
    """
    Analyze market periods and compute statistics for specific regimes with enhanced performance.

    Parameters:
    - market_df: DataFrame containing market data with 'value', 'start_date', and 'end_date' columns.
    - value_df: DataFrame containing time series data to be analyzed.
    - value_column: The column in value_df which contains the data to be analyzed.
    - compute_stats: A function to compute desired statistics on the given series.
    - compute_all_stats: Additional parameters for computing statistics.
    - other: A string flag to decide if stats should also be computed for periods not covered by 'market_df'.

    Returns:
    - A concatenated DataFrame of statistics for each regime.
    """
    
    # Ensure the index of value_df is a DatetimeIndex only once for optimization
    value_df.index = pd.to_datetime(value_df.index)

    # Initialize a list to store results
    results_list = []

    # Use vectorized operations to filter the data for each market regime
    for regime in market_df['value'].unique():
        # Get the start and end dates for the current regime
        regime_periods = market_df[market_df['value'] == regime][['start_date', 'end_date']].apply(pd.to_datetime)

        # Create a boolean mask for the periods that fall within each regime
        regime_mask = pd.Series(False, index=value_df.index)
        for start_date, end_date in regime_periods.itertuples(index=False):
            regime_mask |= (value_df.index >= start_date.normalize()) & (value_df.index <= end_date.normalize())

        # Filter the values for the current regime using the mask
        regime_series = value_df.loc[regime_mask, value_column]

        # Compute statistics for the filtered data
        stats = compute_stats(regime_series, compute_all_stats)
        results_df = pd.DataFrame({'Regime': [regime], **stats})

        # Append the results for the current regime to the results list
        results_list.append(results_df.round(1))

    # Optionally compute statistics for periods not covered by any regime
    if other.lower() == "yes":
        # Create a boolean mask for the periods not covered by any regime
        all_regimes_mask = pd.Series(False, index=value_df.index)
        for start_date, end_date in market_df[['start_date', 'end_date']].apply(pd.to_datetime).itertuples(index=False):
            all_regimes_mask |= (value_df.index >= start_date.normalize()) & (value_df.index <= end_date.normalize())

        non_regime_mask = ~all_regimes_mask
        non_regime_series = value_df.loc[non_regime_mask, value_column]

        # Compute statistics for the non-regime periods
        non_regime_stats = compute_stats(non_regime_series, compute_all_stats)
        non_regime_results_df = pd.DataFrame({'Regime': ['Non-Regime'], **non_regime_stats})

        # Append the results for the non-market regime to the results list
        results_list.append(non_regime_results_df.round(1))

    # Concatenate all the results into a single DataFrame and return
    return pd.concat(results_list, ignore_index=True)

### OLD

In [6]:
compute_regime_stats(market_df, value_df, 'recession', other = "yes", compute_all_stats=True)

Unnamed: 0,Regime,mean,median,std,min,max,count
0,Bear,0.6,1.0,0.5,0,1,40
1,Bull,0.7,1.0,0.5,0,1,21


### NEW

In [7]:
# Now you can call the analyze_market_periods function
result_df = analyze_market_periods(market_df, value_df, 'recession', compute_stats, compute_all_stats=True, other="yes")

# Display the results
result_df

Unnamed: 0,Regime,mean,median,std,min,max,count
0,Bear,0.6,1.0,0.5,0,1,40
1,Non-Regime,0.5,0.0,0.5,0,1,82


In [8]:
# Function to create market regime cache
def create_market_cache(market_df, value_df_index):
    cache = {}
    for regime in market_df['value'].unique():
        regime_mask = pd.Series(False, index=value_df_index)
        regime_periods = market_df[market_df['value'] == regime][['start_date', 'end_date']]
        for start_date, end_date in regime_periods.itertuples(index=False):
            regime_mask |= (value_df_index >= start_date) & (value_df_index <= end_date)
        cache[regime] = regime_mask
    return cache

# Function to create cache based on generic regime (e.g., binary market regime)
def create_generic_regime_cache(value_df, regime_column):
    cache = {}
    for val in value_df[regime_column].unique():
        cache[f"Regime_{int(val)}"] = value_df[regime_column] == val
    return cache

# Function to process each combination of market and regime
def process_combination(market_regime, market_mask, regime, regime_mask, vol_data, compute_stats, compute_all_stats):
    combined_mask = market_mask & regime_mask
    filtered_series = vol_data.loc[combined_mask]
    stats = compute_stats(filtered_series['volatility'], compute_all_stats)
    return pd.DataFrame({'Market Regime': [market_regime], 'Regime Type': [regime], **stats})

# Function to compute statistics in parallel (or sequentially for simplicity)
def compute_stats_parallel(market_cache, regime_cache, vol_data, compute_stats, compute_all_stats=True):
    results = []
    for market_regime, market_mask in market_cache.items():
        for regime, regime_mask in regime_cache.items():
            result = process_combination(market_regime, market_mask, regime, regime_mask, vol_data, compute_stats, compute_all_stats)
            results.append(result)
    return pd.concat(results, ignore_index=True)

# Main function to compute market stats
def compute_market_stats(market_dfs, vol_data, regime_column, compute_stats, compute_all_stats=True):
    results_list = []

    for market_df in market_dfs:
        market_cache = create_market_cache(market_df, vol_data.index)
        regime_cache = create_generic_regime_cache(vol_data, regime_column)
        results_df = compute_stats_parallel(market_cache, regime_cache, vol_data, compute_stats, compute_all_stats)
        results_list.append(results_df)

    return pd.concat(results_list, ignore_index=True)

# Example compute_stats function
def compute_stats(series, compute_all_stats=True):
    stats = {
        'mean': series.mean(),
        'median': series.median(),
        'std': series.std()
    }
    if compute_all_stats:
        stats.update({
            'min': series.min(),
            'max': series.max(),
            'count': series.count()
        })
    return stats

            volatility
2024-01-01    0.572088
2024-01-02    0.682506
2024-01-03    0.423574
2024-01-04    0.441068
2024-01-05    0.453711
2024-01-06    0.410176
2024-01-07    0.452020
2024-01-08    0.590657
2024-01-09    0.381069
2024-01-10    0.440752


In [9]:
# Example market_df for market regimes
market_df_1 = pd.DataFrame({
    'value': ['Bear', 'Bear', 'Bull', 'Bull'],
    'start_date': ['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01'],
    'end_date': ['2024-01-10', '2024-02-10', '2024-03-10', '2024-04-10']
})
market_df_1['start_date'] = pd.to_datetime(market_df_1['start_date'])
market_df_1['end_date'] = pd.to_datetime(market_df_1['end_date'])

# Add a binary regime column to vol_data to simulate recession/non-recession periods
vol_data['market_regime'] = np.random.choice([0, 1], size=len(vol_data))
print(vol_data)

            volatility  market_regime
2024-01-01    0.572088              0
2024-01-02    0.682506              0
2024-01-03    0.423574              0
2024-01-04    0.441068              1
2024-01-05    0.453711              1
...                ...            ...
2024-04-27    0.535080              1
2024-04-28    0.658835              1
2024-04-29    0.350204              1
2024-04-30    0.536938              0
2024-05-01    0.470684              1

[122 rows x 2 columns]


In [10]:
# Run the computation
result_df = compute_market_stats([market_df_1], vol_data, 'market_regime', compute_stats)

# Display the results
print(result_df)

  Market Regime Regime Type      mean    median       std       min       max  \
0          Bear    Regime_0  0.456980  0.446648  0.131633  0.242994  0.682506   
1          Bear    Regime_1  0.446904  0.440910  0.061264  0.375857  0.571099   
2          Bull    Regime_0  0.478976  0.521161  0.148867  0.261143  0.697277   
3          Bull    Regime_1  0.529546  0.522008  0.090262  0.384122  0.661276   

   count  
0     12  
1      8  
2      7  
3     13  


### DEV

In [7]:
# Example DataFrames for testing
market_df_1 = pd.DataFrame({
    'value': ['Bear', 'Bear', 'Bull', 'Bull'],
    'start_date': ['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01'],
    'end_date': ['2024-01-10', '2024-02-10', '2024-03-10', '2024-04-10']
})
market_df_1['start_date'] = pd.to_datetime(market_df_1['start_date'])
market_df_1['end_date'] = pd.to_datetime(market_df_1['end_date'])

market_df_2 = pd.DataFrame({
    'value': ['High', 'Low', 'Medium', 'High'],
    'start_date': ['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01'],
    'end_date': ['2024-01-15', '2024-02-15', '2024-03-15', '2024-04-15']
})
market_df_2['start_date'] = pd.to_datetime(market_df_2['start_date'])
market_df_2['end_date'] = pd.to_datetime(market_df_2['end_date'])

date_range = pd.date_range(start='2024-01-01', end='2024-05-01')
value_df_1 = pd.DataFrame({
    'market_regime': np.random.choice([0, 1], size=len(date_range))
}, index=date_range)

value_df_2 = pd.DataFrame({
    'economic_state': np.random.choice([0, 1], size=len(date_range))
}, index=date_range)

print(value_df_1)
print(market_df_1)
print(value_df_2)
print(market_df_2)
print(vol_data)

            market_regime
2024-01-01              0
2024-01-02              1
2024-01-03              1
2024-01-04              0
2024-01-05              1
...                   ...
2024-04-27              0
2024-04-28              0
2024-04-29              1
2024-04-30              1
2024-05-01              0

[122 rows x 1 columns]
  value start_date   end_date
0  Bear 2024-01-01 2024-01-10
1  Bear 2024-02-01 2024-02-10
2  Bull 2024-03-01 2024-03-10
3  Bull 2024-04-01 2024-04-10
            economic_state
2024-01-01               0
2024-01-02               0
2024-01-03               1
2024-01-04               0
2024-01-05               1
...                    ...
2024-04-27               1
2024-04-28               0
2024-04-29               0
2024-04-30               1
2024-05-01               0

[122 rows x 1 columns]
    value start_date   end_date
0    High 2024-01-01 2024-01-15
1     Low 2024-02-01 2024-02-15
2  Medium 2024-03-01 2024-03-15
3    High 2024-04-01 2024-04-15
      

### END -------------

In [6]:
# Example compute_stats function
def compute_stats(series, compute_all_stats=True):
    stats = {
        'mean': series.mean(),
        'median': series.median(),
        'std': series.std()
    }
    if compute_all_stats:
        stats.update({
            'min': series.min(),
            'max': series.max(),
            'count': series.count()
        })
    return stats

# Step 1: Create Helper Functions for Cache Creation

# Function to create market regime cache
def create_market_cache(market_df, value_df_index):
    cache = {}
    for regime in market_df['value'].unique():
        regime_mask = pd.Series(False, index=value_df_index)
        regime_periods = market_df[market_df['value'] == regime][['start_date', 'end_date']]
        for start_date, end_date in regime_periods.itertuples(index=False):
            regime_mask |= (value_df_index >= start_date) & (value_df_index <= end_date)
        cache[regime] = regime_mask
    return cache

# Function to create cache based on generic regime (e.g., binary market regime)
def create_generic_regime_cache(value_df, regime_column):
    """
    Creates a cache of masks for each unique value in the specified regime column.
    
    Parameters:
    - value_df: The DataFrame containing the regime data.
    - regime_column: The column in value_df representing different regimes.
    
    Returns:
    - cache: A dictionary where keys are regime names and values are boolean masks.
    """
    cache = {}
    unique_values = value_df[regime_column].unique()
    for val in unique_values:
        cache[f"Regime_{int(val)}"] = value_df[regime_column] == val
    return cache


    # Modify create_generic_regime_cache to return all True for testing
def create_generic_regime_cache(value_df, regime_column):
    cache = {}
    unique_values = value_df[regime_column].unique()
    #print(f"Unique regimes identified in {regime_column}: {unique_values}")
    
    for val in unique_values:
        regime_name = f"Regime_{int(val)}"
        cache[regime_name] = value_df[regime_column] == val
        #print(f"Created mask for {regime_name}, count: {cache[regime_name].sum()}")
    
    return cache

# Step 3: Process Each Combination of Market and Regime
# Print the masks for inspection
def inspect_masks(market_mask, regime_mask, combined_mask):
    print(f"Market Mask:\n{market_mask}")
    print(f"Regime Mask:\n{regime_mask}")
    print(f"Combined Mask:\n{combined_mask.sum()}")

def process_combination(market_regime, market_mask, regime, regime_mask, vol_data, compute_stats, compute_all_stats):
    combined_mask = market_mask & regime_mask
    # inspect_masks(market_mask, regime_mask, combined_mask)  # Inspect the masks
    filtered_series = vol_data.loc[combined_mask]
    if filtered_series.empty:
        print(f"No data selected for combination: Market Regime = {market_regime}, Regime Type = {regime}")
    
    stats = compute_stats(filtered_series['volatility'], compute_all_stats)
    return pd.DataFrame({'Market Regime': [market_regime], 'Regime Type': [regime], **stats})
    
# Step 4: Compute Statistics in Parallel or Sequentially

# Function to compute statistics in parallel (or sequentially for simplicity)
def compute_stats_parallel(market_cache, regime_cache, vol_data, compute_stats, compute_all_stats=True):
    results = []
    for market_regime, market_mask in market_cache.items():
        for regime, regime_mask in regime_cache.items():
            result = process_combination(market_regime, market_mask, regime, regime_mask, vol_data, compute_stats, compute_all_stats)
            results.append(result)
    return pd.concat(results, ignore_index=True)

# Step 5: Main Function to Compute Market Stats

# Main function to compute market stats
def compute_market_stats(market_dfs, vol_data, regime_column, compute_stats, compute_all_stats=True):
    results_list = []

    for market_df in market_dfs:
        market_cache = create_market_cache(market_df, vol_data.index)
        regime_cache = create_generic_regime_cache(vol_data, regime_column)
        results_df = compute_stats_parallel(market_cache, regime_cache, vol_data, compute_stats, compute_all_stats)
        results_list.append(results_df)

    return pd.concat(results_list, ignore_index=True)

In [7]:
# Define a date range for the vol_data DataFrame
date_range = pd.date_range(start='2024-01-01', end='2024-05-01')

# Create a fake time series for volatility data with random values
vol_data = pd.DataFrame({
    'volatility': np.random.normal(loc=0.5, scale=0.1, size=len(date_range))
}, index=date_range)

# Example market regime DataFrames
market_df_1 = pd.DataFrame({
    'value': ['Bear', 'Bear', 'Bull', 'Bull'],
    'start_date': ['2024-01-01', '2024-01-20', '2024-02-01', '2024-03-01'],
    'end_date': ['2024-01-15', '2024-02-05', '2024-02-15', '2024-03-15']
})
market_df_1['start_date'] = pd.to_datetime(market_df_1['start_date'])
market_df_1['end_date'] = pd.to_datetime(market_df_1['end_date'])

market_df_2 = pd.DataFrame({
    'value': ['High', 'Low', 'Medium', 'High'],
    'start_date': ['2024-01-10', '2024-02-10', '2024-03-10', '2024-04-10'],
    'end_date': ['2024-01-20', '2024-02-20', '2024-03-20', '2024-04-20']
})
market_df_2['start_date'] = pd.to_datetime(market_df_2['start_date'])
market_df_2['end_date'] = pd.to_datetime(market_df_2['end_date'])

In [8]:
import pandas as pd
import numpy as np

# Define a date range for the vol_data DataFrame
date_range = pd.date_range(start='2024-01-01', end='2024-05-01')

# Example: Add a 'regime' column to the vol_data to simulate different regimes
vol_data = pd.DataFrame({
    'volatility': np.random.normal(loc=0.5, scale=0.1, size=len(date_range)),
    'regime': np.random.choice([0, 1], size=len(date_range))  # Randomly assign regime 0 or 1
}, index=date_range)

# Example market regime DataFrames
market_df_1 = pd.DataFrame({
    'value': ['Bear', 'Bear', 'Bull', 'Bull'],
    'start_date': ['2024-01-01', '2024-01-20', '2024-02-01', '2024-03-01'],
    'end_date': ['2024-01-15', '2024-02-05', '2024-02-15', '2024-03-15']
})
market_df_1['start_date'] = pd.to_datetime(market_df_1['start_date'])
market_df_1['end_date'] = pd.to_datetime(market_df_1['end_date'])

market_df_2 = pd.DataFrame({
    'value': ['High', 'Low', 'Medium', 'High'],
    'start_date': ['2024-01-10', '2024-02-10', '2024-03-10', '2024-04-10'],
    'end_date': ['2024-01-20', '2024-02-20', '2024-03-20', '2024-04-20']
})
market_df_2['start_date'] = pd.to_datetime(market_df_2['start_date'])
market_df_2['end_date'] = pd.to_datetime(market_df_2['end_date'])

# Create the regime masks and inspect them
regime_cache = create_generic_regime_cache(vol_data, 'regime')

# Check the masks generated for Regime_0 and Regime_1
#for regime_name, mask in regime_cache.items():
#    print(f"{regime_name} mask count: {mask.sum()}")

# Run the model
result_df = compute_market_stats([market_df_1, market_df_2], vol_data, 'regime', compute_stats)

# Display the results
result_df

Unnamed: 0,Market Regime,Regime Type,mean,median,std,min,max,count
0,Bear,Regime_1,0.531582,0.549244,0.075404,0.40649,0.660187,12
1,Bear,Regime_0,0.49343,0.490752,0.117182,0.20855,0.691633,20
2,Bull,Regime_1,0.508142,0.49323,0.095192,0.358923,0.699329,20
3,Bull,Regime_0,0.514936,0.516356,0.07845,0.391797,0.662136,10
4,High,Regime_1,0.532373,0.52108,0.068619,0.408152,0.642541,12
5,High,Regime_0,0.508056,0.514309,0.084183,0.395179,0.626472,10
6,Low,Regime_1,0.458561,0.4306,0.118777,0.305016,0.699329,8
7,Low,Regime_0,0.544255,0.524997,0.10953,0.445631,0.662136,3
8,Medium,Regime_1,0.503582,0.523292,0.109048,0.375107,0.64519,8
9,Medium,Regime_0,0.586329,0.59017,0.098544,0.48592,0.682896,3


### ADDITIONAL

In [26]:
import pandas as pd

def save_filtered_volatility(market_mask, regime_mask, vol_data, file_path):
    """
    Filters the volatility data based on the given market and regime masks
    and saves the filtered data to a specified file.

    Parameters:
    - market_mask: A boolean mask representing the market regime.
    - regime_mask: A boolean mask representing the regime type.
    - vol_data: The volatility DataFrame to filter.
    - file_path: The file path where the filtered data will be saved.
    """
    # Combine the masks
    combined_mask = market_mask & regime_mask
    
    # Filter the data
    filtered_data = vol_data.loc[combined_mask]
    
    # Save the filtered data to a CSV file
    filtered_data.to_csv(file_path)
    
    # Return the filtered data for inspection
    return filtered_data

# Example usage:
bear_mask = (vol_data.index >= '2024-01-05') & (vol_data.index <= '2024-01-15')
regime_mask = vol_data.index == vol_data.index  # This should select all data

# Save the filtered data to a CSV file
filtered_volatility_data = save_filtered_volatility(bear_mask, regime_mask, vol_data, 'filtered_volatility_bear.csv')

print("Filtered Volatility Data:")
print(filtered_volatility_data)

Filtered Volatility Data:
            volatility
2024-01-05    0.559056
2024-01-06    0.375618
2024-01-07    0.502274
2024-01-08    0.455374
2024-01-09    0.481883
2024-01-10    0.607561
2024-01-11    0.369052
2024-01-12    0.429884
2024-01-13    0.492682
2024-01-14    0.212748
2024-01-15    0.429482
