In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

### Functions

In [30]:
def align_data(df_vol_moneyness, compo, earning_dates, x=21, alignment_method="nearest_neighbor"):
    """
    Align df_vol_moneyness with index composition and earnings dates within a specified range.
    
    Args:
    - df_vol_moneyness (pd.DataFrame): Implied volatility time series data.
    - compo (pd.DataFrame): Index composition data.
    - earning_dates (pd.DataFrame): Earnings dates for each stock.
    - x (int): Number of days before and after earnings dates to consider. Default is 21.
    - alignment_method (str): Method for aligning data, either 'nearest_neighbor' or 'interpolation'. Default is 'nearest_neighbor'.
    
    Returns:
    - pd.DataFrame: Filtered and aligned volatility time series.
    - pd.DataFrame: Debugging report.
    """
    # Debug print: Initial data
    print("Initial Data:")
    print("df_vol_moneyness head:", df_vol_moneyness.head())
    print("compo head:", compo.head())
    print("earning_dates head:", earning_dates.head())
    
    # Align compo with df_vol_moneyness based on the chosen alignment method
    if alignment_method == "nearest_neighbor":
        compo_aligned = compo.reindex(df_vol_moneyness.index, method='nearest')
    elif alignment_method == "interpolation":
        compo_aligned = compo.reindex(df_vol_moneyness.index).interpolate(method='linear')
    else:
        raise ValueError("Invalid alignment_method. Choose either 'nearest_neighbor' or 'interpolation'.")
    
    # Debug print: Aligned compo
    print("Aligned compo head:", compo_aligned.head())
    
    # Create masks based on earnings dates within the specified range
    mask = pd.DataFrame(False, index=df_vol_moneyness.index, columns=df_vol_moneyness.columns)
    for ticker in earning_dates.columns:
        for date in earning_dates[ticker].dropna():
            start_date = date - pd.Timedelta(days=x)
            end_date = date + pd.Timedelta(days=x)
            mask.loc[start_date:end_date, ticker] = True

    # Debug print: Mask
    print("Earnings date mask head:", mask.head())

    # Apply masks to filter based on earnings dates
    filtered_vol_earnings = df_vol_moneyness[mask]

    # Debug print: Filtered based on earnings dates
    print("Filtered Volatility Data (after earnings dates mask) head:", filtered_vol_earnings.head())
    
    # Apply masks to filter based on index composition
    compo_mask = compo_aligned > 0
    filtered_vol_compo = df_vol_moneyness[compo_mask]

    # Debug print: Filtered based on composition
    print("Filtered Volatility Data (after composition mask) head:", filtered_vol_compo.head())

    # Combine both masks to filter the final DataFrame
    combined_mask = mask & compo_mask
    filtered_vol = df_vol_moneyness[combined_mask]

    # Debug print: Filtered data
    print("Filtered Volatility Data head:", filtered_vol.head())
    
    # Report missing earnings dates and percentage of time each stock is part of the index composition
    missing_dates_report = earning_dates.isna().sum()
    compo_percentage = compo.notna().sum() / len(compo)
    
    # Create a debug report DataFrame
    debug_report = pd.DataFrame({
        'Missing Earnings Dates': missing_dates_report,
        'Composition Percentage': compo_percentage
    })
    
    return filtered_vol, debug_report


def plot_filtered_volatility(filtered_vol, df_vol_moneyness, compo, earning_dates):
    """
    Plot the first column of all DataFrames (filtered_vol, df_vol_moneyness, compo, earning_dates) in a single plot.
    
    Args:
    - filtered_vol (pd.DataFrame): Filtered and aligned volatility time series.
    - df_vol_moneyness (pd.DataFrame): Implied volatility time series data.
    - compo (pd.DataFrame): Index composition data.
    - earning_dates (pd.DataFrame): Earnings dates for each stock.
    
    Returns:
    - None
    """
    # Extract the first column from each DataFrame
    vol_moneyness_first_col = df_vol_moneyness.iloc[:, 0]
    compo_first_col = compo.iloc[:, 0]
    earning_dates_first_col = earning_dates.iloc[:, 0]
    filtered_vol_first_col = filtered_vol.iloc[:, 0]
    
    # Plot each series
    plt.figure(figsize=(12, 8))
    plt.plot(vol_moneyness_first_col, label='Vol Moneyness (STOCK0)', color='blue', alpha=0.5)
    plt.plot(compo_first_col, label='Compo (STOCK0)', color='red', alpha=0.5)
    plt.scatter(earning_dates_first_col, [0]*len(earning_dates_first_col), label='Earnings Dates (STOCK0)', color='green', marker='o')
    plt.plot(filtered_vol_first_col, label='Filtered Vol Moneyness (STOCK0)', color='purple', linewidth=2)
    
    # Add title and labels
    plt.title('First Column of All DataFrames with Filtered Data')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(True)
    
    # Show the plot
    plt.show()

### Generate data

In [19]:
# Create a date range for df_vol_moneyness (business daily) and compo (weekly)
date_range_vol = pd.date_range(start="2020-01-01", end="2024-12-31", freq='B')
date_range_compo = pd.date_range(start="2020-01-01", end="2024-12-31", freq='W')

# Generate fake stock tickers
tickers = [f"STOCK{i}" for i in range(10)]

# Generate fake compo DataFrame
np.random.seed(0)
compo_data = np.random.rand(len(date_range_compo), len(tickers))
compo_data = compo_data / compo_data.sum(axis=1, keepdims=True)  # Normalize to sum to 1
compo = pd.DataFrame(compo_data, index=date_range_compo, columns=tickers)

# Generate fake df_vol_moneyness DataFrame
vol_data = np.random.rand(len(date_range_vol), len(tickers))
df_vol_moneyness = pd.DataFrame(vol_data, index=date_range_vol, columns=tickers)

# Generate fake earning_dates DataFrame
earning_dates = pd.DataFrame(index=pd.date_range(start="2020-01-01", end="2024-12-31", freq='Q-DEC'))
for ticker in tickers:
    earning_dates[ticker] = earning_dates.index

# Generate fake df_sectors DataFrame
sectors = ['Technology', 'Healthcare', 'Financials', 'Energy', 'Consumer Discretionary', 'Utilities', 'Industrials', 'Materials', 'Real Estate', 'Communication Services']
df_sectors = pd.DataFrame({'Sector': sectors}, index=tickers)

  earning_dates = pd.DataFrame(index=pd.date_range(start="2020-01-01", end="2024-12-31", freq='Q-DEC'))


In [32]:
# Call the align_data function to get filtered_vol and debug_report separately
filtered_vol, debug_report = align_data(df_vol_moneyness, compo, earning_dates, x=21, alignment_method="nearest_neighbor")

# Print the heads of the filtered_vol and debug_report DataFrames
print("Filtered Volatility Data (head):")
print(filtered_vol.head())
# Check if filtered_vol contains NaN values
print("Filtered Volatility Data (info):")
print(filtered_vol.info())

Initial Data:
df_vol_moneyness head:               STOCK0    STOCK1    STOCK2    STOCK3    STOCK4    STOCK5  \
2020-01-01  0.965116  0.796512  0.558731  0.330617  0.845238  0.455436   
2020-01-02  0.014349  0.611485  0.995830  0.817249  0.617238  0.914399   
2020-01-03  0.846426  0.058176  0.774804  0.447503  0.660799  0.764633   
2020-01-06  0.689301  0.953706  0.795253  0.389786  0.584291  0.304201   
2020-01-07  0.964020  0.661949  0.068778  0.010778  0.997046  0.220506   

              STOCK6    STOCK7    STOCK8    STOCK9  
2020-01-01  0.092685  0.454904  0.871968  0.448282  
2020-01-02  0.813582  0.498639  0.591220  0.731293  
2020-01-03  0.539502  0.158515  0.409527  0.076873  
2020-01-06  0.541046  0.300727  0.738325  0.259342  
2020-01-07  0.312563  0.208414  0.928102  0.652013  
compo head:               STOCK0    STOCK1    STOCK2    STOCK3    STOCK4    STOCK5  \
2020-01-05  0.089127  0.116146  0.097888  0.088489  0.068801  0.104893   
2020-01-12  0.144649  0.096629  0.103782

In [21]:
print("\nDebug Report:")
print(debug_report)


Debug Report:
        Missing Earnings Dates  Composition Percentage
STOCK0                       0                     1.0
STOCK1                       0                     1.0
STOCK2                       0                     1.0
STOCK3                       0                     1.0
STOCK4                       0                     1.0
STOCK5                       0                     1.0
STOCK6                       0                     1.0
STOCK7                       0                     1.0
STOCK8                       0                     1.0
STOCK9                       0                     1.0


In [33]:
# Rebase the filtered volatility data around earnings dates using normalization
try:
    rebased_vol_normalize = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="normalize")
    print("Rebased Volatility Data (Normalize) (head):")
    print(rebased_vol_normalize.head())
except ValueError as e:
    print(e)

# Rebase the filtered volatility data around earnings dates using spread method
try:
    rebased_vol_spread = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="spread")
    print("Rebased Volatility Data (Spread) (head):")
    print(rebased_vol_spread.head())
except ValueError as e:
    print(e)


Data slice for STOCK0 at 2020-03-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2020-06-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2020-09-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2020-12-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2021-03-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2021-06-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2021-09-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2021-12-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2022-03-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2022-06-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2022-09-30 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 2022-12-31 00:00:00 is not of expected length: 30
Data slice for STOCK0 at 2023-03-31 00:00:00 is not of expected length: 31
Data slice for STOCK0 at 

In [28]:
def rebase_around_earnings(df_vol_moneyness, earning_dates, x=21, method="normalize"):
    """
    Rebase volatility data around earnings dates, setting earnings date as day 0 and including x days before and after.
    
    Args:
    - df_vol_moneyness (pd.DataFrame): Implied volatility time series data.
    - earning_dates (pd.DataFrame): Earnings dates for each stock.
    - x (int): Number of days before and after earnings dates to include. Default is 21.
    - method (str): Method to rebase data, either 'normalize' or 'spread'. Default is 'normalize'.
    
    Returns:
    - pd.DataFrame: Rebases volatility data with a multi-level index (stock ticker and relative day).
    """
    rebased_data = []

    for ticker in earning_dates.columns:
        for date in earning_dates[ticker].dropna():
            start_date = date - pd.Timedelta(days=x)
            end_date = date + pd.Timedelta(days=x)
            data_slice = df_vol_moneyness.loc[start_date:end_date, ticker].copy()

            if len(data_slice) == 2 * x + 1:
                # Fill missing data with the last available previous value
                data_slice.fillna(method='ffill', inplace=True)
                
                if method == "normalize":
                    # Normalize the data slice
                    base_value = data_slice.loc[date]
                    rebased_slice = (data_slice / base_value) * 100
                elif method == "spread":
                    # Calculate spread (difference from previous day)
                    rebased_slice = data_slice.diff().fillna(0)
                
                rebased_slice.index = range(-x, x + 1)
                rebased_slice = rebased_slice.to_frame(name=ticker)
                rebased_slice['Ticker'] = ticker
                rebased_data.append(rebased_slice)
            else:
                print(f"Data slice for {ticker} at {date} is not of expected length: {len(data_slice)}")

    if not rebased_data:
        raise ValueError("No objects to concatenate. Ensure data slices are of expected length.")

    # Combine all rebased slices into a single DataFrame
    rebased_df = pd.concat(rebased_data).reset_index()
    rebased_df = rebased_df.set_index(['Ticker', 'index'])
    rebased_df.index.names = ['Ticker', 'Relative Day']
    return rebased_df

In [29]:
# Call the align_data function to get filtered_vol and debug_report separately
filtered_vol, debug_report = align_data(df_vol_moneyness, compo, earning_dates, x=21, alignment_method="nearest_neighbor")

# Print the heads of the filtered_vol and debug_report DataFrames
print("Filtered Volatility Data (head):")
print(filtered_vol.head())
print("\nDebug Report (head):")
print(debug_report.head())

# Rebase the filtered volatility data around earnings dates using normalization
rebased_vol_normalize = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="normalize")

# Print the rebased volatility data (head) for normalization method
print("Rebased Volatility Data (Normalize) (head):")
print(rebased_vol_normalize.head())

# Rebase the filtered volatility data around earnings dates using spread method
rebased_vol_spread = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="spread")

# Print the rebased volatility data (head) for spread method
print("Rebased Volatility Data (Spread) (head):")
print(rebased_vol_spread.head())

Filtered Volatility Data (head):
            STOCK0  STOCK1  STOCK2  STOCK3  STOCK4  STOCK5  STOCK6  STOCK7  \
2020-01-01     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2020-01-02     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2020-01-03     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2020-01-06     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2020-01-07     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

            STOCK8  STOCK9  
2020-01-01     NaN     NaN  
2020-01-02     NaN     NaN  
2020-01-03     NaN     NaN  
2020-01-06     NaN     NaN  
2020-01-07     NaN     NaN  

Debug Report (head):
        Missing Earnings Dates  Composition Percentage
STOCK0                       0                     1.0
STOCK1                       0                     1.0
STOCK2                       0                     1.0
STOCK3                       0                     1.0
STOCK4                     

ValueError: No objects to concatenate. Ensure data slices are of expected length.

In [None]:
def align_data(df_vol_moneyness, compo, earning_dates, x=21, alignment_method="nearest_neighbor"):
    """
    Align df_vol_moneyness with index composition and earnings dates within a specified range.
    
    Args:
    - df_vol_moneyness (pd.DataFrame): Implied volatility time series data.
    - compo (pd.DataFrame): Index composition data.
    - earning_dates (pd.DataFrame): Earnings dates for each stock.
    - x (int): Number of days before and after earnings dates to consider. Default is 21.
    - alignment_method (str): Method for aligning data, either 'nearest_neighbor' or 'interpolation'. Default is 'nearest_neighbor'.
    
    Returns:
    - pd.DataFrame: Filtered and aligned volatility time series.
    - pd.DataFrame: Debugging report.
    """
    # Debug print: Initial data
    print("Initial Data:")
    print("df_vol_moneyness head:", df_vol_moneyness.head())
    print("compo head:", compo.head())
    print("earning_dates head:", earning_dates.head())
    
    # Align compo with df_vol_moneyness based on the chosen alignment method
    if alignment_method == "nearest_neighbor":
        compo_aligned = compo.reindex(df_vol_moneyness.index, method='nearest')
    elif alignment_method == "interpolation":
        compo_aligned = compo.reindex(df_vol_moneyness.index).interpolate(method='linear')
    else:
        raise ValueError("Invalid alignment_method. Choose either 'nearest_neighbor' or 'interpolation'.")
    
    # Debug print: Aligned compo
    print("Aligned compo head:", compo_aligned.head())
    
    # Create masks based on earnings dates within the specified range
    mask = pd.DataFrame(False, index=df_vol_moneyness.index, columns=df_vol_moneyness.columns)
    for ticker in earning_dates.columns:
        for date in earning_dates[ticker].dropna():
            start_date = date - pd.Timedelta(days=x)
            end_date = date + pd.Timedelta(days=x)
            mask.loc[start_date:end_date, ticker] = True

    # Debug print: Mask
    print("Earnings date mask head:", mask.head())

    # Apply masks to filter based on earnings dates
    filtered_vol_earnings = df_vol_moneyness[mask]

    # Debug print: Filtered based on earnings dates
    print("Filtered Volatility Data (after earnings dates mask) head:", filtered_vol_earnings.head())
    
    # Apply masks to filter based on index composition
    compo_mask = compo_aligned > 0
    filtered_vol_compo = df_vol_moneyness[compo_mask]

    # Debug print: Filtered based on composition
    print("Filtered Volatility Data (after composition mask) head:", filtered_vol_compo.head())

    # Combine both masks to filter the final DataFrame
    combined_mask = mask & compo_mask
    filtered_vol = df_vol_moneyness[combined_mask]
    
    # Fill missing values
    filtered_vol = filtered_vol.ffill().bfill()

    # Debug print: Filtered data
    print("Filtered Volatility Data head:", filtered_vol.head())
    
    # Report missing earnings dates and percentage of time each stock is part of the index composition
    missing_dates_report = earning_dates.isna().sum()
    compo_percentage = compo.notna().sum() / len(compo)
    
    # Create a debug report DataFrame
    debug_report = pd.DataFrame({
        'Missing Earnings Dates': missing_dates_report,
        'Composition Percentage': compo_percentage
    })
    
    return filtered_vol, debug_report

def rebase_around_earnings(df_vol_moneyness, earning_dates, x=21, method="normalize"):
    """
    Rebase volatility data around earnings dates, setting earnings date as day 0 and including x days before and after.
    
    Args:
    - df_vol_moneyness (pd.DataFrame): Implied volatility time series data.
    - earning_dates (pd.DataFrame): Earnings dates for each stock.
    - x (int): Number of days before and after earnings dates to include. Default is 21.
    - method (str): Method to rebase data, either 'normalize' or 'spread'. Default is 'normalize'.
    
    Returns:
    - pd.DataFrame: Rebases volatility data with a multi-level index (stock ticker and relative day).
    """
    rebased_data = []

    for ticker in earning_dates.columns:
        for date in earning_dates[ticker].dropna():
            start_date = date - pd.Timedelta(days=x)
            end_date = date + pd.Timedelta(days=x)
            data_slice = df_vol_moneyness.loc[start_date:end_date, ticker].copy()

            if len(data_slice) == 2 * x + 1:
                # Fill missing data with the last available previous value
                data_slice.fillna(method='ffill', inplace=True)
                
                if method == "normalize":
                    # Normalize the data slice
                    base_value = data_slice.loc[date]
                    rebased_slice = (data_slice / base_value) * 100
                elif method == "spread":
                    # Calculate spread (difference from previous day)
                    rebased_slice = data_slice.diff().fillna(0)
                
                rebased_slice.index = range(-x, x + 1)
                rebased_slice = rebased_slice.to_frame(name=ticker)
                rebased_slice['Ticker'] = ticker
                rebased_data.append(rebased_slice)
            else:
                print(f"Data slice for {ticker} at {date} is not of expected length: {len(data_slice)}")

    if not rebased_data:
        raise ValueError("No objects to concatenate. Ensure data slices are of expected length.")

    # Combine all rebased slices into a single DataFrame
    rebased_df = pd.concat(rebased_data).reset_index()
    rebased_df = rebased_df.set_index(['Ticker', 'index'])
    rebased_df.index.names = ['Ticker', 'Relative Day']
    return rebased_df

# Call the align_data function to get filtered_vol and debug_report separately
filtered_vol, debug_report = align_data(df_vol_moneyness, compo, earning_dates, x=21, alignment_method="nearest_neighbor")

# Print the heads of the filtered_vol and debug_report DataFrames
print("Filtered Volatility Data (head):")
print(filtered_vol.head())
print("\nDebug Report (head):")
print(debug_report.head())

# Rebase the filtered volatility data around earnings dates using normalization
try:
    rebased_vol_normalize = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="normalize")
    print("Rebased Volatility Data (Normalize) (head):")
    print(rebased_vol_normalize.head())
except ValueError as e:
    print(e)

# Rebase the filtered volatility data around earnings dates using spread method
try:
    rebased_vol_spread = rebase_around_earnings(filtered_vol, earning_dates, x=21, method="spread")
    print("Rebased Volatility Data (Spread) (head):")
    print(rebased_vol_spread.head())
except ValueError as e:
    print(e)
