In [71]:
import pandas as pd
from dateutil.relativedelta import relativedelta
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import warnings
warnings.filterwarnings("ignore")
import math
import os
from datetime import date, timedelta, datetime
import time
from tqdm import tqdm
import pyodbc
import seaborn as sns
from scipy import stats
import xlsxwriter
from matplotlib.ticker import MaxNLocator
from matplotlib.backends.backend_pdf import PdfPages
start_time = time.perf_counter()

In [72]:
# Load CSVs into DataFrames (replace with actual file paths)
lowvol_long = pd.read_csv('LowVolLongNAV.csv').drop(columns=['Unnamed: 0'])
lowvol_short = pd.read_csv('LowVolShortNAV.csv').drop(columns=['Unnamed: 0'])

value_long = pd.read_csv('ValueLongNAV.csv').drop(columns=['Unnamed: 0'])
value_short = pd.read_csv('ValueShortNAV.csv').drop(columns=['Unnamed: 0'])

mom_long = pd.read_csv('MomLongNAV.csv').drop(columns=['Unnamed: 0'])
mom_short = pd.read_csv('MomShortNAV.csv').drop(columns=['Unnamed: 0'])

# Merge long and short for each category on 'Date'
lowvol_df = pd.merge(lowvol_long, lowvol_short, on='Date', suffixes=('_long', '_short'))
value_df = pd.merge(value_long, value_short, on='Date', suffixes=('_long', '_short'))
mom_df = pd.merge(mom_long, mom_short, on='Date', suffixes=('_long', '_short'))
# Function to calculate daily returns, long-short returns, and NAV
def calculate_returns_and_nav(df, factor_name):

    # Calculate Daily Returns for both long and short portfolios
    df['long_return'] = df['Close_long'].pct_change()
    df['short_return'] = df['Close_short'].pct_change()

    # Calculate the Long-Short Return
    df['long_short_return'] = df['long_return'] - df['short_return']

    # Rebase the Long-Short Return to calculate NAV
    df[f'{factor_name}_LS_NAV'] = (1 + df['long_short_return']).cumprod() * 100

    # Keep only the 'Date', 'NAV_Long', and 'Factor_LS_NAV' columns
    return df[['Date', f'{factor_name}_LS_NAV']]

# Apply the function to each category
lowvol_result = calculate_returns_and_nav(lowvol_df, 'LowVol')
value_result = calculate_returns_and_nav(value_df, 'Value')
mom_result = calculate_returns_and_nav(mom_df, 'Mom')

# Load the CSVs for Quality and LowVol
quality_long = pd.read_csv('QualityNAV.csv')
lowvol_short = pd.read_csv('LowVolShortNAV.csv').drop(columns=['Unnamed: 0'])

# Merge Quality Long with LowVol Short on 'Date'
quality_df = pd.merge(quality_long, lowvol_short, on='Date', suffixes=('_long', '_short'))

# Function to calculate Quality Long-Short NAV using LowVol's short returns
def calculate_quality_long_short_nav(df):
    # Calculate Daily Return for the Quality Long portfolio
    df['long_return'] = df['Close_long'].pct_change()

    # Use LowVol's short returns for the short leg
    df['short_return'] = df['Close_short'].pct_change()

    # Calculate Long-Short Return
    df['long_short_return'] = df['long_return'] - df['short_return']

    # Rebase the Long-Short Return to calculate NAV
    df['Quality_LS_NAV'] = (1 + df['long_short_return']).cumprod() * 100

    # Keep only the necessary columns: Date, NAV_Long, and the new Quality_LS_NAV
    return df[['Date' ,'Quality_LS_NAV']]

# Apply the function to the merged DataFrame
quality_result = calculate_quality_long_short_nav(quality_df)

In [74]:
# Ensure all Date columns are in datetime format before merging
lowvol_result['Date'] = pd.to_datetime(lowvol_result['Date'])
value_result['Date'] = pd.to_datetime(value_result['Date'])
mom_result['Date'] = pd.to_datetime(mom_result['Date'])
quality_result['Date'] = pd.to_datetime(quality_result['Date'])

# Merge all results
merged_result = pd.merge(lowvol_result, value_result, on='Date', how='outer')
merged_result = pd.merge(merged_result, mom_result, on='Date', how='outer')
merged_result = pd.merge(merged_result, quality_result, on='Date', how='outer')

# Sort and drop NaN values cautiously
merged_result = merged_result.sort_values(by='Date')

# Check for missing data before dropping
print(merged_result.isna().sum())  # Identify missing data per column

# Drop only if you are sure it’s needed
merged_result = merged_result.dropna()

# Ensure Date is the index and in correct format
merged_result.set_index('Date', inplace=True)
merged_result.index = pd.to_datetime(merged_result.index)

# Verify final output
print(merged_result.head())
print(merged_result.tail())


Date              0
LowVol_LS_NAV     2
Value_LS_NAV      2
Mom_LS_NAV        1
Quality_LS_NAV    2
dtype: int64
            LowVol_LS_NAV  Value_LS_NAV  Mom_LS_NAV  Quality_LS_NAV
Date                                                               
2008-02-12     114.579716    101.379808  104.393410       99.634496
2008-03-11      95.946576    107.545863   94.355441       89.194295
2008-03-12     113.447356    100.771103  103.416353      100.660266
2008-04-11      92.080594    107.188479   94.631684       90.241873
2008-05-12     106.967390     97.737971  100.503140       99.467992
            LowVol_LS_NAV  Value_LS_NAV  Mom_LS_NAV  Quality_LS_NAV
Date                                                               
2025-01-14      42.978996    163.532547  809.864308       31.592801
2025-01-15      42.953132    162.882903  805.724957       31.682044
2025-01-16      41.943974    165.281858  814.498894       31.351472
2025-01-17      41.555501    163.963328  824.373420       30.820443
202

In [28]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# def calculate_rolling_correlations(df, window_years):
#     window = window_years * 252  # Assuming 252 trading days per year
#     pairs = ['LowVol_LS_NAV-Value_LS_NAV', 'LowVol_LS_NAV-Mom_LS_NAV', 'LowVol_LS_NAV-Quality_LS_NAV',
#              'Value_LS_NAV-Mom_LS_NAV', 'Value_LS_NAV-Quality_LS_NAV', 'Mom_LS_NAV-Quality_LS_NAV']
    
#     corr_df = pd.DataFrame(index=df.index)
    
#     for pair in pairs:
#         factor1, factor2 = pair.split('-')
#         corr_df[f'{pair}_{window_years}Y'] = df[factor1].rolling(window).corr(df[factor2])
    
#     return corr_df

# def plot_correlations(df, corr_df, window_years, save_path):
#     # Increase figure size and DPI for better quality
#     plt.figure(figsize=(20, 15), dpi=300)
    
#     pairs = ['LowVol_LS_NAV-Value_LS_NAV', 'LowVol_LS_NAV-Mom_LS_NAV', 'LowVol_LS_NAV-Quality_LS_NAV',
#              'Value_LS_NAV-Mom_LS_NAV', 'Value_LS_NAV-Quality_LS_NAV', 'Mom_LS_NAV-Quality_LS_NAV']
    
#     for i, pair in enumerate(pairs, 1):
#         plt.subplot(2, 3, i)
        
#         # Plot with improved styling
#         plt.plot(corr_df[f'{pair}_{window_years}Y'], linewidth=1.5, color='#1f77b4')
        
#         # Improve title formatting
#         plt.title(f'{window_years}Y Rolling Correlation:\n{pair}', pad=20, fontsize=12)
        
#         # Improve axis formatting
#         plt.grid(True, linestyle='--', alpha=0.7)
#         plt.ylim(-1, 1)  # Set fixed y-axis limits for correlation
        
#         # Format x-axis
#         plt.xticks(rotation=45, ha='right')
        
#         # Add y-axis label
#         plt.ylabel('Correlation')
        
#         # Add horizontal lines at important correlation levels
#         plt.axhline(y=0, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
#         plt.axhline(y=0.5, color='gray', linestyle='--', linewidth=0.5, alpha=0.3)
#         plt.axhline(y=-0.5, color='gray', linestyle='--', linewidth=0.5, alpha=0.3)
    
#     # Adjust layout to prevent cutoff
#     plt.tight_layout(pad=3.0)
    
#     # Save with high quality
#     plt.savefig(f'{save_path}_{window_years}Y_correlations.pdf', 
#                 bbox_inches='tight', 
#                 pad_inches=0.5)
#     plt.close()

# def calculate_full_period_correlations(df):
#     corr_matrix = df.corr()
#     corr_matrix.to_csv('full_period_correlations.csv')
#     return corr_matrix

# # Run the analysis
# def run_correlation_analysis(merged_result):
#     # Calculate rolling correlations for different windows
#     corr_1y = calculate_rolling_correlations(merged_result, 1)
#     corr_3y = calculate_rolling_correlations(merged_result, 3)
#     corr_5y = calculate_rolling_correlations(merged_result, 5)

#     # Create plots
#     plot_correlations(merged_result, corr_1y, 1, 'rolling')
#     plot_correlations(merged_result, corr_3y, 3, 'rolling')
#     plot_correlations(merged_result, corr_5y, 5, 'rolling')

#     # Calculate and save full-period correlations
#     full_period_corr = calculate_full_period_correlations(merged_result)
    
#     return corr_1y, corr_3y, corr_5y, full_period_corr

# # Execute
# corr_1y, corr_3y, corr_5y, full_period_corr = run_correlation_analysis(merged_result)

In [75]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.dates import YearLocator, DateFormatter
import matplotlib.dates as mdates

def plot_individual_correlations(df, window_years=1, save_path='factor_correlations.pdf'):
    # Ensure 'date' column exists and set it as index
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        df = df.set_index('date')
    
    # Drop any rows with NaN values to prevent incorrect date handling
    df = df.dropna()
    
    # Calculate window size (252 trading days per year)
    window = window_years * 252
    
    # Define all factor pairs
    pairs = [
        ('LowVol_LS_NAV', 'Value_LS_NAV'),
        ('LowVol_LS_NAV', 'Mom_LS_NAV'),
        ('LowVol_LS_NAV', 'Quality_LS_NAV'),
        ('Value_LS_NAV', 'Mom_LS_NAV'),
        ('Value_LS_NAV', 'Quality_LS_NAV'),
        ('Mom_LS_NAV', 'Quality_LS_NAV')
    ]
    
    # Create PDF
    with PdfPages(save_path) as pdf:
        for factor1, factor2 in pairs:
            # Calculate rolling correlation
            rolling_corr = df[factor1].rolling(window).corr(df[factor2])
            rolling_corr = rolling_corr.dropna()
            
            expanding_corr = df[factor1].expanding().corr(df[factor2])
            expanding_corr = expanding_corr.dropna()
            
            # Calculate statistics
            mean_corr = rolling_corr.mean()
            min_corr = rolling_corr.min()
            max_corr = rolling_corr.max()
            std_corr = rolling_corr.std()
            
            # Calculate expanding percentiles
            p10 = expanding_corr.expanding().quantile(0.10)
            p75 = expanding_corr.expanding().quantile(0.75)
            p95 = expanding_corr.expanding().quantile(0.95)
            
            # Create figure
            fig, ax = plt.subplots(figsize=(12, 6), dpi=300)
            
            # Plot correlation
            ax.plot(rolling_corr.index, rolling_corr.values, 
                   linewidth=1.5, 
                   color='#1f77b4', 
                   label=f'1Y Rolling Correlation')
            
            # Plot percentile bands
            ax.plot(p10.index, p10.values, linestyle='dashed', color='gray', alpha=0.6, label='10th Percentile')
            ax.plot(p75.index, p75.values, linestyle='dashed', color='green', alpha=0.6, label='75th Percentile')
            ax.plot(p95.index, p95.values, linestyle='dashed', color='red', alpha=0.6, label='95th Percentile')
            
            # Add horizontal lines
            ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
            ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=0.5, alpha=0.3)
            ax.axhline(y=-0.5, color='gray', linestyle='--', linewidth=0.5, alpha=0.3)
            
            # Add mean correlation line
            ax.axhline(y=mean_corr, color='red', linestyle='--', linewidth=1, alpha=0.5,
                      label=f'Mean: {mean_corr:.2f}')
            
            # Add statistics box
            stats_text = (f'Mean: {mean_corr:.2f}\n'
                         f'Min: {min_corr:.2f}\n'
                         f'Max: {max_corr:.2f}\n'
                         f'Std Dev: {std_corr:.2f}')
            
            # Place stats box in upper left corner
            plt.text(0.02, 0.98, stats_text,
                    transform=ax.transAxes,
                    bbox=dict(facecolor='white', alpha=0.8, edgecolor='none'),
                    verticalalignment='top',
                    fontsize=10)
            
            # Customize plot
            plt.title(f'{factor1} vs {factor2}\n1-Year Rolling Correlation', 
                     pad=20, 
                     fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.ylim(-1, 1)
            
            # Format axes
            plt.ylabel('Correlation')
            plt.xlabel('Date')
            
            # Format x-axis dates
            ax.xaxis.set_major_locator(mdates.YearLocator())
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
            fig.autofmt_xdate()  # Automatically format dates
            
            # Add legend
            plt.legend(loc='upper right')
            
            # Adjust layout
            plt.tight_layout()
            
            # Save to PDF
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
        
        # Create and save summary statistics
        summary_stats = pd.DataFrame(columns=['Pair', 'Mean Correlation', 'Min', 'Max', 'Std Dev'])
        
        for factor1, factor2 in pairs:
            corr = df[factor1].rolling(window).corr(df[factor2])
            new_row = pd.DataFrame({
                'Pair': [f'{factor1} vs {factor2}'],
                'Mean Correlation': [corr.mean()],
                'Min': [corr.min()],
                'Max': [corr.max()],
                'Std Dev': [corr.std()]
            })
            summary_stats = pd.concat([summary_stats, new_row], ignore_index=True)
        
        summary_stats.to_csv('correlation_summary_stats.csv', index=False)
        return summary_stats

# Run the analysis
summary_stats = plot_individual_correlations(merged_result)
print("\nAnalysis complete. Correlation plots have been saved to a single PDF file.")



Analysis complete. Correlation plots have been saved to a single PDF file.


In [63]:
print(merged_result.index[merged_result.index < '2000-01-01'])

DatetimeIndex([], dtype='datetime64[ns]', name='Date', freq=None)
