In [20]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np
from simulations_creation import parameters

def define_tscv_size_multiple(raw_multiple, candidates=None):
    """Snap a raw ratio to the closest allowed TSCV size multiple.

    Parameters
    ----------
    raw_multiple : float
        Observed ratio to be matched against the allowed multiples.
    candidates : iterable of numbers, optional
        Allowed multiples to choose from. When omitted, the values in
        ``parameters["tscv_size_multiple"]`` are used.

    Returns
    -------
    The element of ``candidates`` with the smallest absolute distance to
    ``raw_multiple``. On a tie, the candidate that appears first wins.

    Raises
    ------
    ValueError
        If there are no candidates to choose from.

    Notes
    -----
    A NaN ``raw_multiple`` makes every distance NaN, so ``min`` falls back to
    the first candidate rather than propagating the NaN.
    """
    if candidates is None:
        candidates = parameters["tscv_size_multiple"]
    # Materialise once so one-shot iterables can be both checked and searched.
    candidates = list(candidates)
    if not candidates:
        raise ValueError("no candidate TSCV size multiples to choose from")
    return min(candidates, key=lambda multiple: abs(multiple - raw_multiple))


In [2]:
# Experimental factors whose effect on the Sharpe ratio is analysed below.
factors = [
    "testing_window",
    "training_window",
    "tscv_size_multiple",
    "n_tscv",
    "year",
    "n_assets",
]
dependent_variable = ["sharpe"]

yearly_performance_metrics = pd.read_csv("output/simulations_yearly_performance_metrics.csv")

# Derive the TSCV size multiple (tscv_size / training_window snapped to the
# nearest configured value), keep only the factor and response columns, and
# cast every factor to str so the later cells treat them as group labels.
yearly_performance_metrics_filtered = (
    yearly_performance_metrics
    .assign(
        tscv_size_multiple=lambda frame: (
            frame["tscv_size"] / frame["training_window"]
        ).map(define_tscv_size_multiple)
    )
    [factors + dependent_variable]
    .astype({factor: str for factor in factors})
)

In [111]:
# Human-readable labels for the ANOVA table rows (model terms) ...
anova_row_labels = {
    "C(testing_window)": "Testing Windows (Days before Rebalancing)",
    "C(training_window)": "Training Window (Nº Training Days)",
    "C(tscv_size_multiple)": "TSCV Size Multiple (Days in TSCV as % of Training Window)",
    "C(n_tscv)": "Nº TSCV (Time-Series Cross-Validation Folds)",
    "C(year)": "Year",
    "C(n_assets)": "Nº Assets",
}
# ... and for its columns (statistics).
anova_column_labels = {
    "sum_sq": "Sum of Squares",
    "df": "d.f.",
    "F": "F-Statistic",
    "PR(>F)": "p-value",
}

# Main-effects-only OLS: every factor enters as a categorical term, with no
# interaction terms.
model = ols(
    'sharpe ~ C(testing_window) + C(training_window) + C(tscv_size_multiple) + C(n_tscv) + C(year) + C(n_assets)',
    data=yearly_performance_metrics_filtered,
).fit()

# Type II ANOVA table, relabelled for the report; missing statistics
# are shown as "-".
anova_results = (
    sm.stats.anova_lm(model, typ=2)
    .rename(index=anova_row_labels, columns=anova_column_labels)
    .replace({pd.NA: "-", np.nan: "-"})
)
anova_results.to_latex(
    "reports/tables/anova/no_interaction.tex",
    index=True,
    escape=True,
    float_format="%.2f",
)
anova_results

Unnamed: 0,Sum of Squares,d.f.,F-Statistic,p-value
Testing Windows (Days before Rebalancing),2.266374,6.0,4.50453,0.000143
Training Window (Nº Training Days),20.37714,3.0,81.001133,0.0
TSCV Size Multiple (Days in TSCV as % of Training Window),3.205092,4.0,9.555417,0.0
Nº TSCV (Time-Series Cross-Validation Folds),23.841066,7.0,40.615964,0.0
Year,104587.323917,21.0,59392.127166,0.0
Nº Assets,49.587668,4.0,147.836888,0.0
Residual,10327.124153,123154.0,-,-


In [107]:
def calc_tukey_pairwise_analysis(metric_opt, alpha=0.1):
    """Run Tukey's HSD on the Sharpe ratio across the levels of one factor.

    Reads the module-level ``yearly_performance_metrics_filtered`` frame,
    compares ``sharpe`` between every pair of groups defined by
    ``metric_opt``, keeps only the pairs flagged as rejected, writes them to
    ``reports/tables/tukeyhsd/{metric_opt}.tex`` and returns them.

    Parameters
    ----------
    metric_opt : str
        Name of the grouping column in ``yearly_performance_metrics_filtered``.
    alpha : float, default 0.1
        Significance level forwarded to ``pairwise_tukeyhsd``.

    Returns
    -------
    pandas.DataFrame
        Columns "Group 1", "Group 2", "Mean Difference", "Adjusted p-value",
        "Lower Bound" and "Upper Bound", one row per rejected pair. The index
        is each row's position in the full summary table (header is row 0),
        so it is not contiguous.
    """
    df = yearly_performance_metrics_filtered
    tukey = pairwise_tukeyhsd(endog=df["sharpe"], groups=df[metric_opt], alpha=alpha)

    def _cell_text(cell):
        # Summary cells are table-cell objects, not plain values: render each
        # one as text and strip the padding that the fixed width adds.
        return cell.format(100).replace(" ", "")

    raw_columns = ["group1", "group2", "meandiff", "p-adj", "lower", "upper", "reject"]
    numeric_columns = ["meandiff", "p-adj", "lower", "upper"]
    report_columns = {
        "group1": "Group 1",
        "group2": "Group 2",
        "meandiff": "Mean Difference",
        "p-adj": "Adjusted p-value",
        "lower": "Lower Bound",
        "upper": "Upper Bound",
    }

    tukey_analysis = (
        pd.DataFrame(tukey.summary())
        .iloc[1:]  # row 0 of the summary table is its header
        .set_axis(raw_columns, axis=1)
        .apply(lambda column: column.map(_cell_text).astype(str))
        .astype({column: float for column in numeric_columns})
        .rename(report_columns, axis=1)
        # "reject" was rendered to text above, hence the string comparison.
        .loc[lambda table: table["reject"] == "True"]
        .loc[:, list(report_columns.values())]
    )
    tukey_analysis.to_latex(
        f"reports/tables/tukeyhsd/{metric_opt}.tex",
        index=False,
        escape=True,
        float_format="%.4f",
    )
    return tukey_analysis

In [106]:
# Run the Tukey HSD comparison for every factor. Each call also writes its
# LaTeX table; the header and frame are printed so the result of every
# iteration is visible, not only the last one assigned to `tukey_analysis`.
for metric_opt in factors:
    tukey_analysis = calc_tukey_pairwise_analysis(metric_opt)
    print(f"Tukey Analysis for {metric_opt}:")
    print(tukey_analysis)

Tukey Analysis for testing_window:
Empty DataFrame
Columns: [Group 1, Group 2, Mean Difference, Adjusted p-value, Lower Bound, Upper Bound]
Index: []
Tukey Analysis for training_window:
  Group 1 Group 2  Mean Difference  Adjusted p-value  Lower Bound  Upper Bound
5     252      63          -0.0306            0.0005      -0.0485      -0.0128
6     504      63          -0.0309            0.0004      -0.0487      -0.0130
Tukey Analysis for tscv_size_multiple:
Empty DataFrame
Columns: [Group 1, Group 2, Mean Difference, Adjusted p-value, Lower Bound, Upper Bound]
Index: []
Tukey Analysis for n_tscv:
  Group 1 Group 2  Mean Difference  Adjusted p-value  Lower Bound  Upper Bound
1       1       2          -0.0438            0.0018      -0.0744      -0.0132
2       1       3          -0.0388            0.0100      -0.0694      -0.0082
3       1       4          -0.0390            0.0094      -0.0696      -0.0084
4       1       5          -0.0426            0.0028      -0.0732      -0.0120
5