In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np


In [2]:
import math

def format_mean_sem(mean, sem):
    if sem == 0 or np.isnan(sem):
        return f"{mean:.4f}(0)"  # fallback
    
    exponent = int(math.floor(np.log10(sem)))
    precision = -exponent
    digits = max(0, precision)

    sem_rounded = round(sem, digits)
    mean_rounded = round(mean, digits)
    sem_for_latex = int(round(sem_rounded * (10 ** digits)))

    return f"{mean_rounded:.{digits}f}({sem_for_latex})"


# Collect data

In [3]:
from pathlib import Path
import re
import numpy as np


def build_dataset(p: Path):
    dfs = []
    for folder in p.glob('*'):
        for file in folder.glob('*.csv'):
            dfs.append(pd.read_csv(file))
            dfs[-1]['Seed'] = int(re.findall(r'\d+', file.name)[0])

    df = pd.concat(dfs)
    df['Loglr'] = df['Learning rate'].apply(lambda x: np.log10(x))

    activation_dict = {
        'activation.identity': "Identity",
        'activation.relu': "ReLU",
        'activation.leaky_relu': "LeakyReLU",
        'activation.sigmoid': "Sigmoid",
        'activation.dmelu': "DELU"
    }

    model_dict = {
        "model_type.hyperbolic" : "Hyperbolic",
        "model_type.euclidean" : "Euclidean",
        "model_type.eubn" : "Eucl + BN",
        "model_type.poincare" : "Poincare",
        "model_type.lorentz" : "Lorentz",
        "model_type.logr" : "Logistic"
    }

    df['Model'] = df['Model'].apply(lambda x: model_dict[x])
    df['Activation'] = df['Activation'].apply(lambda x: activation_dict[x])
    df['Type'] = df['Model'] + ' - ' + df['Activation']

    return df

# Regression tasks

In [4]:
dfs = []
for problem in ['sinc', 'sinc3', 'prod2', 'prod3', 'hyp']:
    df = build_dataset(Path('..//data/regression')/problem)
    df['Type'] = df['Model'] + '_' + df['Activation']
    df['Type'] = df['Type'].apply(lambda x: ' '.join(x.split('_')))
    df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
    df1['Problem'] = problem
    dfs.append(df1)

regression = pd.concat(dfs)

  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(min)
  df1 = df.groupby(

In [5]:
regression['R^2'] = 1 - regression['Normalized test loss']

loss_summary = (
    regression
    .groupby(['Problem', 'Type'])['R^2']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .rename(columns={'mean': 'Mean_R^2', 'std': 'Std_R^2', 'count': 'N'})
)

In [6]:
loss_summary

Unnamed: 0,Problem,Type,Mean_R^2,Std_R^2,N
0,hyp,Euclidean DELU,0.853256,0.017045,10
1,hyp,Hyperbolic DELU,0.972528,0.004905,10
2,hyp,Hyperbolic Identity,0.931343,0.075905,10
3,hyp,Poincare DELU,0.965276,0.005898,10
4,prod2,Euclidean DELU,0.999126,0.000255,10
5,prod2,Hyperbolic DELU,0.999485,9.3e-05,10
6,prod2,Hyperbolic Identity,0.999584,0.000127,10
7,prod2,Poincare DELU,0.997436,0.000497,10
8,prod3,Euclidean DELU,0.991176,0.001982,10
9,prod3,Hyperbolic DELU,0.990699,0.003797,10


In [7]:
ttest_results = []

for problem, group in regression.groupby("Problem"):

    r2_means = group.groupby("Type")["R^2"].mean()
    best_type = r2_means.idxmax()
    
    best_scores = group[group["Type"] == best_type]["R^2"]
    
    for type_ in group["Type"].unique():
        if type_ == best_type:
            continue
        
        comp_scores = group[group["Type"] == type_]["R^2"]
        
        t_stat, p_val = ttest_ind(best_scores, comp_scores, equal_var=False)  # Welch's t-test
        
        ttest_results.append({
            "Problem": problem,
            "Best_Type": best_type,
            "Compared_Type": type_,
            "T_stat": t_stat,
            "P_value": p_val,
            "Best_Mean": best_scores.mean(),
            "Compared_Mean": comp_scores.mean(),
            "Best_Better": best_scores.mean() > comp_scores.mean() and p_val < 0.05
        })

ttest_df = pd.DataFrame(ttest_results)


In [8]:
ttest_df

Unnamed: 0,Problem,Best_Type,Compared_Type,T_stat,P_value,Best_Mean,Compared_Mean,Best_Better
0,hyp,Hyperbolic DELU,Euclidean DELU,21.264652,5.833632e-10,0.972528,0.853256,True
1,hyp,Hyperbolic DELU,Hyperbolic Identity,1.71224,0.1207287,0.972528,0.931343,False
2,hyp,Hyperbolic DELU,Poincare DELU,2.989565,0.008071866,0.972528,0.965276,True
3,prod2,Hyperbolic Identity,Euclidean DELU,5.07986,0.0002006205,0.999584,0.999126,True
4,prod2,Hyperbolic Identity,Hyperbolic DELU,1.981341,0.06444789,0.999584,0.999485,False
5,prod2,Hyperbolic Identity,Poincare DELU,13.23802,9.675146e-08,0.999584,0.997436,True
6,prod3,Euclidean DELU,Hyperbolic DELU,0.352386,0.7299634,0.991176,0.990699,False
7,prod3,Euclidean DELU,Hyperbolic Identity,5.821767,2.560083e-05,0.991176,0.984788,True
8,prod3,Euclidean DELU,Poincare DELU,2.731676,0.0136976,0.991176,0.988764,True
9,sinc,Hyperbolic Identity,Euclidean DELU,19.792173,8.435626e-09,0.988278,0.859017,True


In [9]:
problems_dict = {"sinc": "$\mathrm{Sinc}(\|x\|_2)$", 
                 "sinc3":"$\mathrm{Sinc}(\|x\|_3)$", 
                 "prod3":"$x_0 + x_0x_1 + x_0x_1x_2$", 
                 "prod2":"$x_0 + x_0x_1$", 
                 "hyp":"$\\frac{1}{n}\left(\sum_i^{n-1} x_i^2 - x_n^2\\right)$"
                 }

models_dict = {"Poincare DELU": "Poincaré + DiLU",
               "Hyperbolic Identity": "Cartan",
               "Hyperbolic DELU": "Cartan + DiLU",
               "Euclidean DELU": "Euclidean + DiLU",
               "Lorentz DELU": "Lorentz + DiLU",
               }

  problems_dict = {"sinc": "$\mathrm{Sinc}(\|x\|_2)$",
  "sinc3":"$\mathrm{Sinc}(\|x\|_3)$",
  "hyp":"$\\frac{1}{n}\left(\sum_i^{n-1} x_i^2 - x_n^2\\right)$"


In [10]:
n = loss_summary["N"].mean()
loss_summary['SEM'] = loss_summary['Std_R^2'] / np.sqrt(n)

problems = loss_summary['Problem'].unique()
types = loss_summary['Type'].unique()

best_types_per_problem = {}
for prob in problems:
    sig_bests = ttest_df[(ttest_df["Problem"] == prob) & (ttest_df["Best_Better"] == True)]
    if not sig_bests.empty:
        best_type = sig_bests.iloc[0]["Best_Type"]
    else:
        best_type = loss_summary[loss_summary["Problem"] == prob].sort_values("R^2", ascending=False).iloc[0]["Type"]
    best_types_per_problem[prob] = best_type

header = r"""\centering
\caption{$R^2$ on toy regression datasets (mean $\pm$ std, $n$ = """ + str(int(n)) + r""")}\label{tab:regression}
\begin{tabular}{
  l""" + "  " + "  ".join(["S[table-format=1.3(2)]" for _ in types]) + r"""}
\toprule
\textbf{Problem} & """ + " & ".join([f"{{{models_dict[t]}}}" for t in types]) + r""" \\
\midrule
"""

rows = []
for prob in problems:
    row = [problems_dict[prob]]
    for t in types:
        entry = loss_summary[(loss_summary['Problem'] == prob) & (loss_summary['Type'] == t)]
        if not entry.empty:
            mean = entry['Mean_R^2'].values[0]
            sem = entry['Std_R^2'].values[0]
            formatted = format_mean_sem(mean, sem)
            if best_types_per_problem[prob] == t:
                formatted = r"\cellcolor{yellow!30}{\num{" + formatted + "}}"
            else:
                formatted = r"\num{" + formatted + "}"
        else:
            formatted = "-"
        row.append(formatted)
    rows.append(" & ".join(row) + r" \\")

footer = r"""\bottomrule
\end{tabular}
"""

latex_table = header + "\n".join(rows) + "\n" + footer


In [11]:
with open("regression.txt", "w") as text_file:
    text_file.write(latex_table)

# Classification

In [12]:
dfs = []
for problem in ['mnist', 'fmnist', 'kmnist', 'cifar10']:
    df = build_dataset(Path("../data/classification")/problem)
    df['Type'] = df['Model'] + '_' + df['Activation']
    df['Type'] = df['Type'].apply(lambda x: ' '.join(x.split('_')))
    df['Problem'] = problem
    df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
    dfs.append(df1)

classification = pd.concat(dfs)
classification

  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)


  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)
  df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Seed', 'Type', 'Neurons'], as_index=False).apply(max)


Unnamed: 0.1,Unnamed: 0,Model,Activation,Test accuracy,Test loss,Train accuracy,Train loss,Epoch,Learning rate,Weight decay,Neurons,Nlayers,Train hyperbolicities,Test hyperbolicities,Time,Seed,Loglr,Type,Problem
0,146,Euclidean,DELU,0.9646,0.801424,0.977388,1.457546,146,0.0001,0.00001,20,2,,,1379.658705,66,-4.0,Euclidean DELU,mnist
1,171,Euclidean,DELU,0.9651,0.829703,0.981357,1.501590,171,0.0001,0.00001,20,2,,,1600.686154,454,-4.0,Euclidean DELU,mnist
2,139,Euclidean,DELU,0.9637,0.794452,0.980023,1.476443,139,0.0001,0.00001,20,2,,,1312.862182,1510,-4.0,Euclidean DELU,mnist
3,168,Euclidean,DELU,0.9657,0.816425,0.980890,1.504859,168,0.0001,0.00001,20,2,,,1553.072401,3682,-4.0,Euclidean DELU,mnist
4,160,Euclidean,DELU,0.9620,0.791865,0.977872,1.507620,160,0.0001,0.00001,20,2,,,1477.297243,4932,-4.0,Euclidean DELU,mnist
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,217,Poincare,DELU,0.4752,2.029708,0.509363,2.155364,217,0.0001,0.00001,20,2,,,4226.195787,4524,-4.0,Poincare DELU,cifar10
46,200,Poincare,DELU,0.4766,2.039002,0.515745,2.161585,200,0.0001,0.00001,20,2,,,3937.594336,5036,-4.0,Poincare DELU,cifar10
47,224,Poincare,DELU,0.4788,2.031615,0.518886,2.158645,224,0.0001,0.00001,20,2,,,5178.055401,6004,-4.0,Poincare DELU,cifar10
48,196,Poincare,DELU,0.4771,2.066206,0.512184,2.169173,196,0.0001,0.00001,20,2,,,3889.882892,6416,-4.0,Poincare DELU,cifar10


In [13]:
loss_summary = (
    classification
    .groupby(['Problem', 'Type'])['Test accuracy']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .rename(columns={'mean': 'Mean_accuracy', 'std': 'Std_accuracy', 'count': 'N'})
)

In [14]:
loss_summary

Unnamed: 0,Problem,Type,Mean_accuracy,Std_accuracy,N
0,cifar10,Euclidean DELU,0.47728,0.003342,10
1,cifar10,Hyperbolic DELU,0.47557,0.00321,10
2,cifar10,Hyperbolic Identity,0.44391,0.007259,10
3,cifar10,Logistic Identity,0.41036,0.001729,10
4,cifar10,Poincare DELU,0.47478,0.002962,10
5,fmnist,Euclidean DELU,0.86823,0.002337,10
6,fmnist,Hyperbolic DELU,0.86906,0.001828,10
7,fmnist,Hyperbolic Identity,0.85588,0.002997,10
8,fmnist,Logistic Identity,0.84899,0.000528,10
9,fmnist,Poincare DELU,0.87363,0.001989,10


In [15]:
ttest_results = []

for problem, group in classification.groupby("Problem"):

    classification_means = group.groupby("Type")["Test accuracy"].mean()
    best_type = classification_means.idxmax()
    
    best_scores = group[group["Type"] == best_type]["Test accuracy"]
    
    for type_ in group["Type"].unique():
        if type_ == best_type:
            continue
        
        comp_scores = group[group["Type"] == type_]["Test accuracy"]
        
        t_stat, p_val = ttest_ind(best_scores, comp_scores, equal_var=False)  # Welch's t-test
        
        ttest_results.append({
            "Problem": problem,
            "Best_Type": best_type,
            "Compared_Type": type_,
            "T_stat": t_stat,
            "P_value": p_val,
            "Best_Mean": best_scores.mean(),
            "Compared_Mean": comp_scores.mean(),
            "Best_Better": best_scores.mean() > comp_scores.mean() and p_val < 0.05
        })

ttest_df = pd.DataFrame(ttest_results)


In [16]:
ttest_df

Unnamed: 0,Problem,Best_Type,Compared_Type,T_stat,P_value,Best_Mean,Compared_Mean,Best_Better
0,cifar10,Euclidean DELU,Hyperbolic DELU,1.167007,0.2584601,0.47728,0.47557,False
1,cifar10,Euclidean DELU,Hyperbolic Identity,13.205037,9.041352e-09,0.47728,0.44391,True
2,cifar10,Euclidean DELU,Logistic Identity,56.246045,2.1084290000000003e-17,0.47728,0.41036,True
3,cifar10,Euclidean DELU,Poincare DELU,1.770561,0.0938117,0.47728,0.47478,False
4,fmnist,Poincare DELU,Euclidean DELU,5.563699,3.053853e-05,0.87363,0.86823,True
5,fmnist,Poincare DELU,Hyperbolic DELU,5.349385,4.491689e-05,0.87363,0.86906,True
6,fmnist,Poincare DELU,Hyperbolic Identity,15.604018,5.946313e-11,0.87363,0.85588,True
7,fmnist,Poincare DELU,Logistic Identity,37.855506,2.319778e-12,0.87363,0.84899,True
8,kmnist,Poincare DELU,Euclidean DELU,2.943935,0.00871528,0.82109,0.81401,True
9,kmnist,Poincare DELU,Hyperbolic DELU,5.447213,3.806301e-05,0.82109,0.8075,True


In [17]:
problems_dict = {"cifar10": "Cifar10", 
                 "fmnist":"FMNIST", 
                 "kmnist":"KMNIST", 
                 "mnist":"MNIST"
                 }

models_dict = {"Poincare DELU": "Poincaré + DiLU",
               "Hyperbolic Identity": "Cartan",
               "Hyperbolic DELU": "Cartan + DiLU",
               "Euclidean DELU": "Euclidean + DiLU",
               "Lorentz DELU": "Lorentz + DiLU",
               "Logistic Identity": "Logistic"
               }

In [18]:
best_types_per_problem

{'hyp': 'Hyperbolic DELU',
 'prod2': 'Hyperbolic Identity',
 'prod3': 'Euclidean DELU',
 'sinc': 'Hyperbolic Identity',
 'sinc3': 'Hyperbolic Identity'}

In [19]:
n = loss_summary["N"].mean()

problems = loss_summary['Problem'].unique()
types = loss_summary['Type'].unique()

best_types_per_problem = {}
for prob in problems:
    sig_bests = ttest_df[(ttest_df["Problem"] == prob) & (ttest_df["Best_Better"] == True)]
    if not sig_bests.empty:
        best_type = sig_bests.iloc[0]["Best_Type"]
    else:
        best_type = loss_summary[loss_summary["Problem"] == prob].sort_values("R^2", ascending=False).iloc[0]["Type"]
    best_types_per_problem[prob] = best_type

header = r"""\centering
\caption{Accuracy on real-world datasets (mean $\pm$ std, $n$ = """ + str(int(n)) + r""")}\label{tab:classification}
\begin{tabular}{
  l""" + "  " + "  ".join(["S[table-format=1.3(2)]" for _ in types]) + r"""}
\toprule
\textbf{Problem} & """ + " & ".join([f"{{{models_dict[t]}}}" for t in types]) + r""" \\
\midrule
"""

rows = []
for prob in problems:
    row = [problems_dict[prob]]
    for t in types:
        entry = loss_summary[(loss_summary['Problem'] == prob) & (loss_summary['Type'] == t)]
        if not entry.empty:
            mean = entry['Mean_accuracy'].values[0]
            sem = entry['Std_accuracy'].values[0]
            formatted = format_mean_sem(mean, sem)
            if best_types_per_problem[prob] == t:
                formatted = r"\cellcolor{yellow!30}{\num{" + formatted + "}}"
            else:
                prob_Df = ttest_df[ttest_df.Problem == prob]
                if prob_Df[prob_Df.Compared_Type == t].Best_Better.all() == False:
                    formatted = r"\cellcolor{yellow!30}{\num{" + formatted + "}}"
                else:
                    formatted = r"\num{" + formatted + "}"
        else:
            formatted = "-"
        row.append(formatted)
    rows.append(" & ".join(row) + r" \\")

footer = r"""\bottomrule
\end{tabular}
"""

latex_table = header + "\n".join(rows) + "\n" + footer


In [20]:
with open("classification.txt", "w") as text_file:
    text_file.write(latex_table)

# Best classification results

In [21]:
def build_dataset(p):
    dfs = []
    for folder in p.glob('*'):
        for file in folder.glob('*.csv'):
            dfs.append(pd.read_csv(file))
            dfs[-1]['Seed'] = int(re.findall(r'\d+', file.name)[0])

    df = pd.concat(dfs)
    df['Loglr'] = df['Learning rate'].apply(lambda x: np.log10(x))

    activation_dict = {
        'activation.identity': "",
        'activation.relu': "ReLU",
        'activation.leaky_relu': "LeakyReLU",
        'activation.sigmoid': "Sigmoid",
        'activation.dmelu': "DiLU"
    }

    model_dict = {
        "model_type.hyperbolic" : "Cartan",
        "model_type.euclidean" : "Euclidean",
        "model_type.eubn" : "Eucl + BN",
        "model_type.poincare" : "Poincare",
        "model_type.lorentz" : "Lorentz"
    }

    df['Model'] = df['Model'].apply(lambda x: model_dict[x])
    df['Activation'] = df['Activation'].apply(lambda x: activation_dict[x])
    df['Type'] = df['Model'] + ' - ' + df['Activation']

    return df

In [22]:
kmnist = build_dataset(Path('../data/kmnist/kmnist'))
cifar = build_dataset(Path('../data/cifar/cifar10'))

dfs = []
for df, prob in zip([kmnist, cifar], ['KMNIST', 'CIFAR10']):
    df['Type'] = df['Model'] + '_' + df['Activation']
    df['Type'] = df['Type'].apply(lambda x: ' '.join(x.split('_')))
    df1 = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons'], as_index=False).mean()
    df1['Problem'] = prob
    dfs.append(df1)


In [23]:
configs = [[],[]]
for df, dfc, prob in zip(dfs, configs, ['KMNIST', 'CIFAR10']):
    df = df[df['Neurons']<500]
    for type in df['Type'].unique():
        best_conf = df[df['Type']==type].sort_values(by = 'Test accuracy', ascending=False).iloc[0]
        neurons = best_conf['Neurons']
        nlayers = best_conf['Nlayers']
        dfc.append(
            {
                "Type":type,
                "Neurons":neurons,
                "Nlayers":nlayers,
                "Problem":prob
            }
        )

final_dfs = []
for df, config_list in zip([kmnist, cifar], configs):
    dfagg = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons','Seed'], as_index=False).apply(max)
    for config in config_list:
        temp = dfagg[np.logical_and(np.logical_and(
            dfagg['Type']==config['Type'],
            dfagg['Neurons'] == config['Neurons']
        ),
        dfagg['Nlayers']==config['Nlayers']
        )].copy()
        temp['Problem'] = config['Problem']
        final_dfs.append(temp)

best_class_df = pd.concat(final_dfs)

  dfagg = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons','Seed'], as_index=False).apply(max)
  dfagg = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons','Seed'], as_index=False).apply(max)
  dfagg = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons','Seed'], as_index=False).apply(max)
  dfagg = df.groupby(['Model', 'Activation', 'Nlayers', 'Type', 'Neurons','Seed'], as_index=False).apply(max)


In [24]:
loss_summary = (
    best_class_df
    .groupby(['Problem', 'Type'])['Test accuracy']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .rename(columns={'mean': 'Mean_accuracy', 'std': 'Std_accuracy', 'count': 'N'})
)

In [25]:
loss_summary

Unnamed: 0,Problem,Type,Mean_accuracy,Std_accuracy,N
0,CIFAR10,Cartan,0.50292,0.00695,5
1,CIFAR10,Cartan DiLU,0.53434,0.002985,5
2,CIFAR10,Euclidean DiLU,0.53892,0.002422,10
3,CIFAR10,Poincare DiLU,0.52645,0.002551,10
4,KMNIST,Cartan,0.84477,0.005032,10
5,KMNIST,Cartan DiLU,0.89178,0.002746,10
6,KMNIST,Euclidean DiLU,0.8863,0.003007,10
7,KMNIST,Poincare DiLU,0.87767,0.003514,10


In [26]:
ttest_results = []

for problem, group in best_class_df.groupby("Problem"):

    classification_means = group.groupby("Type")["Test accuracy"].mean()
    best_type = classification_means.idxmax()
    
    best_scores = group[group["Type"] == best_type]["Test accuracy"]
    
    for type_ in group["Type"].unique():
        if type_ == best_type:
            continue
        
        comp_scores = group[group["Type"] == type_]["Test accuracy"]
        
        t_stat, p_val = ttest_ind(best_scores, comp_scores, equal_var=False)  # Welch's t-test
        
        ttest_results.append({
            "Problem": problem,
            "Best_Type": best_type,
            "Compared_Type": type_,
            "T_stat": t_stat,
            "P_value": p_val,
            "Best_Mean": best_scores.mean(),
            "Compared_Mean": comp_scores.mean(),
            "Best_Better": best_scores.mean() > comp_scores.mean() and p_val < 0.05
        })

ttest_df = pd.DataFrame(ttest_results)


In [27]:
ttest_df

Unnamed: 0,Problem,Best_Type,Compared_Type,T_stat,P_value,Best_Mean,Compared_Mean,Best_Better
0,CIFAR10,Euclidean DiLU,Cartan,11.246025,0.0001846873,0.53892,0.50292,True
1,CIFAR10,Euclidean DiLU,Cartan DiLU,2.975386,0.02157442,0.53892,0.53434,True
2,CIFAR10,Euclidean DiLU,Poincare DiLU,11.209373,1.545149e-09,0.53892,0.52645,True
3,KMNIST,Cartan DiLU,Cartan,25.931597,3.482275e-13,0.89178,0.84477,True
4,KMNIST,Cartan DiLU,Euclidean DiLU,4.255641,0.0004834984,0.89178,0.8863,True
5,KMNIST,Cartan DiLU,Poincare DiLU,10.00522,1.5317e-08,0.89178,0.87767,True


In [28]:
problems_dict = {"CIFAR10": "Cifar10", 
                 "fmnist":"FMNIST", 
                 "KMNIST":"KMNIST", 
                 "mnist":"MNIST"
                 }

models_dict = {"Poincare DiLU": "Poincaré + DiLU",
               "Cartan ": "Cartan",
               "Cartan DiLU": "Cartan + DiLU",
               "Euclidean DiLU": "Euclidean + DiLU",
               "Lorentz DELU": "Lorentz + DiLU",
               }

In [29]:
problems = loss_summary['Problem'].unique()
types = loss_summary['Type'].unique()

best_types_per_problem = {}
for prob in problems:
    sig_bests = ttest_df[(ttest_df["Problem"] == prob) & (ttest_df["Best_Better"] == True)]
    if not sig_bests.empty:
        best_type = sig_bests.iloc[0]["Best_Type"]
    else:
        best_type = loss_summary[loss_summary["Problem"] == prob].sort_values("R^2", ascending=False).iloc[0]["Type"]
    best_types_per_problem[prob] = best_type

header = r"""\centering
\caption{Best ccuracy on real-world datasets (mean $\pm$ std, $n$ = """ + str(int(n)) + r""")}\label{tab:bestclassification}
\begin{tabular}{
  l""" + "  " + "  ".join(["S[table-format=1.3(2)]" for _ in types]) + r"""}
\toprule
\textbf{Problem} & """ + " & ".join([f"{{{models_dict[t]}}}" for t in types]) + r""" \\
\midrule
"""

rows = []
for prob in problems:
    row = [problems_dict[prob]]
    for t in types:
        entry = loss_summary[(loss_summary['Problem'] == prob) & (loss_summary['Type'] == t)]
        if not entry.empty:
            mean = entry['Mean_accuracy'].values[0]
            sem = entry['Std_accuracy'].values[0]
            formatted = format_mean_sem(mean, sem)
            if best_types_per_problem[prob] == t:
                formatted = r"\cellcolor{yellow!30}{\num{" + formatted + "}}"
            else:
                prob_Df = ttest_df[ttest_df.Problem == prob]
                if prob_Df[prob_Df.Compared_Type == t].Best_Better.all() == False:
                    formatted = r"\cellcolor{yellow!30}{\num{" + formatted + "}}"
                else:
                    formatted = r"\num{" + formatted + "}"
        else:
            formatted = "-"
        row.append(formatted)
    rows.append(" & ".join(row) + r" \\")

footer = r"""\bottomrule
\end{tabular}
"""

latex_table = header + "\n".join(rows) + "\n" + footer


In [30]:
with open("best_classification.txt", "w") as text_file:
    text_file.write(latex_table)