# Intro

The goal of the work below is to study what results can be obtained with the co-occurrence learning method across several data and model sizes.

In [1]:
import json
import pandas as pd
import numpy as np

from eval_utils import *

In [2]:
# Experiment grid of (s, T, n) triples; each triple is passed on as `dsize`.
# NOTE(review): presumably s = number of sequences, T = sequence length and
# n = model size -- confirm against eval_utils.init_experiment.
data_sizes = [
    (100, T, n)
    for n, T_values in (
        (4, (40, 400, 4000)),
        (8, (40, 400, 1000, 4000)),
        (12, (40, 400, 4000)),
        (20, (40, 400, 4000)),
        (50, (40, 400, 4000, 40000)),
        (100, (40, 400, 4000, 40000)),
    )
    for T in T_values
]
# The only configuration with s != 100 closes the grid.
data_sizes.append((1000, 10000, 100))

In [3]:
def calculate_metrics(dsize):
    """Run one experiment for `dsize` and compare two co-occurrence estimates.

    A fresh experiment is created with ``init_experiment(dsize=dsize,
    simple_model=True)``, the observations are discretized, and the result of
    ``empirical_cooc_prob`` is compared with the result of
    ``normal_cooc_prob``.

    Parameters
    ----------
    dsize : tuple
        An ``(s, T, n)`` triple forwarded unchanged to ``init_experiment``.

    Returns
    -------
    dict
        Keys ``s``, ``T``, ``n`` (as returned by ``init_experiment``) and
        ``loss`` (sum of squared differences), ``dtv`` (value returned by the
        ``dtv`` helper), ``MAPE`` (mean absolute error relative to
        ``omega_gt``, as a fraction, not a percentage) and ``MAE`` (mean
        absolute difference).
    """
    # Only the values used below are named; the rest of the tuple is ignored.
    s, T, n, _, A, mu, sigma, _, _, _, _, Y_true, lengths, _, _ = init_experiment(
        dsize=dsize, simple_model=True
    )

    # Discretization thresholds: midpoints between consecutive entries of the
    # flattened `mu`, closed with +inf so the last threshold is never exceeded.
    # NOTE(review): this presumably relies on `mu` being sorted in ascending
    # order -- confirm against init_experiment.
    # `np.inf` replaces the `np.infty` alias, which was removed in NumPy 2.0.
    centers = mu.reshape(-1)
    nodes = np.concatenate([(centers[1:] + centers[:-1]) / 2, np.array([np.inf])])

    # Bin index of each observation = number of thresholds it strictly exceeds.
    # NOTE(review): assumes Y_true broadcasts against a (1, len(nodes)) row,
    # e.g. a column of shape (N, 1) -- confirm.
    Y_disc = (Y_true > nodes.reshape(1, -1)).sum(axis=-1).reshape(-1, 1)

    omega_emp = empirical_cooc_prob(Y_disc, n, lengths)
    omega_gt = normal_cooc_prob(mu, sigma, nodes, A)

    metrics = dict(
        s=s,
        T=T,
        n=n,
        loss=np.square((omega_emp - omega_gt)).sum(),
        dtv=dtv(omega_emp, omega_gt),
        # NOTE(review): becomes inf/nan if any entry of omega_gt is zero.
        MAPE=abs((omega_gt - omega_emp) / omega_gt).mean(),
        MAE=abs(omega_emp - omega_gt).mean(),
    )
    return metrics

def estimate_metrics(dsize, n_runs=100):
    """Repeat `calculate_metrics` and summarize the runs column by column.

    Parameters
    ----------
    dsize : tuple
        An ``(s, T, n)`` triple forwarded to ``calculate_metrics``.
    n_runs : int, optional
        Number of independent repetitions. Defaults to 100, the value that
        was previously hard-coded.

    Returns
    -------
    tuple of pandas.Series
        ``(mean, std)`` over the runs, indexed by metric name. With a single
        run the standard deviation is NaN (pandas uses the sample estimator).

    Raises
    ------
    ValueError
        If `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be a positive integer")
    runs = pd.DataFrame([calculate_metrics(dsize) for _ in range(n_runs)])
    return runs.mean(axis=0), runs.std(axis=0)

In [4]:
def present_result(dsize):
    """Estimate the metrics for one configuration and print them as LaTeX.

    Parameters
    ----------
    dsize : tuple
        An ``(s, T, n)`` triple forwarded to ``estimate_metrics``.

    Returns
    -------
    tuple
        ``(represent, result)``: ``represent`` is a dict holding the integer
        sizes and the metrics formatted as ``"mean +/- std"`` strings;
        ``result`` is the raw ``(mean, std)`` pair from ``estimate_metrics``.

    Side effects
    ------------
    Prints a one-row LaTeX table for this configuration.
    """
    result = estimate_metrics(dsize)
    mean, std = result

    def plus_minus(center, spread, digits):
        # Render "mean +/- std" with both numbers rounded to `digits` places.
        return f'{round(center, digits)} +/- {round(spread, digits)}'

    represent = {
        "s": int(mean["s"]),
        "T": int(mean["T"]),
        "n": int(mean["n"]),
        "loss": plus_minus(mean["loss"], std["loss"], 3),
        "dtv": plus_minus(mean["dtv"], std["dtv"], 3),
        # MAPE is stored as a fraction; it is shown as a percentage.
        "MAPE": plus_minus(mean["MAPE"] * 100, std["MAPE"] * 100, 1),
        "MAE": plus_minus(mean["MAE"], std["MAE"], 3),
    }

    latex_row = pd.DataFrame([represent]).style.to_latex()
    print(latex_row)
    return represent, result

In [5]:
def present_all_results(data_sizes):
    """Evaluate every configuration and show one combined summary table.

    Parameters
    ----------
    data_sizes : iterable of tuple
        ``(s, T, n)`` triples; each one is passed to ``present_result``.

    Returns
    -------
    list
        The ``present_result`` return values, in the order of `data_sizes`.

    Side effects
    ------------
    Displays a DataFrame with one formatted row per configuration (after
    all configurations have been evaluated).
    """
    collected = []
    for size in data_sizes:
        collected.append(present_result(size))

    # Only the formatted part of each pair goes into the overview table.
    summary_rows = [formatted for formatted, _ in collected]
    display(pd.DataFrame(summary_rows))
    return collected

In [None]:
# Run the whole sweep over `data_sizes`; the returned list is kept so the
# summary table can be rebuilt later without re-running the experiments.
all_results = present_all_results(data_sizes)

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 40 & 4 & 0.0 +/- 0.0 & 0.005 +/- 0.002 & 11.5 +/- 6.5 & 0.003 +/- 0.001 \\
\end{tabular}

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 400 & 4 & 0.0 +/- 0.0 & 0.002 +/- 0.001 & 3.3 +/- 1.2 & 0.001 +/- 0.0 \\
\end{tabular}

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 4000 & 4 & 0.0 +/- 0.0 & 0.001 +/- 0.0 & 1.0 +/- 0.3 & 0.0 +/- 0.0 \\
\end{tabular}

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 40 & 8 & 0.0 +/- 0.0 & 0.005 +/- 0.001 & 20.7 +/- 3.1 & 0.001 +/- 0.0 \\
\end{tabular}

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 400 & 8 & 0.0 +/- 0.0 & 0.002 +/- 0.0 & 6.7 +/- 1.2 & 0.0 +/- 0.0 \\
\end{tabular}

\begin{tabular}{lrrrllll}
 & s & T & n & loss & dtv & MAPE & MAE \\
0 & 100 & 1000 & 8 & 0.0 +/- 0.0 & 0.001 +/- 0.0 & 4.2 +/- 0.6 & 0.0 +/- 0.0 \\
\end{tabular}

\begin{tabul

In [None]:
# Export the combined summary table (the first element of each entry in
# `all_results`) as LaTeX. The encoding is given explicitly so the file
# does not depend on the platform's locale default.
with open("co-occurrence_expectations.txt", "w", encoding="utf-8") as f:
    f.write(pd.DataFrame([r[0] for r in all_results]).style.to_latex())