In [2]:
from scipy.stats import entropy
import numpy as np
import pandas as pd
import plotly.express as px

# Route pandas' built-in `.plot` accessor through plotly instead of the default backend.
pd.set_option("plotting.backend", "plotly")

In [3]:
# Read the logbook, which is stored in pandas' "table" JSON layout.
logbook_file = "SeapoPym_HOT_climato_obs_npp_opti_all_parameters_2_groups_logbook.json"
data = pd.read_json(logbook_file, orient="table")

# Keep every column label except the last two.
# NOTE(review): presumably the last two columns are not model parameters — confirm against the logbook schema.
var = data.columns[:-2]

In [4]:
# Turn the index level(s) into ordinary columns so they can be filtered on below.
# Not idempotent: running this cell more than once keeps inserting the previous
# index as an extra column.
data = data.reset_index(drop=False)

In [5]:
# Display the column labels stored in `var`.
var

Index(['D1N1_energy_coefficient', 'D1N1_tr_max', 'D1N1_tr_rate',
       'D1N1_inv_lambda_max', 'D1N1_inv_lambda_rate',
       'D2N1_energy_coefficient', 'D2N1_tr_max', 'D2N1_tr_rate',
       'D2N1_inv_lambda_max', 'D2N1_inv_lambda_rate'],
      dtype='object')

In [6]:
def compute_shannon_entropy(p, bins=10):
    """Return the Shannon entropy (in nats) of a histogram of the sample ``p``.

    The sample is binned with ``np.histogram`` and the share of observations
    falling in each bin is used as a discrete probability distribution.
    A value of 0 means every observation falls in the same bin; the maximum,
    ``log(number of bins)``, is reached when all bins are equally populated.

    Parameters
    ----------
    p : array-like
        One-dimensional sample of observations.
    bins : int, str or sequence of scalars, default 10
        Binning specification forwarded unchanged to ``np.histogram``.

    Returns
    -------
    float
        Entropy in nats, or ``nan`` when no observation falls inside the bins
        (for instance when the sample is empty).
    """
    # Bin *counts* (not densities) are used: dividing densities by their sum
    # only gives probabilities when all bins have the same width.
    counts, _ = np.histogram(p, bins=bins)
    if counts.sum() == 0:
        return float("nan")

    # `entropy` normalises its input and treats empty bins as contributing 0,
    # so no additive smoothing is needed.
    return entropy(counts)


# Shannon entropy of every parameter column, computed separately for each
# generation (rows: generations, columns: parameters).
entropy_by_generation = pd.DataFrame(
    {
        generation: {name: compute_shannon_entropy(data_gen[name]) for name in var}
        for generation, data_gen in data.groupby("generation", sort=False)
    }
).T

# Long format (one row per variable/generation pair), as expected by plotly express.
entropies = (
    entropy_by_generation.rename_axis(index="generation", columns="variable")
    .unstack()
    .rename("entropy")
    .reset_index()
)

px.area(
    entropies,
    x="generation",
    y="entropy",
    color="variable",
    line_group="variable",
    title="Shannon entropy of parameter distributions",
    # Keys must be column names of `entropies`, otherwise the labels are ignored.
    labels={"generation": "Generation", "entropy": "Shannon entropy"},
    markers=True,
).update_layout(xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor="rgba(0, 0, 0, 0)")

In [7]:
def compute_kl_divergence(p, q, bins=None):
    """Return the Kullback-Leibler divergence KL(p || q), in nats, of two samples.

    Both samples are binned on the same edges, which are derived from ``p``.
    A value close to 0 means the two binned distributions are similar.

    Parameters
    ----------
    p : array-like
        Reference sample; it determines the bin edges.
    q : array-like
        Sample compared against the reference. Observations of ``q`` lying
        outside the range of the bin edges are not counted.
    bins : int, str or sequence of scalars, optional
        Binning specification forwarded to ``np.histogram``. When ``None``,
        numpy's Freedman-Diaconis estimator (``"fd"``) is used.

    Returns
    -------
    float
        KL divergence in nats, or ``nan`` when either sample has no
        observation inside the bins.
    """
    if bins is None:
        # `np.histogram` rejects `bins=None`; fall back to the Freedman-Diaconis rule.
        bins = "fd"

    counts_p, bin_edges = np.histogram(p, bins=bins)
    counts_q, _ = np.histogram(q, bins=bin_edges)

    total_p = counts_p.sum()
    total_q = counts_q.sum()
    if total_p == 0 or total_q == 0:
        return float("nan")

    # Smooth the probability masses (not the densities) so that empty bins do
    # not produce an infinite divergence and the result does not depend on the
    # units in which the samples are expressed.
    smoothing = 1e-10
    mass_p = counts_p / total_p + smoothing
    mass_q = counts_q / total_q + smoothing

    # `entropy(pk, qk)` renormalises both inputs and returns sum(pk * log(pk / qk)).
    return entropy(mass_p, mass_q)  # Base e (nats) by default


# Split the logbook once per generation (sorted by generation id).
frames_by_generation = dict(tuple(data.groupby("generation")))
generations = list(frames_by_generation)

# KL divergence of each generation against the one just before it, per parameter.
# Pairing neighbours in the sorted list avoids assuming that generation ids are
# consecutive integers.
kl_by_generation = pd.DataFrame(
    {
        current: {
            name: compute_kl_divergence(
                p=frames_by_generation[previous][name],
                q=frames_by_generation[current][name],
                bins=50,
            )
            for name in var
        }
        for previous, current in zip(generations[:-1], generations[1:])
    }
).T

# Long format (one row per variable/generation pair), as expected by plotly express.
kls = (
    kl_by_generation.rename_axis(index="generation", columns="variable")
    .unstack()
    .rename("KLS")
    .reset_index()
)

px.area(
    kls,
    x="generation",
    y="KLS",
    color="variable",
    line_group="variable",
    title="KLS of parameter distributions",
    # Keys must be column names of `kls`, otherwise the labels are ignored.
    labels={"generation": "Generation", "KLS": "KLS"},
    markers=True,
    # add information to hover
    hover_name="variable",
    hover_data={"generation": True, "KLS": True},
).update_layout(xaxis_showgrid=False, yaxis_showgrid=False, plot_bgcolor="rgba(0, 0, 0, 0)")

In [8]:
import numpy as np
from scipy.spatial.distance import pdist

In [9]:
# Condensed vector of pairwise distances (pdist's default metric) between the
# first 1000 rows, restricted to the columns listed in `var`.
pdist(data.loc[:, var].head(1000))

array([75.05179334, 52.60391533, 90.37442297, ..., 56.73896531,
       35.03224499, 52.89119086])

In [10]:
import matplotlib.pyplot as plt

# TODO :

-   Normaliser les jeux de paramètres pour obtenir une distance euclidienne standard ?


In [29]:
# One column per generation holding all pairwise distances between the rows of
# that generation (parameter columns only), then summarised with `describe()`.
distance_columns = [
    pd.Series(pdist(generation_rows[var]), name=f"Generation {generation}")
    for generation, generation_rows in data.groupby("generation", sort=False)
]
euclidian = pd.concat(distance_columns, axis=1).describe()
euclidian

Unnamed: 0,Generation 0,Generation 1,Generation 2,Generation 3,Generation 4,Generation 5,Generation 6,Generation 7,Generation 8,Generation 9,Generation 10,Generation 11,Generation 12,Generation 13,Generation 14,Generation 15,Generation 16,Generation 17,Generation 18,Generation 19
count,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0,499500.0
mean,59.930783,59.147512,59.050453,58.081305,58.675119,58.395335,56.609572,55.209361,54.893755,54.258019,54.479612,54.746724,55.728734,55.15618,54.361619,54.168887,52.5061,49.168351,46.080871,44.789302
std,21.934844,21.892155,21.937593,21.804796,22.417573,22.688108,21.966086,21.859011,21.94701,21.938507,22.356538,22.452352,22.385166,22.163174,22.153067,22.290702,22.75418,22.267464,21.907366,21.987841
min,2.108975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,43.548182,42.816112,42.640479,41.773966,41.804527,41.274318,40.051077,38.869578,38.637439,37.847607,37.882459,37.864118,39.061473,38.866842,37.957521,37.567198,34.978411,31.247353,27.600301,26.179326
50%,58.687939,57.74625,57.757548,56.691331,57.156934,56.632156,54.888086,53.149294,52.60955,52.049038,52.104522,52.655829,53.888786,53.452733,52.968072,53.050225,52.138104,49.231915,45.399836,43.272395
75%,75.585495,74.657339,74.623464,73.424544,74.714677,74.393983,72.261703,70.495813,70.254052,69.650615,69.789382,70.187726,71.220478,70.267485,69.535182,69.921702,69.330982,66.038004,62.876355,62.138254
max,144.581931,144.192904,144.360336,141.83484,141.290384,142.67528,136.490728,136.268955,137.802017,136.194843,137.494585,137.324277,143.15751,134.745606,132.646267,132.741202,134.635265,132.040377,128.25611,118.053436


In [35]:
import plotly.graph_objects as go

# Box plot drawn from the precomputed summary rows of `euclidian`: the box spans
# the quartiles and the whiskers reach the minimum and maximum of each column.
fig = go.Figure(
    data=[
        go.Box(
            lowerfence=euclidian.loc["min"],
            q1=euclidian.loc["25%"],
            median=euclidian.loc["50%"],
            q3=euclidian.loc["75%"],
            upperfence=euclidian.loc["max"],
        )
    ]
)

# Title and axis titles set in a single layout update.
fig.update_layout(
    title="Euclidian distance of parameter distributions",
    xaxis_title="Generation",
    yaxis_title="Euclidian distance",
)

fig.show()