In [184]:
import json
import math
import networkx as nx
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import distfit
import os

from generator_v2 import Generator

In [4]:
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [5]:
COLOR = "#99d8c9"

In [6]:
alt.__version__

'5.1.2'

In [7]:
# define the theme by returning the dictionary of configurations

font = "monospace"
# Axes
axisColor = "#000000"
gridColor = "#DEDDDD"
# Colors
main_palette = ["#1696d2", 
                "#d2d2d2",
                "#000000", 
                "#fdbf11", 
                "#ec008b", 
                "#55b748", 
                "#5c5859", 
                "#db2b27", 
               ]
sequential_palette = ["black", 
                      "#a2d4ec", 
                      "#73bfe2", 
                      "#46abdb", 
                      "#1696d2", 
                      "#12719e", 
                     ]


# markColor = "#8856a7"
# markColor = "#6a51a3"
markColor = "#9ebcda"

# Define custom color schemes for ordinal and categorical data
categorical_color_scheme = ['red', 'green', 'blue', 'purple', 'orange']
ordinal_color_scheme = ["#fcfbfd",
"#efedf5",
"#dadaeb",
"#bcbddc",
"#9e9ac8",
"#807dba",
"#6a51a3",
"#4a1486"]


def simple_theme():
    return {
        'config': {
            'view': {
                'height': 300,
                'width': 300,
            },
            "title": {
                    "fontSize": 18,
                    "font": font,
                    "anchor": "start", # equivalent of left-aligned.
                    "fontColor": "#000000"
                },
            "facet": {
                "labelFontSize": 16,
                "titleFontSize": 20  # Adjust the fontsize as needed
            },
            "axisX": {
                    "labelFont": font,
                    "labelFontSize": 16,
                    "titleFontSize": 20,
                    "grid": False,
                },
                "axisY": {
                    "domain": False,
                    "grid": False,
                    "labelFont": font,
                    "labelFontSize": 16,
                    "titleFontSize": 20,
                },
            'mark': {
                'color': markColor,
                'fill': markColor
            },
            "line": {
                "fill": None,
               "stroke": "black",
           },
            # "range": {
            #         "category": categorical_color_scheme,
            #         "ordinal": ordinal_color_scheme,
            #         "diverging": sequential_palette,
            #     },
            "scale": {"color": {"scheme": "blue"}},
            "legend": {

                "titleFontSize": 20,
                "labelFontSize": 16
            }            
        }
    }

In [8]:
# alt.themes.register("theme", theme)
alt.themes.register("theme", simple_theme)
alt.themes.enable("theme")

ThemeRegistry.enable('theme')

In [9]:
# Set the ggplot2 theme
#alt.themes.enable("ggplot2")

In [10]:
alt.themes

ThemeRegistry(active='theme', registered=['dark', 'default', 'excel', 'fivethirtyeight', 'ggplot2', 'googlecharts', 'latimes', 'none', 'opaque', 'powerbi', 'quartz', 'theme', 'urbaninstitute', 'vox'])

In [11]:
# Essayer avec 1000 pour voir max(n1, n2) / n1 + n2"
N = 1000
# N_edges = N // 2
# Tester plusierus valeurs d'aretes (50, 100, 200), pour la fraction des hyperaretes
N_edges = 200

# Last value is the one used for most simulations
# N_EDGES = [50, 100, 200, 500, 300]
# N_EDGES = [300]
# N_EDGES = [100, 200, 300, 500]
Ns = [200, 300, 500, 700, 1000, 2000]


N_coms = 4
sampling_strat = "weighted"
# sampling_strat = "max"
p = 30/N
q = 3/N

In [12]:
community_array = [0 for x in range(N//2)]  + [1 for x in range(N//2)]

In [13]:
len(community_array)

1000

# Node Degree Multi Simulation

In [16]:
N_sim = 20

In [17]:
df_sim = pd.DataFrame(columns=["count", "simNumber"], dtype=int)
for i in range(N_sim):
    gen = Generator(N, N // 2, N_coms, 20/N, 3/N, community_array, sampling_strat)
    gen.run()
    degrees = dict(gen.degrees()).values()
    degrees_df = pd.DataFrame(degrees, columns=["degree"])

    countdf = degrees_df.groupby(['degree'])['degree'].count()
    countdf = countdf.to_frame().rename(columns={"degree": "count"})
    countdf["simNumber"] = i
    
    df_sim = pd.concat([df_sim, countdf])

KeyboardInterrupt: 

In [None]:
df_sim = df_sim.reset_index(names="degree")
# df_sim.rename(columns={}

In [None]:
df_sim

In [None]:
bars = alt.Chart(df_sim).mark_bar(color=COLOR).encode(
    alt.X("degree:Q", scale=alt.Scale(domain=[0, 18])),
    alt.Y("mean(count):Q"),
)

In [None]:
error = alt.Chart(df_sim).mark_errorbar(extent="ci", rule=True).encode(
    x=alt.X("degree:Q", scale=alt.Scale(domain=[0, 18])),
    y=alt.Y(
        "count:Q",
        scale=alt.Scale(zero=False),
        title="Absolute Frequency"
    ),
)

In [None]:
bars + error

# Hyperedge Size

## Hyperedge Sim

In [None]:
df_sim = pd.DataFrame(columns=["count", "simNumber"], dtype=int)
for i in range(N_sim):
    gen = Generator(N, N_edges, N_coms, p, q, community_array, sampling_strat)
    gen.run()
    hsizes = dict(gen.hyperedge_sizes()).values()
    hsizes_df = pd.DataFrame(hsizes, columns=["hsize"])

    countdf = hsizes_df.groupby(['hsize'])['hsize'].count()
    countdf = countdf.to_frame().rename(columns={"hsize": "count"})
    countdf["simNumber"] = i
    
    df_sim = pd.concat([df_sim, countdf])

In [None]:
df_sim = df_sim.reset_index(names="hsize")

In [None]:
df_sim.head()

In [None]:
bars = alt.Chart(df_sim).mark_bar(color=COLOR).encode(
    alt.X("hsize:Q", scale=alt.Scale(domain=[0, 25]), title="Hyperedge Size"),
    alt.Y("mean(count):Q"),
)

error = alt.Chart(df_sim).mark_errorbar(extent="ci", rule=True).encode(
    x=alt.X("hsize:Q", scale=alt.Scale(domain=[0, 25])),
    y=alt.Y(
        "count:Q",
        scale=alt.Scale(zero=False),
        title="Absolute Frequency"
    ),
)

In [None]:
bars + error

# Fraction Dist

In [65]:
# Faire pareil sur 100/1000 réseaux
# Accumuler la liste des degrées/hsize
# Une value de p/q => plusieurs simulations => faire le fit la dessus

In [66]:
p_init = 30 / N
q_init = 30 / N

In [67]:
q_init

0.03

In [68]:
def fit_function(x, n, p):
    return stats.binom.pmf(x, n, p)

In [69]:
qs = np.arange(0, 1.01, 0.10)
# qs = np.append(qs, [0.95, 0.99])
qs = np.append(qs, [0.01, 0.05])

In [70]:
qs

array([0.  , 0.1 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 0.9 , 1.  ,
       0.01, 0.05])

In [71]:
fits_N = 1000
fits_strat = "weighted"

## SIMULATIONS HERE

In [None]:
p = p_init
q = q_init

df_degrees = pd.DataFrame(columns=["degree"], dtype=int)
df_hsizes = pd.DataFrame(columns=["hsize"], dtype=int)

df_sim = pd.DataFrame(columns=["sim", "type", "count"], dtype=int)
df_fraction = pd.DataFrame(columns=["sim", "count", "fraction0"], dtype=int)
df_fraction2 = pd.DataFrame(columns=["sim", "value"])

# df_peff = pd.DataFrame(columns=["peff", "p", "q", "q_frac"], dtype=int)

df_fits = pd.DataFrame(columns=["peff", "Neff", "p_hsize_eff", "n_eff_hsize", "p", "q", "q_frac"], dtype=int)

# Same as df_fits but other format so easier altair plots 
df_fits2 = pd.DataFrame()

increment = 0.05
N_sim = int(1 / increment) + 1
q_frac_order = []

# Prendre 100 configs
# N_graphs = 5
# N_graphs = 50
N_graphs = 100
# N_graphs = 3

q_to_degreefit = {}
q_to_hsizefit = {}

# for i in range(N_sim):
for q_frac in qs:
    # q = round(q_init - (p * increment * i), 4)
    q = round(p * q_frac,  4)
    print(p, q)
    
    # q_frac = f"{round(1 - (increment * (i)), 3)}p"
    q_frac_order.append(q_frac) 

    for strat in ["weighted", "max", "min", "frequent"]:
        for n in Ns:
        # for n in [2000]:
            all_degrees = []
            all_hsizes = []  
    
            community_array = []
            for i in range(N_coms):
                community_array += [i for x in range(n // N_coms)]
            
            for n_graph in range(N_graphs):
                # gen = Generator(N, N_edges, N_coms, p, q, community_array, sampling_strat)
                gen = Generator(n, N_edges, N_coms, p, q, community_array, strat)
                gen.run()
                
                # comp = gen.hyperedges_types()
                # n_pure = comp.count("pure")
                # n_mixed = comp.count("mixed")
                # df = pd.DataFrame({"sim": [i, i], "q": [q_frac, q_frac], "type": ["pure", "mixed"], "count": [n_pure, n_mixed]})
                # df_sim = pd.concat([df_sim, df])
            
            #     For fraction distribution of mixed edges
                # comp = gen.mixed_he_fraction_to_count()
                # for fraction, count in comp.items():
                #     df = pd.DataFrame({"sim": [i], "q": [q_frac], "count": [count], "fraction0": fraction})
                #     df_fraction = pd.concat([df_fraction, df])
                    
                #     For com distrib of hyperedges
                comp = gen.hyperedges_nmax()
                ginis = gen.ginis()
                
                # for fraction in comp:
                df = pd.DataFrame({"q": [q] * len(comp), "value": comp, "gini": ginis, "N_nodes": [n] * len(comp), "strat": [strat] * len(comp)})
                df_fraction2 = pd.concat([df_fraction2, df])
                
                # for peffectif computation
                if n == fits_N and strat == fits_strat:
                    degrees = dict(gen.degrees()).values()
                    hsizes = dict(gen.hyperedge_sizes()).values()
                    
                    all_degrees = all_degrees + list(degrees)
                    all_hsizes = all_hsizes + list(hsizes)
        

            # DO THE DISTRIBUTIONS AND FIT ONLY FOR THOSE VALUES
            if n == fits_N and strat == fits_strat:
                df = pd.DataFrame({"degree": all_degrees, "q": q_frac, "q / p": q / p, "N_nodes": n})
                df_degrees = pd.concat([df_degrees, df])
                                      
                df = pd.DataFrame({"hsize": all_hsizes, "q": q_frac, "q / p": q / p, "N_nodes": n})
                df_hsizes = pd.concat([df_hsizes, df])
    
                # Distrib of degree
                # Bounds of n and p effectif
                bounds = [(0, N_edges), (0, p)]
                
                dist = stats.binom
                res = stats.fit(dist, all_degrees, bounds)
                
                peff = res.params[1]
                neff = res.params[0]
                
                bounds2 = [(N_edges, N_edges), (0, p)]
                res = stats.fit(dist, all_degrees, bounds2)
                peffNfixed = res.params[1]
                neffNfixed = res.params[0]
                
                bounds3 = [(0, N_edges), (p, p)]
                res = stats.fit(dist, all_degrees, bounds3)
                peffpfixed = res.params[1]
                neffpfixed = res.params[0]
                
                df_notfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peff], "Neff": [neff], "bounds": ["not fixed"], "measure": ["degree"]})
                df_pfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peffNfixed], "Neff": [neffNfixed], "bounds": ["n Fixed"], "measure": ["degree"]})
                df_nfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peffpfixed], "Neff": [neffpfixed], "bounds": ["p Fixed"], "measure": ["degree"]})
                df_fits2 = pd.concat([df_fits2, df_notfixed, df_pfixed, df_nfixed])
                
                q_to_degreefit[q] = res
                # print(res.nllf(), res.success)
                # print(neff, peff)
                
                # mean = np.mean(all_degrees)
                # variance = np.var(all_degrees)
                
                # pForm = 1 - (variance / mean)
                # NForm = mean / (1 -  (variance / mean))
                # print("compute", NForm, pForm)
                
                # Distrib of hsizes
                dist = stats.binom
                
                # bounds = [(N / N_coms, N), (q, p)]
                bounds = [(N / N_coms, N), (0, p)]
                res = stats.fit(dist, all_hsizes, bounds)
                peffhsize = res.params[1]
                neff_hsize = res.params[0]
                q_to_hsizefit[q] = res
                
                bounds = [(N / N_coms, N), (p, p)]
                res = stats.fit(dist, all_hsizes, bounds)
                peff_hsize_pfixed = res.params[1]
                neff_hsize_pfixed = res.params[0]
                
                bounds = [(N, N), (0, p)]
                res = stats.fit(dist, all_hsizes, bounds)
                peff_hsize_nfixed = res.params[1]
                neff_hsize_nfixed = res.params[0]
                
                df = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peff], "Neff": [neff], "peffpfixed": [peffpfixed], "Neffpfixed": [neffpfixed], "peffnfixed": [peffNfixed], "Neffnfixed": [neffNfixed], "p_hsize_eff": [peffhsize], "n_eff_hsize": [neff_hsize]})
                df_fits = pd.concat([df_fits, df])
                # df_peff = pd.concat([df_peff, df])
                
                df_notfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peffhsize], "Neff": [neff_hsize], "bounds": ["not fixed"], "measure": ["hsize"]})
                df_pfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peff_hsize_nfixed], "Neff": [neff_hsize_nfixed], "bounds": ["n Fixed"], "measure": ["hsize"]})
                df_nfixed = pd.DataFrame({"p": [p], "q": [q], "q_frac": [q_frac], "peff": [peff_hsize_pfixed], "Neff": [neff_hsize_pfixed], "bounds": ["p Fixed"], "measure": ["hsize"]})
                df_fits2 = pd.concat([df_fits2, df_notfixed, df_pfixed, df_nfixed])

In [72]:
fn = "he_distrib.csv"
if not os.path.exists(fn):
    df_fraction2.to_csv(fn, sep=',', index=False, encoding='utf-8')
else:
    df_fraction2 = pd.read_csv(fn)

In [73]:
df_fraction2["q / p"] = df_fraction2["q"] / p
df_fraction2["N / E"] = df_fraction2["N_nodes"] / N_edges

In [74]:
df_fraction2 = df_fraction2.drop(["sim"], axis=1)
df_fraction2 = df_fraction2.drop("value", axis=1)
df_fraction2 = df_fraction2.drop("N_nodes", axis=1)

In [75]:
fn = f"hsize_distrib_{fits_N}_{fits_strat}.csv"
if not os.path.exists(fn):
    df_hsizes.to_csv(fn, sep=',', index=False, encoding='utf-8')
else:
    df_hsizes = pd.read_csv(fn)

In [76]:
fn = f"degree_distrib_{fits_N}_{fits_strat}.csv"
if not os.path.exists(fn):
    df_degrees.to_csv(fn, sep=',', index=False, encoding='utf-8')
else:
    df_degrees = pd.read_csv(fn)

In [77]:
fn = f"fits1_{fits_N}_{fits_strat}.csv"
if not os.path.exists(fn):
    df_fits.to_csv(fn, sep=',', index=False, encoding='utf-8')
else:
    df_fits = pd.read_csv(fn)

In [78]:
fn = f"fits2_{fits_N}_{fits_strat}.csv"
if not os.path.exists(fn):
    df_fits2.to_csv(fn, sep=',', index=False, encoding='utf-8')
else:
    df_fits2 = pd.read_csv(fn)

## Group by

In [79]:
df_degrees = df_degrees[df_degrees["N_nodes"] == 1000]

In [80]:
df_degrees.head()

Unnamed: 0,degree,q,q / p,N_nodes
0,7,0.0,0.0,1000.0
1,0,0.0,0.0,1000.0
2,3,0.0,0.0,1000.0
3,2,0.0,0.0,1000.0
4,0,0.0,0.0,1000.0


In [81]:
df_fits = df_fits.round({'q_frac': 3})
df_fits.q_frac

0     0.00
1     0.10
2     0.20
3     0.30
4     0.40
5     0.50
6     0.60
7     0.70
8     0.80
9     0.90
10    1.00
11    0.01
12    0.05
Name: q_frac, dtype: float64

In [82]:
grouped = df_degrees.groupby('q / p')['degree'].value_counts(normalize=True).reset_index(name='Percentage')

In [83]:
grouped = grouped.round({'q / p': 2})

In [84]:
grouped['binomial'] = 0  # Initialize the column

for q in qs:
    q = round(q, 3)
    # print(q)
    df = df_degrees[df_degrees["q"] == f"{q}p"]
    # fit = q_to_degreefit[round(q * p, 3)]
    
    # Calculate the binomial PDF values for a range of x values
    x_values = range(19)  # Assuming a range from 0 to 10
    # binomial_pmf = [stats.binom.pmf(x, fit.params[0], fit.params[1]) for x in x_values]
    # fit_values = df_fits[df_fits["q_frac"] == f"{q}p"]
    # binomial_pmf2 = [stats.binom.pmf(x, fit_values.Neffpfixed[0], fit_values.peffpfixed[0]) for x in x_values]
    # binomial_pmf3 = [stats.binom.pmf(x, fit_values.Neffnfixed[0], fit_values.peffnfixed[0]) for x in x_values]

    # fit_values = df_fits[df_fits["q_frac"] == f"{q}p"]
    fit_values = df_fits[df_fits["q_frac"] == q]
    # print(fit_values.Neffpfixed)
    # print(fit_values.Neffpfixed.iloc[0])
    for x in x_values:
        binom_pdf = stats.binom.pmf(x, fit_values.Neffpfixed.iloc[0], fit_values.peffpfixed.iloc[0])
        # grouped.loc[(grouped['degree'] == x) & (grouped['q'] == f"{q}p"), 'binomial'] = binom_pdf
        grouped.loc[(grouped['degree'] == x) & (grouped['q / p'] == q), 'binomial'] = binom_pdf        

In [85]:
grouped

Unnamed: 0,q / p,degree,Percentage,binomial
0,0.0,1,0.31566,0.318535
1,0.0,2,0.26100,0.265993
2,0.0,0,0.19388,0.187260
3,0.0,3,0.14599,0.145337
4,0.0,4,0.05832,0.058434
...,...,...,...,...
190,1.0,15,0.00064,0.000750
191,1.0,16,0.00024,0.000268
192,1.0,17,0.00005,0.000090
193,1.0,18,0.00003,0.000028


In [86]:
# grouped_selp_deg = grouped[grouped.q.isin(["0.0p", "0.3p", "0.7p", "1.0p"])]
grouped_selp_deg = grouped[grouped["q / p"].isin([0, 0.3, 0.7, 1])]

In [87]:
grouped_selp_deg

Unnamed: 0,q / p,degree,Percentage,binomial
0,0.0,1,0.31566,0.318535
1,0.0,2,0.26100,0.265993
2,0.0,0,0.19388,0.187260
3,0.0,3,0.14599,0.145337
4,0.0,4,0.05832,0.058434
...,...,...,...,...
190,1.0,15,0.00064,0.000750
191,1.0,16,0.00024,0.000268
192,1.0,17,0.00005,0.000090
193,1.0,18,0.00003,0.000028


In [152]:
pdf = alt.Chart(grouped_selp_deg).mark_line(width=1, opacity=0.7, fill=None, color="black").encode(
# pdf = alt.Chart(grouped_selp_deg).mark_line(width=1, opacity=0.7, color="darkgrey", fill=None).encode(
        x=alt.X('degree:Q', title="k", axis=alt.Axis(labelExpr="datum.value % 4 ? null : datum.label"), scale=alt.Scale(domainMin=0)),
        y=alt.Y('binomial:Q', title="P(k)"),
        # color="q"
        # facet=alt.Facet("q:O")    
)

In [153]:
# distrib = alt.Chart(grouped_selp_deg).mark_bar(width=5,  opacity=0.7, color="black").encode(
distrib = alt.Chart(grouped_selp_deg).mark_bar(width=5,  opacity=0.7).encode(
        x=alt.X('degree:Q', title="k", axis=alt.Axis(tickMinStep=1), scale=alt.Scale(domainMin=0)),
        y=alt.Y('Percentage:Q'),
        # facet=alt.Facet("q:O")
        # color="q"
    )

In [154]:
# alt.layer(pdf, distrib, data=grouped_selp_deg).facet(column='q')
alt.layer(pdf, distrib, data=grouped_selp_deg).facet(
    facet=alt.Facet(
        'q / p',
        header=alt.Header(labelFontSize=18, titleFontSize=20)
    ),
    columns=2
).configure_axisY(
    titleAngle=0,
    titleX=-80,
)


# alt.hconcat(freq_table, background + data_plot, spacing=100).properties(padding=100)

## Hsize

### Group by

In [91]:
df_hsizes = df_hsizes[df_hsizes["N_nodes"] == 1000]

In [92]:
df_fits2 = df_fits2.round({'q_frac': 3})

In [93]:
grouped = df_hsizes.groupby('q / p')['hsize'].value_counts(normalize=True).reset_index(name='Percentage')
grouped

Unnamed: 0,q / p,hsize,Percentage
0,0.0,8,0.15430
1,0.0,7,0.14835
2,0.0,9,0.12910
3,0.0,6,0.12165
4,0.0,10,0.10630
...,...,...,...
405,1.0,11,0.00010
406,1.0,12,0.00010
407,1.0,52,0.00005
408,1.0,54,0.00005


In [94]:
grouped = grouped.round({'q / p': 2})

In [95]:
grouped['binomial'] = 0  # Initialize the column

for q in qs:
    q = round(q, 2)
    # df = df_hsizes[df_hsizes["q"] == f"{q}p"]
    df = df_hsizes[df_hsizes["q"] == q]
    
    # fit_values = df_fits2[(df_fits2["q_frac"] == f"{q}p") & (df_fits2["measure"] == "hsize") & (df_fits2["bounds"] == "p Fixed")]
    fit_values = df_fits2[(df_fits2["q_frac"] == q) & (df_fits2["measure"] == "hsize") & (df_fits2["bounds"] == "p Fixed")]
    
    x_values = range(60)  # Assuming a range from 0 to 10
    for x in x_values:
        binom_pdf = stats.binom.pmf(x, fit_values.Neff, fit_values.peff)
        grouped.loc[(grouped['hsize'] == x) & (grouped['q / p'] == q), 'binomial'] = binom_pdf

In [96]:
# grouped_selp = grouped[grouped.q.isin(["0.0p", "0.3p", "0.7p", "1.0p"])]
grouped_selp = grouped[grouped["q / p"].isin([0, 0.3, 0.7, 1])]

In [97]:
grouped_selp

Unnamed: 0,q / p,hsize,Percentage,binomial
0,0.0,8,0.15430,1.412357e-01
1,0.0,7,0.14835,1.368275e-01
2,0.0,9,0.12910,1.291020e-01
3,0.0,6,0.12165,1.155546e-01
4,0.0,10,0.10630,1.058104e-01
...,...,...,...,...
405,1.0,11,0.00010,3.678851e-05
406,1.0,12,0.00010,9.348833e-05
407,1.0,52,0.00005,5.588254e-05
408,1.0,54,0.00005,1.666132e-05


In [149]:
# pdf = alt.Chart(grouped_selp).mark_line(width=1, opacity=0.7, color="darkgrey").encode(
pdf = alt.Chart(grouped_selp).mark_line(width=1, opacity=0.7, color="black", fill=None).encode(
        # x=alt.X('hsize:O', title="Hyperedge Size", axis=alt.Axis(tickMinStep=10), scale=alt.Scale(domain=list(range(3, 51)), padding=0.1, paddingInner=0.1)),
        x=alt.X('hsize:Q', title="m", axis=alt.Axis(tickMinStep=5)),
        y=alt.Y('binomial:Q', title="P(m)"),
        # color="q"
        # facet=alt.Facet("q:O")    
)

In [150]:
# distrib = alt.Chart(grouped_selp).mark_bar(width=5,  opacity=0.7, color="black").encode(
distrib = alt.Chart(grouped_selp).mark_bar(width=5,  opacity=0.7).encode(
        # x=alt.X('hsize:O', title="Hyperedge Size", axis=alt.Axis(tickMinStep=10), scale=alt.Scale(domain=list(range(3, 51)), padding=0.1, paddingInner=0.1)),
        x=alt.X('hsize:Q', title="m", axis=alt.Axis(tickMinStep=5)),
        y=alt.Y('Percentage:Q'),
        # facet=alt.Facet("q:O")
        # color="q"
    )

In [151]:
alt.layer(pdf, distrib, data=grouped_selp).facet(
    facet=alt.Facet('q / p', header=alt.Header(labelFontSize=18, titleFontSize=20)),  columns=2
).configure_axisY(
    titleAngle=0,
    titleX=-80,
)

## Hyperedge composition

In [101]:
df_fraction2

Unnamed: 0,q,gini,strat,q / p,N / E
0,0.0000,1.000000,weighted,0.00,1.0
1,0.0000,1.000000,weighted,0.00,1.0
2,0.0000,1.000000,weighted,0.00,1.0
3,0.0000,1.000000,weighted,0.00,1.0
4,0.0000,1.000000,weighted,0.00,1.0
...,...,...,...,...,...
6237590,0.0015,0.833333,frequent,0.05,10.0
6237591,0.0015,0.955556,frequent,0.05,10.0
6237592,0.0015,0.925926,frequent,0.05,10.0
6237593,0.0015,0.789474,frequent,0.05,10.0


In [102]:
df_fraction2['strat'] = df_fraction2['strat'].replace(['frequent'], ['majority'])

In [103]:
df_fraction2 = df_fraction2.round({"q / p": 3})

In [166]:
# 1000 nodes
df_fraction_1000 = df_fraction2[df_fraction2["N / E"] == 5]

In [167]:
df_test = df_fraction2[(df_fraction2["N / E"].isin([10])) & (df_fraction2["strat"] == "max")]

In [421]:
# Calculer la variance a la place, tester avec plusieurs valeurs de N, en faisant varié E, N = 200, 500, 1000

error = alt.Chart(df_fraction_1000).mark_errorband(extent="stdev").encode(
    x="q / p",
    y=alt.Y(
        "value:Q",
        # scale=alt.Scale(zero=False),
        # title="max(n1, n2) / n1 + n2",
        title="c"
    ),
)

mean_line = alt.Chart(df_fraction_1000).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean(value)",
        # title="Hyperedge Purity"
        # title="max(n1, n2) / n1 + n2"
        title="c"
    ),
)

error + mean_line

ValueError: Unable to determine data type for the field "mean(value)"; verify that the field name is not misspelled. If you are referencing a field from a transform, also confirm that the data type is specified correctly.

alt.LayerChart(...)

In [None]:
error = alt.Chart(df_fraction_1000).mark_errorband(extent="stdev").encode(
    x="q / p",
    y=alt.Y(
        "gini:Q",
        title="gini"
    ),
)

mean_line = alt.Chart(df_fraction_1000).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean(gini)",
        title="gini"
    ),
)

error + mean_line

In [None]:
error = alt.Chart(df_test).mark_errorband(extent="stdev").encode(
    x="q / p",
    y=alt.Y(
        "gini:Q",
        # scale=alt.Scale(zero=False),
        # title="max(n1, n2) / n1 + n2",
        title="gini"
    ),
)

mean_line = alt.Chart(df_test).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean(gini)",
        # title="Hyperedge Purity"
        # title="max(n1, n2) / n1 + n2"
        title="gini"
    ),
)

error + mean_line

In [None]:
# alt.layer(pdf, distrib, data=grouped_selp_deg).facet(column='q')
alt.layer(error, mean_line, data=df_fraction_1000).facet(column='strat')

In [None]:
alt.Chart(df_fraction2).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean(gini)",
        # title="Hyperedge Purity"
        # title="max(n1, n2) / n1 + n2"
        title="mean(Gini*)"
    ),
    # color=alt.Color("N / E:N", scale=alt.Scale(scheme="category10"))
    # color=alt.Color("N / E:N", scale=alt.Scale()),
    strokeDash="N / E"
).facet(column='strat')

In [None]:
# frequent => mettre majority
# Augmenter la taille des labels, axis, 

gini_strat = alt.Chart(df_fraction_1000).mark_line(color=markColor, opacity=1, size=3).encode(
    x="q / p",
    y=alt.Y(
        "mean(gini)",
        # title="Hyperedge Purity"
        # title="max(n1, n2) / n1 + n2"
        # title="normalized gini",
        title="Ḡ"
    ),
    # color=alt.Color("N / E:N", scale=alt.Scale(scheme="category10"))
    # color=alt.Color("N / E:N", scale=alt.Scale()),
    strokeDash=alt.StrokeDash("strat", scale=alt.Scale(range=[[1,0], [4, 1], [8, 2], [4, 8]]), legend=alt.Legend(title="Strategy", orient="top-right", symbolSize=750, symbolStrokeWidth=3)),
).configure_legend(
titleFontSize=20,
labelFontSize=16,
).properties(height=330).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

gini_strat

In [None]:
# mettre la legende à l'interieur
# Utiliser couleur et stroke?
gini_size = alt.Chart(df_fraction2[df_fraction2.strat == "weighted"]).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean(gini)",
        # title="Hyperedge Purity"
        # title="max(n1, n2) / n1 + n2"
        # title="normalized gini",
        title="Ḡ"
    ),
    # color=alt.Color("N / E:N", scale=alt.Scale(scheme="category10"))
    color=alt.Color("N / E:O", scale=alt.Scale(scheme="blues"), legend=alt.Legend(orient="top-right")),
    # strokeDash="strat"
).configure_legend(
titleFontSize=20,
labelFontSize=16,
).properties(height=330).configure_axisY(
    titleAngle=0,
    titleX=-60,
)


gini_size

In [52]:
# Weird legend position
# alt.hconcat(gini_strat, gini_size)

## Normalize 

In [110]:
df_fraction2_var = (df_fraction2
    # .groupby('q / p')
    .groupby(['q / p', "N / E", "strat"])
    # ['value']
    ['gini']
    .agg(['mean', 'std'])
    .assign(stdNorm = lambda df: df['std'] / df['mean'])
    .reset_index())

In [111]:
df_fraction2_var[(df_fraction2_var.strat == "max") & (df_fraction2_var["N / E"] == 10)]

Unnamed: 0,q / p,N / E,strat,mean,std,stdNorm
21,0.0,10.0,max,1.0,0.0,0.0
45,0.01,10.0,max,0.913132,0.138607,0.151793
69,0.05,10.0,max,0.648382,0.220581,0.340203
93,0.1,10.0,max,0.450826,0.205899,0.456715
117,0.2,10.0,max,0.273107,0.145933,0.534344
141,0.3,10.0,max,0.205392,0.104034,0.506515
165,0.4,10.0,max,0.177851,0.084242,0.473667
189,0.5,10.0,max,0.164607,0.074527,0.452755
213,0.6,10.0,max,0.157176,0.069064,0.439408
237,0.7,10.0,max,0.150743,0.066062,0.438243


In [112]:
# Figure 6. Essayer en couleurs. Enlever le strat. Double colones
# Pour le max de strat max: metrre plus de point
# Tracer en fct de E / N en faisant varier N
alt.Chart(df_fraction2_var).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "stdNorm",
        # "std",
        title="δ"
    ),
    color=alt.Color("N / E:O", scale=alt.Scale(scheme="blues")),
    # strokeDash=alt.StrokeDash("N / E:O", legend=alt.Legend(symbolSize=750))
).facet(
    facet=alt.Facet("strat", header=alt.Header(title="Strategy", labelFontSize=18, titleFontSize=20))
       ).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [159]:
alt.Chart(df_fraction2_var).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "std",
        # "std",
        title="σ"
    ),
    color=alt.Color("N / E:O", scale=alt.Scale(scheme="blues")),
    # strokeDash=alt.StrokeDash("N / E:O", legend=alt.Legend(symbolSize=750))
).facet(
    facet=alt.Facet("strat", header=alt.Header(title="Strategy", labelFontSize=18, titleFontSize=20))
       ).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [300]:
alt.Chart(df_fraction2_var).mark_line().encode(
    x="q / p",
    y=alt.Y(
        "mean",
        # "std",
        title="mean"
    ),
    color=alt.Color("N / E:O", scale=alt.Scale(scheme="blues")),
    # strokeDash=alt.StrokeDash("N / E:O", legend=alt.Legend(symbolSize=750))
).facet(
    facet=alt.Facet("strat", header=alt.Header(title="Strategy", labelFontSize=18, titleFontSize=20))
       ).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [79]:
# df_fraction2_max = (df_fraction2_var
#     .groupby(["N / E", "strat"])
#     ['stdNorm']
#     .agg(['max'])
#     # .assign(max = lambda df: df['std'] / df['mean'])
#     .reset_index())

In [80]:
# # Tracer en fct de E / N en faisant varier N
# alt.Chart(df_fraction2_max).mark_line().encode(
#     x="N / E",
#     y=alt.Y(
#         "max",
#         # title="Hyperedge Purity"
#         # title="max(n1, n2) / n1 + n2"
#         title="gini*",
#     )
# ).facet("strat")

In [505]:
# Pour quelques valeurs de q /p, tracer la distribution de la valeur
# Tracer la variance de la quantite vs q/p

In [506]:
df_fraction2

Unnamed: 0,q,gini,strat,q / p,N / E
0,0.0000,1.000000,weighted,0.00,1.0
1,0.0000,1.000000,weighted,0.00,1.0
2,0.0000,1.000000,weighted,0.00,1.0
3,0.0000,1.000000,weighted,0.00,1.0
4,0.0000,1.000000,weighted,0.00,1.0
...,...,...,...,...,...
6237590,0.0015,0.833333,frequent,0.05,10.0
6237591,0.0015,0.955556,frequent,0.05,10.0
6237592,0.0015,0.925926,frequent,0.05,10.0
6237593,0.0015,0.789474,frequent,0.05,10.0


In [101]:
df_fraction2[(df_fraction2["q / p"].isin([0.2])) & (df_fraction2["N / E"].isin([1]))]

Unnamed: 0,q,gini,strat,q / p,N / E
799614,0.006,0.833333,weighted,0.2,1.0
799615,0.006,1.000000,weighted,0.2,1.0
799616,0.006,1.000000,weighted,0.2,1.0
799617,0.006,0.866667,weighted,0.2,1.0
799618,0.006,0.833333,weighted,0.2,1.0
...,...,...,...,...,...
1119436,0.006,1.000000,majority,0.2,1.0
1119437,0.006,1.000000,majority,0.2,1.0
1119438,0.006,0.833333,majority,0.2,1.0
1119439,0.006,0.666667,majority,0.2,1.0


In [None]:
alt.Chart(df_fraction2[(df_fraction2["q / p"].isin([0.2])) & (df_fraction2["N / E"].isin([10, 1, 2.5]))]).transform_density(
    'gini',
    groupby=['N / E', "strat"],
    as_=['value', 'density'],
).mark_line().encode(
    x=alt.X("value:Q", title="G"),
    y=alt.Y('density:Q', title="P(G)"),
    color=alt.Color("N / E:O", scale=alt.Scale(scheme="blues"))
).facet(facet=alt.Facet("strat", header=alt.Header(title="Strategy", labelFontSize=18, titleFontSize=20))
       ).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [None]:
alt.Chart(df_fraction2[(df_fraction2["q / p"].isin([0.05, 0.1, 0.2, 0.3, 0.4])) & (df_fraction2["N / E"].isin([10])) & (df_fraction2["strat"] == "max")]).transform_density(
    'gini',
    groupby=['q / p'],
    as_=['value', 'density'],
).mark_line().encode(
    x=alt.X("value:Q", title="G"),
    y=alt.Y('density:Q', title="P(G)"),
    color=alt.Color("q / p:N", scale=alt.Scale(scheme="purples",  domainMax=1, domainMin=0.1))
).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [162]:
# gini_plot = alt.Chart(df_fraction2[(df_fraction2.N_nodes == 1000) & (~df_fraction2["q / p"].isin([0, 0.01]))]).transform_density(
# gini_plot = alt.Chart(df_fraction2[(df_fraction2["N / E"] == 5) & (~df_fraction2["q / p"].isin([0, 0.01]))]).transform_density(
gini_plot = alt.Chart(df_fraction2[(df_fraction2["N / E"] == 10) & (~df_fraction2["q / p"].isin([0, 0.01]))]).transform_density(
    'gini',
    groupby=['q / p', "strat"],
    as_=['gini', 'density'],
    # counts=True
).mark_line().encode(
    x=alt.X("gini:Q", title="G"),
    y=alt.Y('density:Q', title="P(G)", scale=alt.Scale(domainMax=8)),
    color=alt.Color("q / p:N",  scale=alt.Scale(scheme="purples"))
)

In [None]:
gini_plot.facet(facet=alt.Facet("strat", header=alt.Header(title="Strategy", labelFontSize=18, titleFontSize=20))).configure_axisY(
    titleAngle=0,
    titleX=-60,
)

In [244]:
gini_plot = alt.Chart(df_fraction2["q / p"].isin([0.2])).transform_density(
    'gini',
    groupby=['q / p', "strat"],
    as_=['gini', 'density'],
    counts=True
).mark_line().encode(
    x=alt.X("gini:Q", title="gini"),
    y='density:Q',
    color=alt.Color("strat",  scale=alt.Scale(scheme="greys"))
)

# Distrib effectives

In [None]:
df_fits

## Degree distrib fit

In [None]:
points = alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff', title="p_eff"),
).properties(
    width=800,
    height=300
)

In [None]:
regresssion = points.transform_regression('q', 'peff', method="poly").mark_line(color="black")

In [None]:
line = alt.Chart(pd.DataFrame({'y': [p]})).mark_rule().encode(y='y')

In [None]:
# points + regresssion + line
# Pour plusieurs valeur de N
points + line

### Normalize

In [173]:
df_fits["peff / p"] = df_fits["peff"] / df_fits["p"] 
df_fits["p_h_eff / p"] = df_fits["p_hsize_eff"] / df_fits["p"] 
df_fits["q / p"] = df_fits["q"] / df_fits["p"] 

df_fits["Neff / Nedges"] = df_fits["Neff"] / N_edges
df_fits["NeffHsize / N"] = df_fits["n_eff_hsize"] / N

In [174]:
df_fits2["q / p"] = df_fits2["q"] / df_fits2["p"] 
df_fits2["peff / p"] = df_fits2["peff"] / df_fits2["p"] 
df_fits2["neff / Nedges"] = df_fits2["Neff"] / N_edges
df_fits2["neff / N"] = df_fits2["Neff"] / N

## Old

In [None]:
points = alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff / p'),
).properties(
    width=800,
    height=300
)

points

In [None]:
points_Neff = alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('Neff / Nedges'),
).properties(
    width=800,
    height=300
)

points_Neff

In [None]:
alt.Chart(df_fits).mark_line().encode(
    x=alt.X('peff / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('Neff / Nedges'),
).properties(
    width=800,
    height=300
)

In [None]:
alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('Neffpfixed'),
).properties(
    width=800,
    height=300
)

In [None]:
# Fitter par une droite (rouge)
# Point plus grand,

alt.Chart(df_fits2[df_fits2.measure == "degree"]).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('neff / Nedges'),
    color="bounds"
).properties(
    width=800,
    height=300
)

In [None]:
alt.Chart(df_fits2[df_fits2.measure == "degree"]).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff / p'),
    color="bounds"
).properties(
    width=800,
    height=300
)

In [None]:
df_fits2

## Hsize fits

In [None]:
points = alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('p_h_eff / p'),
).properties(
    width=800,
    height=300
)

points

In [None]:
points_Neff = alt.Chart(df_fits).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y("NeffHsize / N"),
).properties(
    width=800,
    height=300
)

points_Neff

## Several bounds

In [111]:
chart = alt.Chart(df_fits2[df_fits2.measure == "hsize"]).mark_point().encode(
    x=alt.X('q / p'),
    # x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('neff / N'),
    # color="bounds"
).properties(
    width=800,
    height=300
)

# chart

In [444]:
chart + chart.transform_regression('q / p', 'neff / N').mark_line()

In [445]:
alt.Chart(df_fits2[df_fits2.measure == "hsize"]).mark_point().encode(
    x=alt.X('q / p'),
    # x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff / p'),
    color="bounds"
).properties(
    width=800,
    height=300
)

In [175]:
df = df_fits2[(df_fits2.measure == "hsize") & (df_fits2.bounds == "p Fixed") & (~df_fits2["q / p"].isin([0.01, 0.05]))]
chart = alt.Chart(df).mark_point(size=80, opacity=0.8, fill=markColor, stroke="black").encode(
# chart = alt.Chart(df).mark_point(size=60, opacity=0.7).encode(
    x=alt.X('q / p', axis=alt.Axis(format='')),
    # x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('neff / N', title="N* / N", scale=alt.Scale(domain=[0, 1.1])),
).properties(
    width=600,
    height=300
)

In [176]:
hsize_chart = chart.transform_regression('q / p', 'neff / N').mark_line(color=markColor) + chart
hsize_chart

In [177]:
df = df_fits2[(df_fits2.measure == "degree") & (df_fits2.bounds == "p Fixed") & (~df_fits2["q / p"].isin([0.01, 0.05]))]
chart2 = alt.Chart(df).mark_point(size=80, opacity=0.8, fill=markColor, stroke="black", shape="square").encode(
    # x=alt.X('q / p', sort=q_frac_order, axis=None),
    x=alt.X('q / p', axis=alt.Axis(format='')),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('neff / Nedges', title="E* / E", scale=alt.Scale(domain=[0, 1.1])),
).properties(
    width=600,
    height=300
)

In [178]:
reg_degree = chart2.transform_regression('q / p', 'neff / Nedges', params=False).mark_line(color=markColor)

In [179]:
degree_chart = reg_degree + chart2 
degree_chart

In [180]:
# Faire varier l'axe en 0.10
# appeler Neff et Eeff

degree_chart | hsize_chart
# alt.layer(degree_chart, hsize_chart)

### Regression coeffs

In [199]:
import altair_transform

In [202]:
reg = chart2.transform_regression('q / p', 'neff / Nedges', params=True)
# print(altair_transform.extract_data(reg))

In [209]:
chart2.transform_regression(
    'q / p', 'neff / Nedges', params=True
).transform_calculate(
    intercept='datum.coef[0]',
    slope='datum.coef[1]',
).mark_text(align='left').encode(
    x=alt.value(20),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='slope:N'
)

In [211]:
reg2 = chart.transform_regression('q / p', 'neff / N', params=True)
# print(altair_transform.extract_data(reg2))

In [214]:
chart.transform_regression(
    'q / p', 'neff / N', params=True
).transform_calculate(
    intercept='datum.coef[0]',
    slope='datum.coef[1]',
).mark_text(align='left').encode(
    x=alt.value(20),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='intercept:N'
)

In [None]:
# Fixer p est plus facilement explicable: plutot parler de ca dans le papier