In [106]:
import json
import math
import networkx as nx
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import distfit
import os


from generator_v2 import Generator

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [148]:
# define the theme by returning the dictionary of configurations

font = "monospace"
# Axes
axisColor = "#000000"
gridColor = "#DEDDDD"
# Colors
main_palette = ["#1696d2", 
                "#d2d2d2",
                "#000000", 
                "#fdbf11", 
                "#ec008b", 
                "#55b748", 
                "#5c5859", 
                "#db2b27", 
               ]
sequential_palette = ["black", 
                      "#a2d4ec", 
                      "#73bfe2", 
                      "#46abdb", 
                      "#1696d2", 
                      "#12719e", 
                     ]


# markColor = "#8856a7"
# markColor = "#6a51a3"
markColor = "#9ebcda"

# Define custom color schemes for ordinal and categorical data
categorical_color_scheme = ['red', 'green', 'blue', 'purple', 'orange']
ordinal_color_scheme = ["#fcfbfd",
"#efedf5",
"#dadaeb",
"#bcbddc",
"#9e9ac8",
"#807dba",
"#6a51a3",
"#4a1486"]


def simple_theme():
    return {
        'config': {
            'view': {
                'height': 300,
                'width': 300,
            },
            "title": {
                    "fontSize": 18,
                    "font": font,
                    "anchor": "start", # equivalent of left-aligned.
                    "fontColor": "#000000"
                },
            "facet": {
                "labelFontSize": 16,
                "titleFontSize": 20  # Adjust the fontsize as needed
            },
            "axisX": {
                    "labelFont": font,
                    "labelFontSize": 16,
                    "titleFontSize": 20,
                    "grid": False,
                },
                "axisY": {
                    "domain": False,
                    "grid": False,
                    "labelFont": font,
                    "labelFontSize": 16,
                    "titleFontSize": 20,
                },
            'mark': {
                'color': markColor,
                'fill': markColor
            },
            "line": {
                "fill": None,
               "stroke": "black",
           },
            "scale": {"color": {"scheme": "blue"}},
            "legend": {

                "titleFontSize": 20,
                "labelFontSize": 16
            }            
        }
    }


# alt.themes.register("theme", theme)
alt.themes.register("theme", simple_theme)
alt.themes.enable("theme")
#alt.themes.enable("ggplot2")

ThemeRegistry.enable('theme')

In [149]:
N = 1000
N_edges = 200
N_coms = 4
# p = 300 / N
p = 0.1

q = 0.1 * p

# qs = [1, 0.1, 0.01, 0.001]
qs = [0.1, 0.01]


community_array = []
for i in range(N_coms):
    community_array += [i for x in range(N // N_coms)]

In [150]:
p

0.1

In [151]:
all_hsizes = []
q_to_fit = {}


df_hsizes = pd.DataFrame(columns=["hsize", "q / p"], dtype=int)

dist = stats.binom

for q_frac in qs:
    print(q_frac)
    q = q_frac * p

    all_hsizes = []
    for i in range(100):

        gen = Generator(N, N_edges, N_coms, p, q, community_array, "min")
        gen.run()
        hsizes = dict(gen.hyperedge_sizes()).values()
        all_hsizes = all_hsizes + list(hsizes)
    
        # bounds = [(N / N_coms, N), (p, p)]
        bounds = [(0, N), (p, p)]
        # bounds = [(0, N), (0, p)]

        
    res = stats.fit(dist, all_hsizes, bounds)
    peff_hsize_pfixed = res.params[1]
    neff_hsize_pfixed = res.params[0]

    q_to_fit[q_frac] = res

    df = pd.DataFrame({"hsize": all_hsizes, "q / p": [q_frac] * len(all_hsizes)})
    df_hsizes = pd.concat([df_hsizes, df])
    

0.1
0.01


## Fit

In [152]:
grouped = df_hsizes.groupby('q / p')['hsize'].value_counts(normalize=True).reset_index(name='Percentage')

In [153]:
# Add COlumn
grouped["pdf"] = None

In [154]:
np.sum(grouped.Percentage)

2.0000000000000004

In [155]:
grouped[(grouped["hsize"] == 74) & (grouped["q / p"] == 0.001)]

Unnamed: 0,q / p,hsize,Percentage,pdf


In [156]:
x = range(min(df_hsizes.hsize), max(df_hsizes.hsize))
x

range(2, 45)

In [157]:
# Generate points for the fitted distribution
x = range(min(df_hsizes.hsize), max(df_hsizes.hsize))


for q_frac in qs:
    res = q_to_fit[q_frac]
    for i in x:

        pdf_val = stats.binom.pmf(i, res.params[0], res.params[1])
        # print(i, pdf_val)
    

        # print(grouped[((grouped["q / p"] == q_frac) & (grouped["hsize"] == i))])
        grouped.loc[(grouped["q / p"] == q_frac) & (grouped["hsize"] == i), "pdf"] = pdf_val

# pdf_fitted = stats.binom.pmf(x, neff_hsize_pfixed, peff_hsize_pfixed)

# # Convert data to a DataFrame for Altair
# df_data = pd.DataFrame({'value': all_hsizes})
# df_pdf = pd.DataFrame({'x': x, 'pdf_fitted': pdf_fitted})

In [158]:
grouped.head()

Unnamed: 0,q / p,hsize,Percentage,pdf
0,0.01,24,0.05065,0.05042
1,0.01,25,0.048,0.038319
2,0.01,26,0.04665,0.027839
3,0.01,23,0.04595,0.063318
4,0.01,22,0.04525,0.075762


In [159]:
grouped.loc[grouped["q / p"] == 0.01, "pdf"] = None

## Plot

In [160]:
# distrib = alt.Chart(grouped_selp).mark_bar(width=5,  opacity=0.7, color="black").encode(
distrib = alt.Chart(grouped).mark_bar().encode(
        x=alt.X('hsize:Q', title="m", axis=alt.Axis(tickMinStep=5)),
        y=alt.Y('Percentage:Q', title="P(m)"),
        # facet=alt.Facet("q / p:O")
        # color="q"
    )

fit = alt.Chart(grouped).mark_line().encode(
        x=alt.X('hsize:Q'),
        y=alt.Y('pdf:Q'),
        # facet=alt.Facet("q / p:O")
    )

In [161]:
alt.layer(distrib, fit, data=grouped).facet(
    facet=alt.Facet(
        'q / p:O',
        header=alt.Header(labelFontSize=18, titleFontSize=20)
    ),
    columns=2
).configure_axisY(
    titleAngle=0,
    titleX=-80,
)

In [None]:
# Essayer de fit une loi gamma?
# Simulations sur l'ordre des noeuds
# Courbe de Complexité sur quelques points

# One Simulation

In [208]:
all_hsizes = []

dist = stats.binom

p = 0.3
q = p * 0.005

N = 1000
N_edges = 300

for i in range(20):
    gen = Generator(N, N_edges, N_coms, p, q, community_array, "frequent")
    gen.run()
    hsizes = dict(gen.hyperedge_sizes()).values()
    all_hsizes = all_hsizes + list(hsizes)

In [209]:
print(p, q)

0.3 0.0015


## fit

In [210]:
dist = stats.binom

In [211]:
bounds = [(0, N), (p, p)]

res = stats.fit(dist, all_hsizes, bounds)
peff_hsize_pfixed = res.params[1]
neff_hsize_pfixed = res.params[0]
print(peff_hsize_pfixed, neff_hsize_pfixed)

0.3 256.0


In [212]:
# Step 2: Generate points for the fitted log-normal distribution
# x = np.linspace(min(all_hsizes), max(all_hsizes), 1000)
# x = range(100)
x = range(max(all_hsizes))

pdf_fitted = stats.binom.pmf(x, neff_hsize_pfixed, peff_hsize_pfixed)

# Convert data to a DataFrame for Altair
df_data = pd.DataFrame({'value': all_hsizes})
df_pdf = pd.DataFrame({'x': x, 'pdf_fitted': pdf_fitted})

## Plot

In [213]:
df_hsizes = pd.DataFrame(all_hsizes, columns=["hsize"], dtype=int)

In [214]:
grouped = df_hsizes['hsize'].value_counts(normalize=True).reset_index(name='Percentage')

In [215]:
distrib = alt.Chart(grouped).mark_bar().encode(
        x=alt.X('hsize:Q', title="m", axis=alt.Axis(tickMinStep=5)),
        y=alt.Y('Percentage:Q'),
        # y="count()",
    )

fit = alt.Chart(df_pdf).mark_line(stroke='red').encode(
        x=alt.X('x:Q'),
        y=alt.Y('pdf_fitted'),
    )

In [216]:
distrib + fit

In [197]:
gen.hyperedges_types()[0:20]

['mixed',
 'mixed',
 'mixed',
 'mixed',
 'mixed',
 'mixed',
 'mixed',
 'mixed',
 'mixed',
 'pure',
 'mixed',
 'pure',
 'pure',
 'mixed',
 'mixed',
 'pure',
 'pure',
 'pure',
 'mixed',
 'pure']

In [198]:
print(gen.hyperedges_composition()[0:20])

[[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 3, 3, 3, 3, 2, 0, 0, 2, 3, 2, 2, 3, 2, 0, 3, 2, 3, 0, 0, 2, 0, 2, 3, 0, 3, 0, 3, 0, 2, 0, 2, 2, 2, 3, 3, 0, 2, 2, 2, 0, 3, 2, 0, 0, 2, 0, 2, 2, 0, 0, 3, 0, 3, 3, 0, 3, 2, 3, 3, 0, 0, 3, 2, 3, 3, 2, 0, 0, 3, 3, 0, 0, 0, 3, 3, 3, 2, 3, 0, 3, 2, 3, 0, 3, 2, 3, 0, 2, 0, 2, 2, 3, 3, 3, 0, 2, 2, 0, 0, 3, 3, 3, 0, 3, 3, 0, 0, 2, 2, 0, 2], [3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 1, 3, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 1, 3, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 1, 1, 0, 3, 0, 3, 3, 0, 1, 3, 3, 0, 1, 3, 0, 3, 0, 1, 1, 3, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 3, 0, 1, 0, 0, 3, 0, 1, 0, 3, 0, 0, 0, 1, 3, 3, 0, 0, 3, 0, 3, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 0, 3, 0, 1, 3, 0, 1, 0, 3, 0, 2, 2, 0, 2, 3, 1, 2, 0, 0, 0, 1, 2, 3, 1, 3, 2, 3, 0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 1, 0, 

## LogNorm fit (not working)

In [388]:
data = list(df_hsizes.hsize)
log_data = np.log(data)

# Step 2: Estimate parameters of the log-normal distribution
# Fit the data to a normal distribution (because log(data) is normally distributed)
shape, loc, scale = stats.lognorm.fit(data, floc=0)  # 'floc=0' fixes location parameter to zero

# Parameters of the fitted log-normal distribution
print(f"Shape (sigma): {shape}")
print(f"Location (loc): {loc}")
print(f"Scale (exp(mu)): {scale}")

Shape (sigma): 0.80215141513417
Location (loc): 0
Scale (exp(mu)): 25.56240338039136


In [405]:
# Step 2: Generate points for the fitted log-normal distribution
x = np.linspace(min(data), max(data), 1000)
pdf_fitted = stats.lognorm.pdf(x, shape, loc, scale)


# Convert data to a DataFrame for Altair
df_data = pd.DataFrame({'value': data})
df_pdf = pd.DataFrame({'x': x, 'pdf_fitted': pdf_fitted})

In [434]:
distrib2 = alt.Chart(df_hsizes).mark_bar().encode(
        # x=alt.X('hsize:O', title="Hyperedge Size", axis=alt.Axis(tickMinStep=10), scale=alt.Scale(domain=list(range(3, 51)), padding=0.1, paddingInner=0.1)),
        x=alt.X('hsize:Q', title="m", axis=alt.Axis(tickMinStep=5)),
        # x=alt.X('hsize:Q', title="m").bin(maxbins=40),
        y=alt.Y('count()'),
        # y="count()",
        # facet=alt.Facet("q:O")
        # color="q"
    )

fit = alt.Chart(df_pdf).mark_line(stroke='red').encode(
        x=alt.X('x:Q'),
        y=alt.Y('pdf_fitted'),
    )

In [435]:
distrib + fit