In [1]:
import json
import math
import networkx as nx
import pandas as pd
import altair as alt
import numpy as np
from scipy import stats

from generator_v2 import Generator

In [2]:
COLOR = "#99d8c9"

In [3]:
alt.__version__

'5.1.1'

In [4]:
N = 500
N_edges = N // 2
N_coms = 2
sampling_strat = "weighted"
# sampling_strat = "max"
p = 20/N
q = 3/N

In [5]:
community_array = [0 for x in range(N//2)]  + [1 for x in range(N//2)]

# Node Degree

In [6]:
gen = Generator(N, N_edges, N_coms, p, q, community_array, sampling_strat)
gen.run()

In [7]:
degrees = dict(gen.degrees()).values()

In [8]:
degrees_df = pd.DataFrame(degrees, columns=["degree"])

In [9]:
degrees_df

Unnamed: 0,degree
0,9
1,4
2,12
3,7
4,6
...,...
495,5
496,3
497,8
498,7


In [10]:
alt.Chart(degrees_df).mark_bar().encode(
    alt.X("degree:Q"),
    y='count()',
)

# Node Degree Multi Simulation

In [11]:
N_sim = 20

In [12]:
df_sim = pd.DataFrame(columns=["count", "simNumber"], dtype=int)
for i in range(N_sim):
    gen = Generator(N, N // 2, N_coms, 20/N, 3/N, community_array, sampling_strat)
    gen.run()
    degrees = dict(gen.degrees()).values()
    degrees_df = pd.DataFrame(degrees, columns=["degree"])

    countdf = degrees_df.groupby(['degree'])['degree'].count()
    countdf = countdf.to_frame().rename(columns={"degree": "count"})
    countdf["simNumber"] = i
    
    df_sim = pd.concat([df_sim, countdf])

In [13]:
df_sim = df_sim.reset_index(names="degree")
# df_sim.rename(columns={}

In [14]:
df_sim

Unnamed: 0,degree,count,simNumber
0,0,2,0
1,1,3,0
2,2,19,0
3,3,48,0
4,4,68,0
...,...,...,...
292,9,44,19
293,10,21,19
294,11,9,19
295,12,7,19


In [15]:
bars = alt.Chart(df_sim).mark_bar(color=COLOR).encode(
    alt.X("degree:Q", scale=alt.Scale(domain=[0, 18])),
    alt.Y("mean(count):Q"),
)

In [16]:
error = alt.Chart(df_sim).mark_errorbar(extent="ci", rule=True).encode(
    x=alt.X("degree:Q", scale=alt.Scale(domain=[0, 18])),
    y=alt.Y(
        "count:Q",
        scale=alt.Scale(zero=False),
        title="Absolute Frequency"
    ),
)

In [17]:
bars + error

# Hyperedge Size

In [18]:
hsizes = dict(gen.hyperedge_sizes()).values()

In [19]:
hsizes

dict_values([11, 14, 10, 12, 12, 19, 14, 19, 15, 9, 9, 11, 12, 15, 15, 16, 14, 6, 14, 11, 18, 16, 16, 16, 4, 10, 14, 17, 12, 13, 15, 12, 13, 8, 15, 10, 11, 10, 13, 12, 13, 10, 12, 17, 12, 14, 13, 9, 11, 11, 17, 10, 13, 21, 13, 13, 8, 10, 11, 14, 13, 8, 13, 14, 15, 14, 14, 7, 12, 14, 16, 11, 12, 14, 9, 13, 6, 21, 20, 11, 13, 16, 22, 10, 12, 9, 15, 12, 14, 17, 10, 10, 13, 10, 17, 8, 12, 8, 15, 7, 12, 17, 15, 16, 12, 12, 19, 10, 9, 7, 15, 15, 13, 9, 14, 12, 15, 15, 18, 16, 13, 9, 11, 6, 14, 12, 10, 14, 12, 11, 19, 9, 11, 12, 11, 12, 12, 14, 8, 17, 7, 8, 11, 19, 13, 10, 12, 10, 8, 11, 16, 18, 10, 12, 7, 14, 10, 13, 9, 12, 10, 4, 9, 14, 9, 9, 13, 8, 13, 18, 9, 8, 18, 14, 12, 16, 9, 15, 10, 17, 13, 8, 20, 14, 5, 13, 8, 15, 5, 11, 11, 13, 6, 9, 9, 15, 11, 14, 16, 16, 14, 8, 9, 8, 15, 11, 13, 9, 11, 10, 13, 17, 12, 18, 11, 10, 15, 10, 17, 15, 11, 10, 12, 13, 17, 12, 13, 14, 14, 17, 7, 10, 11, 13, 10, 19, 11, 13, 12, 9, 5, 9, 13, 11, 9, 20, 16, 12, 14, 18])

In [20]:
hsizes_df = pd.DataFrame(hsizes, columns=["hsize"])

In [21]:
alt.Chart(hsizes_df).mark_bar().encode(
    alt.X("hsize:Q"),
    y='count()',
)

## Hyperedge Sim

In [22]:
df_sim = pd.DataFrame(columns=["count", "simNumber"], dtype=int)
for i in range(N_sim):
    gen = Generator(N, N_edges, N_coms, p, q, community_array, sampling_strat)
    gen.run()
    hsizes = dict(gen.hyperedge_sizes()).values()
    hsizes_df = pd.DataFrame(hsizes, columns=["hsize"])

    countdf = hsizes_df.groupby(['hsize'])['hsize'].count()
    countdf = countdf.to_frame().rename(columns={"hsize": "count"})
    countdf["simNumber"] = i
    
    df_sim = pd.concat([df_sim, countdf])

In [23]:
df_sim = df_sim.reset_index(names="hsize")

In [24]:
df_sim.head()

Unnamed: 0,hsize,count,simNumber
0,4,2,0
1,5,3,0
2,6,3,0
3,7,9,0
4,8,10,0


In [25]:
bars = alt.Chart(df_sim).mark_bar(color=COLOR).encode(
    alt.X("hsize:Q", scale=alt.Scale(domain=[0, 25]), title="Hyperedge Size"),
    alt.Y("mean(count):Q"),
)

error = alt.Chart(df_sim).mark_errorbar(extent="ci", rule=True).encode(
    x=alt.X("hsize:Q", scale=alt.Scale(domain=[0, 25])),
    y=alt.Y(
        "count:Q",
        scale=alt.Scale(zero=False),
        title="Absolute Frequency"
    ),
)

In [26]:
bars + error

# p effectif tests

In [27]:
# Bounds of n and p effectif
bounds = [(N / 2, N), (q, p)]
dist = stats.binom
res = stats.fit(dist, list(degrees), bounds)

In [28]:
res

  params: FitParams(n=268.0, p=0.023119397440310774, loc=0.0)
 success: True
 message: 'Optimization terminated successfully.'

In [29]:
res.params[1]

0.023119397440310774

In [30]:
p

0.04

# Fraction Dist

In [31]:
# Faire pareil sur 100/1000 réseaux
# Accumuler la liste des degrées/hsize
# Une value de p/q => plusieurs simulations => faire le fit la dessus

In [32]:
p_init = 30 / N
q_init = 30 / N

In [33]:
q_init

0.06

In [59]:
p = p_init
q = q_init 

df_degrees = pd.DataFrame(columns=["degree"], dtype=int)
df_hsizees = pd.DataFrame(columns=["hsize"], dtype=int)

df_sim = pd.DataFrame(columns=["sim", "type", "count"], dtype=int)
df_fraction = pd.DataFrame(columns=["sim", "count", "fraction0"], dtype=int)
df_peff = pd.DataFrame(columns=["peff", "p", "q", "q_frac"], dtype=int)

increment = 0.05
N_sim = int(1 / increment) + 1
q_frac_order = []
N_graphs = 10

q_to_degreefit = {}
q_to_hsizefit = {}

for i in range(N_sim):
    q = round(q_init - (p * increment * i), 4)
    print(i, p, q)
    
    q_frac = f"{round(1 - (increment * (i)), 3)}p"
    q_frac_order.append(q_frac)
    
#     We can fit on several graphs at the same time
    all_degrees = []
    all_hsizes = []    
    
    for n_graph in range(N_graphs):
        gen = Generator(N, N_edges, N_coms, p, q, community_array, sampling_strat)
        gen.run()
        comp = gen.hyperedges_types()
        n_pure = comp.count("pure")
        n_mixed = comp.count("mixed")
    
        df = pd.DataFrame({"sim": [i, i], "q": [q_frac, q_frac], "type": ["pure", "mixed"], "count": [n_pure, n_mixed]})
        df_sim = pd.concat([df_sim, df])
    
    #     For fraction distribution of mixed edges
        comp = gen.mixed_he_fraction_to_count()
        for fraction, count in comp.items():
            df = pd.DataFrame({"sim": [i], "q": [q_frac], "count": [count], "fraction0": fraction})
            df_fraction = pd.concat([df_fraction, df])
        
        # for peffectif computation
        degrees = dict(gen.degrees()).values()
        hsizes = dict(gen.hyperedge_sizes()).values()
        
        all_degrees = all_degrees + list(degrees)
        all_hsizes = all_hsizes + list(hsizes)
    
    
    df = pd.DataFrame({"degree": all_degrees, "q": q_frac})
    df_degrees = pd.concat([df_degrees, df])
    
    
    # Distrib of degree
    # Bounds of n and p effectif
    # bounds = [(N_edges, N_edges), (q, p)]
    bounds = [(0, N_edges), (q, p)]
    
    # dist = stats.binom
    # res = stats.fit(dist, list(degrees), bounds)
    # peff = res.params[1]
    # df = pd.DataFrame({"peff": [peff], "p": [p], "q": [q], "q_frac": [q_frac]})
    # df_peff = pd.concat([df_peff, df])
    # neff_degree = res.params[0]
    
    dist = stats.binom
    res = stats.fit(dist, all_degrees, bounds)
    peff = res.params[1]
    df = pd.DataFrame({"peff": [peff], "p": [p], "q": [q], "q_frac": [q_frac]})
    df_peff = pd.concat([df_peff, df])
    neff_degree_all = res.params[0]
    
    q_to_degreefit[q] = res
    
    # Distrib of hsizes
    # bounds = [(N / N_coms, N), (q, p)]
    # dist = stats.binom
    # res = stats.fit(dist, list(hsizes), bounds)
    # peffhsize = res.params[1]
    # neff_hsize = res.params[0]
    
    # Distrib of hsizes
    bounds = [(N / N_coms, N), (q, p)]
    dist = stats.binom
    res = stats.fit(dist, all_hsizes, bounds)
    peffhsize = res.params[1]
    neff_hsize = res.params[0]
    
    q_to_hsizefit[q] = res
    
    print("PEFF ", peff, peffhsize)
    print("NEFF ", neff_degree, neff_degree_all, neff_hsize)    

0 0.06 0.06
PEFF  0.06 0.06
NEFF  210.0 250.0 500.0
1 0.06 0.057
PEFF  0.06 0.06
NEFF  210.0 250.0 500.0
2 0.06 0.054
PEFF  0.05904319487354867 0.059881537818708926
NEFF  210.0 250.0 493.0
3 0.06 0.051
PEFF  0.05702959503697557 0.05952984834275478
NEFF  210.0 250.0 479.0
4 0.06 0.048
PEFF  0.057326224696105536 0.05901940800564885
NEFF  210.0 244.0 474.0
5 0.06 0.045
PEFF  0.05542938268755883 0.05840946039980741
NEFF  210.0 245.0 465.0
6 0.06 0.042
PEFF  0.0533902784297376 0.05848070468903851
NEFF  210.0 247.0 451.0
7 0.06 0.039
PEFF  0.05312364636117557 0.05172847987499058
NEFF  210.0 241.0 495.0
8 0.06 0.036
PEFF  0.05125207348181549 0.05715667708808941
NEFF  210.0 242.0 434.0
9 0.06 0.033
PEFF  0.0521887396624973 0.05953382221934028
NEFF  210.0 231.0 405.0
10 0.06 0.03
PEFF  0.04740728241081827 0.05684271444792677
NEFF  210.0 247.0 412.0
11 0.06 0.027
PEFF  0.05553365362519108 0.059139735236962185
NEFF  210.0 205.0 385.0
12 0.06 0.024
PEFF  0.0592983783971765 0.059138539524025875
NEF

## Degree histograms with fit

In [103]:
qs = [0.2, 0.5, 0.9]
for q in qs:
    df = df_degrees[df_degrees["q"] == f"{q}p"]   
    
    fit = q_to_degreefit[q * p]
    
    chart = alt.Chart(df)

    # Create a density plot using transform_density
    density = chart.transform_density(
        density='degree',
        as_=['values', 'density'],  # Output field names
    ).mark_line(  # You can use mark_line() for a line plot
        opacity=0.6,  # Adjust the opacity of the area plot
    ).encode(
        x=alt.X('values:Q'),  # Set the X-axis label
        y=alt.Y('density:Q', title='Density'),  # Set the Y-axis label
    )
    
    
    # Calculate the binomial PDF values for a range of x values
    x_values = range(30)  # Assuming a range from 0 to 10
    binomial_pmf = [stats.binom.pmf(x, fit.params[0], fit.params[1]) for x in x_values]

    # Create a DataFrame for the binomial PDF
    binomial_df = pd.DataFrame({
        'values': x_values,
        'binomial_pmf': binomial_pmf
    })

    # Create a line plot for the binomial PDF
    binomial = alt.Chart(binomial_df).mark_line(color='red').encode(
        x=alt.X('values:Q'),
        y=alt.Y('binomial_pmf:Q'),
    )
    
    chart = density + binomial
    chart.display()

## Distribution of types of edges

In [36]:
alt.Chart(df_sim).mark_bar().encode(
    x=alt.X('q:O', sort=q_frac_order),
    y=alt.Y('count', title="Number of hyperedges"),
    color=alt.Color('type', 
    scale = alt.Scale(domain=['mixed', "pure"], range=['#9ebcda', '#e0ecf4']))
).properties(
    width=800,
    height=300
)

In [37]:
alt.Chart(df_fraction).mark_bar().encode(
    x=alt.X('q:O', sort=q_frac_order),
    y=alt.Y('count'),
    color=alt.Color('fraction0').scale(scheme="lightgreyteal"),
    order=alt.Order(
      # Sort the segments of the bars by this field
      'fraction0',
      sort='descending'
    )
    # scale = alt.Scale(domain=['mixed', "pure"], range=['#9ebcda', '#e0ecf4']))
).properties(
    width=800,
    height=300
)

In [104]:
# Tracer la meme chose avec des histogrammes (+ nouvelle quantité) (pour quelques valuers de q, une proche de 0, de p, et vers 0.4)
# Regarder la fraction moyenne, variance (max(n1, n2) / n1 + n2), a moyenne sur l'ensemble des hyperliens. à tracer sur q

# Calculer le GINI pour ncom = 4 ?

## Distrib effectives

In [105]:
df_peff

Unnamed: 0,peff,p,q,q_frac
0,0.06,0.06,0.06,1.0p
0,0.06,0.06,0.057,0.95p
0,0.059043,0.06,0.054,0.9p
0,0.05703,0.06,0.051,0.85p
0,0.057326,0.06,0.048,0.8p
0,0.055429,0.06,0.045,0.75p
0,0.05339,0.06,0.042,0.7p
0,0.053124,0.06,0.039,0.65p
0,0.051252,0.06,0.036,0.6p
0,0.052189,0.06,0.033,0.55p


In [113]:
points = alt.Chart(df_peff).mark_point().encode(
    x=alt.X('q', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff', title="p_eff"),
).properties(
    width=800,
    height=300
)

In [114]:
regresssion = points.transform_regression('q', 'peff', method="poly").mark_line(color="black")

In [115]:
line = alt.Chart(pd.DataFrame({'y': [p]})).mark_rule().encode(y='y')

In [116]:
# points + regresssion + line
# Pour plusieurs valeur de N
points + line

In [119]:
df_peff["peff / p"] = df_peff["peff"] / df_peff["p"] 
df_peff["q / p"] = df_peff["q"] / df_peff["p"] 

In [122]:
points = alt.Chart(df_peff).mark_point().encode(
    x=alt.X('q / p', sort=q_frac_order),
    # x=alt.X('q', sort=q_frac_order, axis=alt.Axis(labelExpr=q_frac_order)),
    y=alt.Y('peff / p'),
).properties(
    width=800,
    height=300
)

In [124]:
points

In [43]:
# Tracer peff / p, en fonction de q / p

# Tracer Neff / N, en fonction de q / p
# Tracer Eeff / N, en fonction de q / p


# Plotter la distribution sous jacente pour certaines valeurs de q (à la milieu de la distrib, pour le plateau)
# Essayer de visualiser l'erreur de peff et Neff