In [1]:
import networkx as nx
import powerlaw
import numpy as np
import pandas as pd
import multiprocessing as mp
import random

import sys
sys.path.append('..')
import network_utils as ne

In [2]:
# Single seed shared by every stochastic step in this notebook.
SEED = 42

# Seed both global generators: NumPy's legacy global RNG and the
# standard-library `random` module.
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Number of decimal places kept when rounding reported statistics.
ROUND_DIG = 4

# Open the store read-only: the default mode ("a") requires write access and
# silently creates an empty file when the path is wrong.
with pd.HDFStore('../data/gene_network_data.h5', mode='r') as store:
    tec = store['TEC']

# Element-wise absolute values of the 'TEC' table as a plain NumPy array.
# np.abs already allocates a new array, so `tec` itself is left untouched.
np_tec_abs = np.abs(tec.to_numpy())

In [4]:
def generate_synthetic_ks(degree_sequence, ntail, xmin) -> list[int]:
    """
    Generate a synthetic degree sequence for scale-free analysis
    --------------------------
    Each of the len(degree_sequence) synthetic degrees is drawn, with
    probability ntail / n, from the power law fitted to `degree_sequence`
    at the given `xmin`; otherwise it is drawn uniformly from the observed
    degrees that lie below `xmin`.

    All randomness comes from NumPy's global RNG via `np.random.rand` and
    `np.random.choice` (and presumably the powerlaw sampler as well - TODO
    confirm), so seed it with `np.random.seed` for reproducible output.

    Args:
        degree_sequence (list[int]): Array of node degrees.
        ntail (int): Number of nodes with degrees greater than or equal to k*_min.
        xmin (int): k*_min.
    Returns:
        synthetic_seq (list[int]): Synthetic degree sequence. Elements are
            whatever the samplers return (NumPy scalars rather than Python ints).
    """
    fit = powerlaw.Fit(degree_sequence, xmin=xmin, discrete=True, verbose=False)
    n = len(degree_sequence)

    # Loop invariants, hoisted: the tail probability and the empirical pool.
    # Converting the pool to an array once avoids np.random.choice rebuilding
    # an array from a Python list on every draw (which made the loop O(n^2)).
    tail_prob = ntail / n if n else 0.0
    empirical_pool = np.asarray([deg for deg in degree_sequence if deg < xmin])
    draw_from_tail = fit.power_law.generate_random

    synthetic_seq = []
    for _ in range(n):
        if np.random.rand() < tail_prob:  # sample from fitted power law
            synthetic_seq.append(draw_from_tail(1)[0])
        else:  # sample from empirical distribution below k*_min
            synthetic_seq.append(np.random.choice(empirical_pool))
    return synthetic_seq

In [5]:

def _seeded_synthetic_ks(seed, degree_sequence, ntail, xmin):
    """Reseed the calling process's global NumPy RNG, then draw one synthetic sequence."""
    np.random.seed(seed)
    return generate_synthetic_ks(degree_sequence, ntail, xmin)


def goodness_of_fit_multiprocess(degree_sequence, fit, num_synthetic=1000) -> float:
    """
    Test the goodness of fit of power law distribution
    --------------------------
    Draws `num_synthetic` synthetic degree sequences in a process pool, fits
    each one to its own power law, and reports the fraction whose KS distance
    D is at least as large as the empirical fit's D.

    Args:
        degree_sequence (list[int]): Empirical degree distribution.
        fit: powerlaw package fit of degree_sequence.
        num_synthetic (int): Number of synthetic distributions used for testing.
    Returns:
        p_value (float): p_value for goodness of fit (a NumPy float scalar).
    Raises:
        ValueError: If num_synthetic is smaller than 1.
    """
    if num_synthetic < 1:
        raise ValueError("num_synthetic must be a positive integer")

    xmin = fit.xmin
    ntail = sum(deg >= xmin for deg in degree_sequence)  # number of nodes with degree at or above xmin

    # Forked workers start with an identical copy of the parent's global NumPy
    # RNG state, so without reseeding different workers emit the same
    # "random" sequences. Draw one seed per task in the parent instead: the
    # samples become independent and stay reproducible under np.random.seed,
    # regardless of how tasks are scheduled onto workers.
    task_seeds = np.random.randint(0, 2**32, size=num_synthetic, dtype=np.int64).tolist()
    pool_args = [(seed, degree_sequence, ntail, xmin) for seed in task_seeds]

    # use multiprocessing for faster speed; "fork" lets workers see functions
    # defined interactively in the notebook (not available on Windows)
    with mp.get_context("fork").Pool(processes=mp.cpu_count()) as pool:
        synthetic_samples = pool.starmap(_seeded_synthetic_ks, pool_args)

    # fit each synthetic sample to its own power law and collect its KS distance
    D_synthetic = np.array(
        [powerlaw.Fit(sample, discrete=True, verbose=False).D for sample in synthetic_samples],
        dtype=float,
    )

    # fraction of synthetic distributions with a KS D statistic at least as bad
    # as the empirical one; comparing an array (not a list) works for any scalar type
    p_value = np.mean(D_synthetic >= fit.D)
    return p_value

In [6]:
thresholds = [0.9, 0.85]
#! Running takes multiple hours to finish
# thresholds = [0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]

# Number of synthetic sequences per goodness-of-fit test.
NUM_SYNTHETIC = 1000
# Candidate distributions compared against the power law.
alternatives = ['exponential', 'lognormal_positive', 'truncated_power_law']

results = []
for thresh in thresholds:
    # Build the thresholded network for this cut-off.
    curr_al = ne.threshold_weighted_adjacency_list(np_tec_abs, thresh)
    curr_graph = ne.construct_network(curr_al, "TEC", tec.columns)

    # Power-law fit of the empirical degree sequence and its Monte-Carlo p-value.
    tec_degrees = [deg for _, deg in curr_graph.degree()]
    tec_fit = powerlaw.Fit(tec_degrees, discrete=True, verbose=False)
    pfit = float(goodness_of_fit_multiprocess(tec_degrees, tec_fit, NUM_SYNTHETIC))

    # Barabasi-Albert reference graph with the same node count and a comparable
    # edge density. barabasi_albert_graph rejects m < 1, so clamp for networks
    # that have fewer edges than nodes.
    n = nx.number_of_nodes(curr_graph)
    ba_m = max(1, nx.number_of_edges(curr_graph) // n)
    G_barabasi_albert = nx.barabasi_albert_graph(n, ba_m, seed=SEED)

    ba_degrees = [deg for _, deg in G_barabasi_albert.degree()]
    ba_fit = powerlaw.Fit(ba_degrees, discrete=True, verbose=False)
    ba_pfit = float(goodness_of_fit_multiprocess(ba_degrees, ba_fit, NUM_SYNTHETIC))

    curr_result = [
        thresh,
        round(tec_fit.xmin, ROUND_DIG),
        round(tec_fit.alpha, ROUND_DIG),
        round(tec_fit.D, ROUND_DIG),
        round(pfit, ROUND_DIG),
        round(ba_pfit, ROUND_DIG),
    ]

    # Likelihood-ratio comparison (R, p) of the power law against each alternative.
    # A power law is a special case of the truncated power law, so only that
    # comparison is a nested one.
    for alternative in alternatives:
        curr_r, curr_p = tec_fit.distribution_compare(
            'power_law', alternative, nested=(alternative == 'truncated_power_law')
        )
        curr_result.append((curr_r, curr_p))
    results.append(curr_result)

xmin progress: 97%

Assuming nested distributions


xmin progress: 97%

Assuming nested distributions


In [7]:
# One row per threshold; the last three columns hold the (R, p) tuples from the
# power-law-vs-alternative comparisons, in the order they were appended.
result_columns = [
    'Threshold',
    'k_min',
    'alpha',
    'D statistic',
    'p_fit',
    'BA p_fit',
    'exponential',
    'lognormal_positive',
    'truncated_power_law',
]
result_df = pd.DataFrame(results, columns=result_columns)
result_df.head()

Unnamed: 0,Threshold,k_min,alpha,D statistic,p_fit,BA p_fit,exponential,lognormal_positive,truncated_power_law
0,0.9,1.0,1.4011,0.1448,0.0,0.605,"(49.89887919185541, 0.0035566803765326345)","(-12.106117160038437, 3.7085812827541936e-05)","(-24.399915000442277, 2.8346214264729497e-12)"
1,0.85,1.0,1.5094,0.0775,0.0,0.93,"(381.17397437571276, 1.88940594226276e-48)","(-11.443810416206889, 0.022920229058544135)","(-29.403556729171598, 1.7430501486614958e-14)"
