In [1]:
# Loading of Env Vars to enable parameterized command line usage
import os

# Every setting may be overridden by an environment variable of the same name;
# the second argument is the fallback used when that variable is not set.
GRAPHLET_SIZE = int(os.getenv("GRAPHLET_SIZE", 3))
DATASET = os.getenv("DATASET", "yeastInter_st.txt")
EXPERIMENT_OUT = os.getenv("EXPERIMENT_OUT", "yeastInter_st")
METRIC_NAME = os.getenv("METRIC_NAME", "degree")

In [2]:
from pmotifs.analysis_utilities.metric_consolidation import metrics

# Fail fast when the requested metric is not a key of the imported `metrics`
# mapping, and say which name was rejected and which names are accepted.
potential_metrics = metrics.keys()
assert METRIC_NAME in potential_metrics, (
    f"Unknown METRIC_NAME {METRIC_NAME!r}; expected one of {list(potential_metrics)}"
)

In [3]:
from pmotifs.analysis_utilities.loading import Result
from pmotifs.config import config

# Load the result for the configured dataset, experiment output and graphlet size.
r = Result.load_result(
    config.DATASET_DIRECTORY / DATASET, config.EXPERIMENT_OUT / EXPERIMENT_OUT, GRAPHLET_SIZE
)

# Short aliases for the two parts of the result used by the rest of the notebook.
g, df = r.pmotif_graph, r.positional_metric_df

Loading graphlet metrics: 100%|█████████████████████████████████| 13150/13150 [00:00<00:00, 141551.09it/s]
Loading anchor nodes: 100%|███████████████████████████████████████████| 42/42 [00:00<00:00, 371647.19it/s]
Loading anchor node shortest paths: 100%|██████████████████████████████| 42/42 [00:00<00:00, 54220.00it/s]
Loading graph modules: 100%|██████████████████████████████████████████| 27/27 [00:00<00:00, 268355.94it/s]


In [None]:
# Load the randomized counterparts of `g` for the configured graphlet size.
# Presumably this yields one result object per randomized graph (the next cell
# reads `.pmotif_graph` / `.positional_metric_df` from each element) - TODO confirm.
# NOTE(review): `supress_tqdm` is spelled as the call site has it; it presumably
# matches the library's keyword name - confirm before "fixing" the spelling.
randomized_results = Result.load_randomized_results(g, GRAPHLET_SIZE, supress_tqdm=True)

Loading Randomized Results: 100%|█████████████████████████████████████| 1000/1000 [00:37<00:00, 26.44it/s]


In [None]:
# Re-key the loaded results as {randomized graph: its positional metric frame}.
# This rebinds `randomized_results` from the loaded collection to the dict, so
# the guard keeps the cell safe to re-run: iterating the dict a second time
# would yield its keys (the graphs) instead of result objects.
if not isinstance(randomized_results, dict):
    randomized_results = {
        result.pmotif_graph: result.positional_metric_df
        for result in randomized_results
    }

# Analysis

In [None]:
from pmotifs.graphlet_representation import GRAPHLET_CLASS_NAME_LOOKUP, get_graphlet_size_from_class

def all_graphlet_classes_of_size(graphlet_size: int):
    """Return the graphlet classes from the lookup table that have the given size.

    The classes are returned as a list, in the iteration order of
    ``GRAPHLET_CLASS_NAME_LOOKUP``.
    """
    matching_classes = []
    for candidate in GRAPHLET_CLASS_NAME_LOOKUP.keys():
        if get_graphlet_size_from_class(candidate) != graphlet_size:
            continue
        matching_classes.append(candidate)
    return matching_classes
# Display the graphlet classes available for the configured size (cell output).
all_graphlet_classes_of_size(GRAPHLET_SIZE)

In [None]:
from collections import defaultdict

# Collect frequencies per class
def to_graphlet_class_frequency(result_df):
    """Count how often each graphlet class occurs in ``result_df``.

    The graphlet size is derived from the class of the first row, and every
    class of that size is present in the result; classes that do not occur in
    ``result_df`` are reported with a frequency of 0.

    Args:
        result_df: DataFrame with a ``graphlet_class`` column and a ``nodes``
            column; the non-null ``nodes`` entries are counted per class.

    Returns:
        A plain ``dict`` mapping graphlet class to its frequency.

    Raises:
        ValueError: If ``result_df`` has no rows, because the graphlet size
            cannot be derived from it.
    """
    if len(result_df) == 0:
        raise ValueError("Cannot derive the graphlet size from an empty result frame")

    # Positional access: `[0]` would be a *label* lookup and fails on frames
    # whose index does not contain the label 0 (e.g. after filtering).
    first_class = result_df["graphlet_class"].iloc[0]
    graphlet_size = get_graphlet_size_from_class(first_class)

    # Start every class of this size at zero so absent classes still show up.
    all_frequencies = dict.fromkeys(all_graphlet_classes_of_size(graphlet_size), 0)
    observed_frequencies = result_df.groupby("graphlet_class").agg("count")["nodes"]

    return all_frequencies | dict(observed_frequencies)

# Display the class frequencies of the original graph's frame (cell output).
to_graphlet_class_frequency(df)

In [None]:
from tqdm import tqdm
import pandas as pd
# Build a frame with one row of graphlet class frequencies per random graph:
#
#   random graph num | graphlet class 1 | graphlet class 2
#   -----------------+------------------+-----------------
#   1                | 15               | 500
#   2                | 600              | 600
#   ...
data = [
    to_graphlet_class_frequency(random_df)
    for random_df in tqdm(randomized_results.values())
]
random_frequencies = pd.DataFrame(data)

In [None]:
from statistics import mean, stdev
from typing import List

def get_zscore(point: float, values: List[float]):
    """Return the z-score of ``point`` relative to the sample ``values``.

    The z-score is ``(point - mean(values)) / stdev(values)``, using the
    sample standard deviation.

    Args:
        point: The observation to score.
        values: The reference sample; any iterable of numbers (it is
            materialised once, so one-shot iterables are fine).

    Returns:
        The z-score as a float. When the sample has zero spread the score is
        undefined: ``nan`` is returned if ``point`` equals the sample mean,
        otherwise ``inf`` / ``-inf`` depending on the side ``point`` lies on.

    Raises:
        statistics.StatisticsError: If ``values`` holds fewer than two items.
    """
    # Work on plain floats: `statistics` coerces its result back to the input
    # element type, which can truncate the mean for NumPy integer scalars
    # (what iterating an integer pandas column yields).
    samples = [float(value) for value in values]
    center = mean(samples)
    spread = stdev(samples)
    deviation = float(point) - center

    if spread == 0:
        # All samples are identical; avoid a ZeroDivisionError and follow
        # IEEE-754 division semantics instead.
        if deviation == 0:
            return float("nan")
        return float("inf") if deviation > 0 else float("-inf")

    return deviation / spread

In [None]:
import matplotlib.pyplot as plt

# One histogram per graphlet class: the frequency distribution over the random
# graphs, with the original graph's frequency marked as a vertical line.
graphlet_classes = all_graphlet_classes_of_size(GRAPHLET_SIZE)

# squeeze=False always returns a 2-D array of axes; without it a single
# graphlet class would yield a bare Axes object and indexing it would fail.
fig, axes_grid = plt.subplots(1, len(graphlet_classes), figsize=(10, 5), squeeze=False)
axes = axes_grid[0]  # the only row

original_frequencies = to_graphlet_class_frequency(df)

for i, graphlet_class in enumerate(graphlet_classes):
    ax = axes[i]

    original_value = original_frequencies[graphlet_class]
    distribution = random_frequencies[graphlet_class]
    z_score = get_zscore(original_value, distribution)

    distribution.plot.hist(ax=ax, label="Expected Distribution")

    ax.axvline(original_value, color="tab:orange", label=f"Original (zscore={round(z_score, 2)})")
    ax.set_title(GRAPHLET_CLASS_NAME_LOOKUP[graphlet_class])
    ax.legend(loc="upper right")