In [1]:
from pathlib import Path

# Base locations: where the input edge lists live and where results are written.
DATASET_BASE_PATH = Path("/home/timgarrels/masterthesis/datasets/")
OUT_BASE_PATH = Path("/home/timgarrels/masterthesis/output/cluster_calc/")

# Registry of graphs available for analysis.
# Maps a candidate name to the path of its edge list and the output base path.
ANALYSIS_CANDIDATES = dict(
    yeast=dict(
        edgelist=DATASET_BASE_PATH.joinpath("yeastInter_st.txt"),
        outpath=OUT_BASE_PATH,
    ),
)

In [2]:
# Which entry of ANALYSIS_CANDIDATES to analyse, and the graphlet size to load.
CANDIDATE = "yeast"
GRAPHLET_SIZE = 4

# Resolve the selected candidate's configuration once, then unpack its paths.
_selected_candidate = ANALYSIS_CANDIDATES[CANDIDATE]
GRAPH_EDGELIST = _selected_candidate["edgelist"]
OUT = _selected_candidate["outpath"]

In [4]:
# Retrieve Graphlets and their metrics
import pickle

from pmotifs.PMotifGraph import PMotifGraph


# Handle for the selected graph, built from its edge list path and the output base path.
# All subsequent load_* calls in this notebook go through this object.
g = PMotifGraph(GRAPH_EDGELIST, OUT)

In [5]:
# Presumably the graphlet positions/occurrences of size GRAPHLET_SIZE read from a
# zipped file — TODO confirm against PMotifGraph.load_graphlet_pos_zip.
# NOTE(review): `graphlet_pos` is not referenced by any of the cells visible here.
graphlet_pos = g.load_graphlet_pos_zip(GRAPHLET_SIZE)

In [None]:
# Load the positional data for graphlets of size GRAPHLET_SIZE.
# Per the recorded progress output this loads anchor nodes, graph modules and the
# per-graphlet metrics; the last step iterates ~16.7M entries for this dataset,
# so expect this cell to take on the order of minutes.
graphlet_metrics = g.load_positional_data(GRAPHLET_SIZE)

Loading Anchor Nodes: 100%|████████████████████████████████| 42/42 [00:00<00:00, 29007.21it/s]
Loading Graph Modules: 100%|███████████████████████████████| 27/27 [00:00<00:00, 42799.02it/s]
Loading Graphlet Metrics:  35%|█████▌          | 5819165/16694356 [00:39<00:54, 198330.95it/s]

In [None]:
# A frequency-based analysis is not meaningful at local scope: all occurrences of a
# graphlet contribute to only a single score. The frequencies are loaded and
# displayed here for reference only.
graphlet_frequencies = g.load_graphlet_freq_file(GRAPHLET_SIZE)
graphlet_frequencies

In [None]:
# Distinct graphlet classes found among the keys of the loaded metrics.
graphlet_classes = set(
    occurrence.graphlet_class
    for occurrence in graphlet_metrics.graphlet_metrics.keys()
)
graphlet_classes

# Analysis

In [None]:
from typing import Union

from pmotifs.GraphletPositionalMetrics import GraphletPositionalMetrics


def get_pos_metric_name():
    """Return the human-readable name of the positional metric being analysed."""
    metric_name = "degree"
    return metric_name

def get_positional_metric(pm: GraphletPositionalMetrics) -> Union[int, float]:
    """Reduce a graphlet's positional metrics to the single value under analysis.

    This wrapper is the one place where a metric would be consolidated into a
    scalar. For this example the motif degree is used and needs no
    consolidation, so it is returned unchanged.
    """
    degree = pm.degree
    return degree

In [None]:
def reduce_graphlet_metrics_to_class(graphlet_metrics, graphlet_class):
    """Keep only the entries of `graphlet_metrics` that belong to `graphlet_class`.

    Args:
        graphlet_metrics: Mapping whose keys expose a `graphlet_class` attribute.
        graphlet_class: The class to select; compared to each key's
            `graphlet_class` with `==`.

    Returns:
        A list of `(key, value)` tuples, in the mapping's iteration order.
    """
    matching_entries = []
    for occurrence, metrics in graphlet_metrics.items():
        if occurrence.graphlet_class == graphlet_class:
            matching_entries.append((occurrence, metrics))
    return matching_entries

In [None]:
def reduce_graphlet_metrics_to_pos_metric(graphlet_metrics, metric_callback):
    """Apply `metric_callback` to every metrics value in `graphlet_metrics`.

    Args:
        graphlet_metrics: A mapping, or an iterable of `(key, value)` pairs;
            it is passed through `dict(...)`, so for repeated keys only the
            last value is kept.
        metric_callback: Callable invoked once per value.

    Returns:
        A list with the callback's result for each value, in dict order.
    """
    metrics_by_occurrence = dict(graphlet_metrics)
    reduced = []
    for positional_metrics in metrics_by_occurrence.values():
        reduced.append(metric_callback(positional_metrics))
    return reduced

In [None]:
import matplotlib.pyplot as plt

from pmotifs.graphlet_representation import graphlet_class_to_name

# One histogram panel per graphlet class, laid out side by side in a single row.
# squeeze=False keeps `axes` a 2-D array even when there is only one class; with
# the default, plt.subplots returns a bare Axes object and indexing it fails.
fig, axes = plt.subplots(
    1,
    len(graphlet_classes),
    figsize=(len(graphlet_classes) * 5, 5),
    squeeze=False,
)

for i, graphlet_class in enumerate(graphlet_classes):
    ax = axes[0, i]

    # Restrict the loaded metrics to the occurrences of this class ...
    relevant_graphlet_occurrences = reduce_graphlet_metrics_to_class(
        graphlet_metrics.graphlet_metrics,
        graphlet_class,
    )
    # ... and reduce each occurrence to the single positional metric value.
    metric = reduce_graphlet_metrics_to_pos_metric(
        relevant_graphlet_occurrences,
        get_positional_metric,
    )

    # TODO: Quantiles (placeholder, nothing is computed yet)

    ax.hist(metric, label=graphlet_class_to_name(graphlet_class))
    ax.legend()
    ax.set_xlabel(get_pos_metric_name())
    ax.set_ylabel("Frequency")

TODO: Which statistical tests would be appropriate here to determine the prevalence of one graphlet class over another? Or is the plot sufficient?