# Compute graph topology



## Imports 



In [1]:
import sys
import os
from pathlib import Path
import networkx as nx
import numpy as np
import pickle

from joblib import Parallel, delayed
from collections import ChainMap

In [2]:
# Append the parent of the current working directory (os.path.abspath("")
# resolves to the cwd) to the module search path before the imports below.
# NOTE(review): presumably `config` and `dataset` live in that parent
# directory - confirm against the repository layout.
sys.path.append(str(Path(os.path.abspath("")).parent))

import config
from dataset import load_ids
from dataset import _process_problem

In [3]:
# File listing the ids of the problems to analyse. The commented-out line is
# an alternative id file; switch by swapping which assignment is active.
#ID_FILE = '../id_files/deepmath.txt'
ID_FILE = '../id_files/train.txt'

# Load the ids from ID_FILE via the project's load_ids helper.
ids = load_ids(ID_FILE)

# Uncomment the next line to keep only the first 10 ids (e.g. for a trial run).
#ids = ids[:10] # FIXME
print('Number of problems', len(ids))

Number of problems 22179


In [4]:
#ids

## Statistics computation function

In [5]:
def compute_stats(i: str):
    """Compute topology statistics for the graph of one problem.

    The problem is loaded with ``_process_problem`` (argument nodes kept) and
    its ``edge_index`` is turned into an undirected ``networkx`` graph on
    which the statistics are computed.

    Args:
        i: Identifier of the problem, as passed to ``_process_problem``.

    Returns:
        ``{i: stats}`` where ``stats`` maps statistic names to their values,
        or ``{i: -1}`` if loading the problem or computing a statistic raised.
        ``'Average geodesic'`` and ``'Diameter'`` are only defined for
        connected graphs and are omitted from ``stats`` otherwise.
    """
    try:
        data = _process_problem(i, config.PROBLEM_DIR, remove_argument_node=False)

        # Undirected graph built from the (source, target) rows of the edge index.
        G = nx.Graph()
        sources = np.array(data.edge_index[0].detach())
        targets = np.array(data.edge_index[1].detach())
        G.add_edges_from(zip(sources, targets))

        # The path-length statistics raise on a disconnected graph. Check once
        # and skip them, rather than letting the failure discard every other
        # statistic of the problem.
        connected = nx.is_connected(G)

        # Degree statistics, followed by sizes taken directly from the data
        # object ('No Edges' counts edge_index entries, 'No Nodes' counts rows
        # of data.x, 'No Axioms' counts entries of data.premise_index).
        degrees = [degree for _, degree in G.degree]
        stats = {
            'Degree max': max(degrees),
            'Degree min': min(degrees),
            'Degree mean': np.mean(degrees),
            'No Edges': len(data.edge_index[0]),
            'No Nodes': len(data.x),
            'No Axioms': len(data.premise_index),
        }

        # Average shortest vertex-vertex distance
        if connected:
            stats['Average geodesic'] = nx.average_shortest_path_length(G)

        # Average cluster
        stats['Cluster coefficient'] = nx.average_clustering(G)

        # Degree correlation
        stats['Degree Correlation'] = nx.degree_pearson_correlation_coefficient(G)

        # Transitivity - fraction of all possible triangles
        stats['Transitivity'] = nx.transitivity(G)

        # Communities - using Louvain method (fixed seed for reproducibility)
        stats['Communities'] = len(nx.community.louvain_communities(G, seed=7))

        # Diameter
        if connected:
            stats['Diameter'] = nx.diameter(G)

    except Exception:
        # Best effort: a problem that cannot be processed is marked with -1
        # so the remaining problems are still collected.
        return {i: -1}

    return {i: stats}

In [6]:
# Leave two cores free, but always use at least one worker: os.cpu_count()
# may return None, and joblib rejects n_jobs=0 (the result on a 2-core machine).
n_jobs = max(1, (os.cpu_count() or 1) - 2)
result = Parallel(n_jobs=n_jobs)(delayed(compute_stats)(i) for i in ids)

# Convert the list of single-problem dicts into one dict of dicts. Updating in
# reverse order reproduces the key order and first-entry-wins behaviour of
# dict(ChainMap(*result)) without scanning every map for each key lookup.
data = {}
for problem_stats in reversed(result):
    data.update(problem_stats)

In [7]:
#data

## Save as a pickle

In [8]:
# Serialise the collected statistics to disk using the newest pickle protocol
# supported by the running interpreter.
with open('graph_top_stats.pickle', 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)