# Compute graph topology



## Imports 



In [1]:
import sys
import os
from pathlib import Path
import networkx as nx
import numpy as np
import pickle

from joblib import Parallel, delayed
from collections import ChainMap

In [2]:
# Make the parent of the working directory importable so that the
# project-level modules imported below can be found.
sys.path.append(str(Path.cwd().parent))

import config
from dataset import load_ids
from dataset import _process_problem

In [10]:
# File listing the problem ids to analyse
# (alternative id file: '../id_files/deepmath.txt').
ID_FILE = '../id_files/train.txt'

# FIXME: only the first five ids are kept; drop the slice to process them all.
ids = load_ids(ID_FILE)[:5]
print('Number of problems', len(ids))

Number of problems 5


In [11]:
# Display the problem ids selected above.
ids

['l16_msuhom_1', 't17_pdiff_3', 't46_intpro_1', 't12_circled1', 't20_zf_model']

## Statistics computation function

In [12]:
def compute_stats(i: str) -> dict:
    """Compute topology statistics for the graph of a single problem.

    The problem is loaded with ``_process_problem`` (keeping the argument
    node, ``remove_argument_node=False``) and its ``edge_index`` is turned
    into an undirected ``networkx.Graph``. All graph metrics are computed on
    that undirected graph.

    Args:
        i: Problem id. It is passed to ``_process_problem`` and used as the
            single key of the returned dictionary.

    Returns:
        ``{i: stats}`` where ``stats`` maps each metric name to its value.
        Wrapping the stats under the problem id lets the per-problem results
        be merged into one dict afterwards.

    Note:
        ``max``/``min`` fail on a graph without edges, and (per the NetworkX
        documentation) ``average_shortest_path_length`` and ``diameter``
        raise for a disconnected graph. None of these cases is handled here.
    """
    data = _process_problem(i, config.PROBLEM_DIR, remove_argument_node=False)

    # Build an undirected graph from the (source, target) rows of edge_index.
    sources = np.array(data.edge_index[0].detach())
    targets = np.array(data.edge_index[1].detach())
    G = nx.Graph()
    G.add_edges_from(zip(sources, targets))

    degrees = [degree for _, degree in G.degree]

    stats = {
        # Max, min and mean of the node degrees
        'Degree max': max(degrees),
        'Degree min': min(degrees),
        'Degree mean': np.mean(degrees),
        # Sizes are read from the loaded data, not from G: 'No Edges' is the
        # number of edge_index entries and 'No Nodes' the length of data.x.
        'No Edges': len(data.edge_index[0]),
        'No Nodes': len(data.x),
        # Number of axioms
        'No Axioms': len(data.premise_index),
        # Average shortest vertex-vertex distance
        'Average geodesic': nx.average_shortest_path_length(G),
        # Average clustering coefficient (computed once; the result of an
        # earlier duplicate call was discarded)
        'Cluster coefficient': nx.average_clustering(G),
        # Degree correlation
        'Degree Correlation': nx.degree_pearson_correlation_coefficient(G),
        # Transitivity - fraction of all possible triangles
        'Transitivity': nx.transitivity(G),
        # Communities - using Louvain method (fixed seed for reproducibility)
        'Communities': len(nx.community.louvain_communities(G, seed=7)),
        # Diameter
        'Diameter': nx.diameter(G),
    }

    return {i: stats}

In [13]:
# Compute the statistics of every problem using three parallel jobs.
result = Parallel(n_jobs=3)(delayed(compute_stats)(i) for i in ids)
# Convert the list of single-entry dicts into one dict of dicts. Walking the
# list backwards reproduces ChainMap's key order (last result first).
data = {
    problem_id: stats
    for entry in reversed(result)
    for problem_id, stats in entry.items()
}

In [14]:
# Display the merged per-problem statistics.
data

{'t20_zf_model': {'Degree max': 71,
  'Degree min': 1,
  'Degree mean': 3.465818759936407,
  'No Edges': 2180,
  'No Nodes': 1258,
  'No Axioms': 24,
  'Average geodesic': 6.701373421715974,
  'Cluster coefficient': 0.13773184949655556,
  'Degree Correlation': -0.05518756174486314,
  'Transitivity': 0.0743801652892562,
  'Communities': 19,
  'Diameter': 16},
 't12_circled1': {'Degree max': 15,
  'Degree min': 1,
  'Degree mean': 3.112871287128713,
  'No Edges': 786,
  'No Nodes': 505,
  'No Axioms': 10,
  'Average geodesic': 6.913107024988213,
  'Cluster coefficient': 0.0675247524752475,
  'Degree Correlation': -0.17764842759165253,
  'Transitivity': 0.052564102564102565,
  'Communities': 15,
  'Diameter': 16},
 't46_intpro_1': {'Degree max': 40,
  'Degree min': 1,
  'Degree mean': 3.4409005628517826,
  'No Edges': 917,
  'No Nodes': 533,
  'No Axioms': 14,
  'Average geodesic': 5.43899617712198,
  'Cluster coefficient': 0.17592245153220742,
  'Degree Correlation': -0.06748509857500148

## Save as a pickle

In [8]:
# Serialise the collected statistics to disk in binary mode.
with open('graph_top_stats.pickle', 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)