# Compute graph topology



## Imports 



In [1]:
import sys
import os
from pathlib import Path
import networkx as nx
import numpy as np
import pickle

from joblib import Parallel, delayed
from collections import ChainMap

In [2]:
# Append the parent of the current working directory to the module search
# path before importing; presumably that is where `config` and `dataset`
# live relative to this notebook (verify against the repository layout).
sys.path.append(str(Path(os.path.abspath("")).parent))

import config
from dataset import load_ids
from dataset import _process_problem

In [10]:
# File listing the problem ids to analyse. The commented-out line below is an
# alternative id file that can be swapped in.
#ID_FILE = '../id_files/deepmath.txt'
ID_FILE = '../id_files/train.txt'

ids = load_ids(ID_FILE)

# Debug aid: uncomment to restrict the run to the first ten ids.
#ids = ids[:10] # FIXME
print('Number of problems', len(ids))

Number of problems 22179


In [11]:
ids

['l16_msuhom_1',
 't17_pdiff_3',
 't46_intpro_1',
 't12_circled1',
 't20_zf_model',
 't24_laplace',
 't21_anproj_2',
 't36_equation',
 't52_quaterni',
 't55_intpro_1',
 'l13_euclid_8',
 't25_rlsub_1',
 't82_bvfunc11',
 't80_matrix10',
 't32_fuzzy_1',
 't16_procal_1',
 't13_hilbert3',
 't217_member_1',
 't38_bvfunc14',
 'l78_intpro_1',
 't57_nat_d',
 'l46_modelc_1',
 't12_fvsum_1',
 't2_c0sp2',
 't43_integra9',
 't59_group_11',
 'l7_int_2',
 't6_zf_lang1',
 't129_jordan2c',
 't20_fscirc_1',
 't11_collsp',
 't14_quatern3',
 't36_tsep_1',
 't24_diff_1',
 't24_facirc_1',
 't21_fuzzy_2',
 't32_quaterni',
 'l75_fib_num4',
 't12_taylor_1',
 't17_gate_1',
 't44_setfam_1',
 't14_rusub_2',
 't50_nfcont_1',
 't94_scmfsa_2',
 't12_series_5',
 't1_arytm_1',
 'l153_jordan',
 't29_bvfunc_1',
 'l8_topreal2',
 't40_jgraph_6',
 'l48_waybel_6',
 't17_conlat_1',
 't11_isomichi',
 't54_valued_2',
 't67_funct_2',
 'l8_xreal_0',
 't17_topalg_2',
 't7_cohsp_1',
 't28_substut1',
 'l27_xcmplx_1',
 't44_afvect0'

## Statistics computation function

In [5]:
def compute_stats(i: str) -> dict:
    """Compute topology statistics for the graph of a single problem.

    The problem is loaded with ``_process_problem`` (keeping the argument
    node), its ``edge_index`` is converted into an undirected ``networkx``
    graph, and a set of summary statistics is collected.

    Args:
        i: Problem identifier passed to ``_process_problem``.

    Returns:
        A one-entry dict ``{i: stats}`` where ``stats`` maps statistic names
        to values. ``'Average geodesic'`` and ``'Diameter'`` are only present
        when the graph is connected, because both are undefined otherwise.
    """
    data = _process_problem(i, config.PROBLEM_DIR, remove_argument_node=False)

    # Build an undirected graph from the (source, target) index pairs.
    # NOTE(review): nodes that appear in no edge are never added to G, so the
    # degree statistics below ignore them - confirm this is intended.
    G = nx.Graph()
    sources = np.array(data.edge_index[0].detach())
    targets = np.array(data.edge_index[1].detach())
    G.add_edges_from(zip(sources, targets))

    # Max, min and mean of the node degrees
    degrees = [degree for _, degree in G.degree]
    stats = {
        'Degree max': max(degrees),
        'Degree min': min(degrees),
        'Degree mean': np.mean(degrees),
        # Number of edges and nodes, taken from the raw data object
        'No Edges': len(data.edge_index[0]),
        'No Nodes': len(data.x),
        # Number of axioms
        'No Axioms': len(data.premise_index),
    }

    # Path-length metrics raise on disconnected graphs, so check connectivity
    # once and skip those metrics when the graph is not connected.
    connected = nx.is_connected(G)

    # Average shortest vertex-vertex distance
    if connected:
        stats['Average geodesic'] = nx.average_shortest_path_length(G)

    # Average clustering coefficient
    stats['Cluster coefficient'] = nx.average_clustering(G)

    # Degree correlation
    stats['Degree Correlation'] = nx.degree_pearson_correlation_coefficient(G)

    # Transitivity - fraction of all possible triangles
    stats['Transitivity'] = nx.transitivity(G)

    # Communities - using Louvain method (fixed seed for reproducibility)
    stats['Communities'] = len(nx.community.louvain_communities(G, seed=7))

    # Diameter - only defined for connected graphs
    if connected:
        stats['Diameter'] = nx.diameter(G)

    return {i: stats}

In [6]:
# Leave two cores free for the rest of the system, but always request at least
# one worker: os.cpu_count() may return None, joblib rejects n_jobs=0, and a
# negative n_jobs is interpreted relative to the total core count rather than
# as a worker count.
n_jobs = max(1, (os.cpu_count() or 1) - 2)
result = Parallel(n_jobs=n_jobs)(delayed(compute_stats)(i) for i in ids)
data = dict(ChainMap(*result))  # Merge the list of one-entry dicts into a dict of dicts

In [9]:
#data

## Save as a pickle

In [8]:
# Serialise the per-problem statistics with the highest available pickle protocol.
with open('graph_top_stats.pickle', 'wb') as out_file:
    pickle.dump(
        data,
        out_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )