<a href="https://colab.research.google.com/github/CarlosVargasF/Stage_Liris_Vargas/blob/master/Code/two_func.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install infomap
!pip install wurlitzer
!pip install leidenalg 
!pip install git+https://github.com/GiulioRossetti/cdlib.git > /dev/null
#!pip install cigram 

Collecting infomap
[?25l  Downloading https://files.pythonhosted.org/packages/aa/eb/7033f55100c74385cfbf41762e903adcc0844ab8d2765f556f5d6f9e4d39/infomap-1.1.3.tar.gz (264kB)
[K     |█▎                              | 10kB 19.6MB/s eta 0:00:01[K     |██▌                             | 20kB 1.8MB/s eta 0:00:01[K     |███▊                            | 30kB 2.3MB/s eta 0:00:01[K     |█████                           | 40kB 2.6MB/s eta 0:00:01[K     |██████▏                         | 51kB 2.0MB/s eta 0:00:01[K     |███████▍                        | 61kB 2.3MB/s eta 0:00:01[K     |████████▋                       | 71kB 2.5MB/s eta 0:00:01[K     |██████████                      | 81kB 2.8MB/s eta 0:00:01[K     |███████████▏                    | 92kB 3.0MB/s eta 0:00:01[K     |████████████▍                   | 102kB 2.8MB/s eta 0:00:01[K     |█████████████▋                  | 112kB 2.8MB/s eta 0:00:01[K     |██████████████▉                 | 122kB 2.8MB/s eta 0:00:01[K 

In [3]:
from cdlib import algorithms
#from cdlib import viz
from cdlib import NodeClustering
from cdlib import evaluation
import networkx as nx
#from networkx.generators.community import LFR_benchmark_graph
import time
import pandas as pd
import numpy as np
#from cigram import lfr_benchmark_graph
import matplotlib.pyplot as plt
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

In [14]:
#*********************** INNER FUNC (GENERIC FUNCTION)*************************************
def evaluate(graph_comms_list, algorithm_dict, eval_method_dict, benchmark, case_name=None):
  '''
  Run every algorithm on every graph and score each solution against its ground truth.

  Parameters
  ----------
  graph_comms_list : list or tuple
      Either a non-empty list of (graph, communities) couples, or a tuple whose
      item 0 is an iterable of graphs and whose item 1 is an iterable of the
      matching community lists (paired by position).
  algorithm_dict : dict
      Maps a label to a callable that is invoked as ``callable(graph)``.
  eval_method_dict : dict
      Maps a label to a callable that is invoked as
      ``callable(ground_truth, solution)``; item 0 of its return value is stored
      in the 'Performance' column.
  benchmark :
      Value copied verbatim into the 'Benchmark' column.
  case_name : optional
      Value copied into the 'Case' column; '-' is used when it is omitted or falsy.

  Returns
  -------
  pandas.DataFrame
      Columns: Algorithm, Graph, Performance, Eval_Method, Benchmark, Case,
      Exec_Time, Status. One row with Status 'ok' per (algorithm, graph, method)
      that succeeded. When the algorithm or any evaluation method raises, one
      extra row is added for that (algorithm, graph) with '-' placeholders and
      repr(exception) in Status; rows already produced for that graph are kept.

  Raises
  ------
  ValueError
      If graph_comms_list is neither a non-empty list nor a tuple with at least
      two items.
  '''
  columns = ['Algorithm', 'Graph', 'Performance', 'Eval_Method', 'Benchmark', 'Case', 'Exec_Time', 'Status']

  case = case_name if case_name else '-'

  #Check graph_comms_list parameter
  if isinstance(graph_comms_list, list) and len(graph_comms_list):
    graphs = [gc[0] for gc in graph_comms_list]
    communities = [gc[1] for gc in graph_comms_list]
  elif isinstance(graph_comms_list, tuple) and len(graph_comms_list) >= 2:
    #The original tested an undefined name here, so this branch could never run
    graphs = list(graph_comms_list[0])
    communities = list(graph_comms_list[1])
  else:
    raise ValueError("Please insert a list of couples (graph,commnunities).")

  #Creation of NodeClustering objects (one ground truth per graph)
  ground_truths = [NodeClustering(C, G, 'Ground Truth') for C, G in zip(communities, graphs)]

  #Application of algorithms over all the benchmarks.
  #Rows are collected in a plain list and turned into a DataFrame once at the
  #end instead of concatenating onto an empty frame for every algorithm.
  rows = []
  for alg, run_algorithm in algorithm_dict.items():
    for G, truth in zip(graphs, ground_truths):
      try:
        s_time = time.time()
        solution = run_algorithm(G)
        e_time = time.time() - s_time  #only the algorithm call is timed
        for method, score in eval_method_dict.items():
          perf = score(truth, solution)
          rows.append([alg, G.name, perf[0], method, benchmark, case, e_time, 'ok'])
      except Exception as e:
        #Best effort: record the failure and carry on with the next graph
        rows.append([alg, G.name, '-', '-', benchmark, case, '-', repr(e)])

  return pd.DataFrame(rows, columns=columns)

In [21]:
# *********************************** OUTTER FUNC (STANDARD TEST CASES) *******************************************

def _select_names(requested, available, unknown_msg, format_msg):
  '''Turn a space-separated string or a list into a validated list of names taken from <available>.'''
  if isinstance(requested, str):
    names = requested.split()
  elif isinstance(requested, list):
    names = list(requested)
  else:
    names = []
  #Unsupported types, empty lists and blank strings are all "bad format"
  if not names:
    raise TypeError(format_msg)
  for name in names:
    if name not in available:
      raise ValueError(unknown_msg)
  return names


def _download_zip(zip_url):
  '''Fetch <zip_url> entirely into memory and return it opened as a ZipFile (the caller closes it).'''
  with urlopen(zip_url) as response:
    return ZipFile(BytesIO(response.read()))


def _load_benchmark_case(base_path):
  '''Download 'cnl_files.zip' and 'nse_files.zip' under <base_path>; return a list of (graph, communities) couples.'''
  #Read community files: one list of text lines per archive member
  with _download_zip(base_path + 'cnl_files.zip') as zipfile_c:
    data_comms = [zipfile_c.read(member).decode('utf-8').splitlines() for member in zipfile_c.namelist()]

  #Construction of the lfr graphs
  lfr_graphs = []
  with _download_zip(base_path + 'nse_files.zip') as zipfile_e:
    for member in zipfile_e.namelist():
      with zipfile_e.open(member) as edge_file:
        # NOTE(review): data=False discards every edge attribute and no
        # create_using is given, so all benchmarks -- including
        # 'lfr_undir_wei' and 'lfr_dir_unwei' -- are loaded as unweighted,
        # undirected graphs. Confirm this is intended.
        g = nx.read_edgelist(edge_file, nodetype=int, data=False)
      g.name = member
      lfr_graphs.append(g)

  #Construction of the lfr communities: every line becomes one list of int node ids
  lfr_comms = [[list(map(int, item.split())) for item in comm_file] for comm_file in data_comms]

  #Graphs and community files are paired purely by their position in each
  #archive's namelist(); presumably both archives list members in matching
  #order -- verify against the benchmark repository.
  return list(zip(lfr_graphs, lfr_comms))


def evaluate_and_compare(algos_list, methods_list, benchmark_type, case_list='all'):
  '''
  Download LFR benchmark cases, run the selected CD algorithms on every graph and
  score the results with the selected evaluation methods (through evaluate()).

  <algos_list>, <methods_list> and <case_list> each accept either a list of
  names or a single space-separated string of names. Unknown names raise
  ValueError; an empty selection or an unsupported type raises TypeError.
  Algorithms and methods are always applied in the order of the tables below,
  not in the order given by the caller.

  CD algorithms (CDlib) <algos_list>       [cdlib.algorithms function] :
    - 'gn'            [girvan_newman]
    - 'greedy'        [greedy_modularity (Clauset et al)]
    - 'louvain'       [louvain (Blondel et al)]
    - 'cfinder'       [kclique]
    - 'mcl'           [markov_clustering]
    - 'infomap'       [infomap]
    - 'em'            [em (expectation-maximization)]
    - 'leiden'        [leiden]
    - 'agdl'          [agdl]
    - 'aslpaw'        [aslpaw]
    - 'async_fluid'   [async_fluid]
    - 'cpm'           [cpm]
    - 'der'           [der]
    - 'edmot'         [edmot]
    - 'eigenvector'   [eigenvector]
    - 'gdmp2'         [gdmp2]
    - 'label_prop'    [label_propagation]
    - 'rber_pots'     [rber_pots]
    - 'rb_pots'       [rb_pots]
    - 'scan'          [scan]
    - 'signif_com'    [significance_communities]
    - 'spinglass'     [spinglass]
    - 'surp_com'      [surprise_communities]
    - 'walktrap'      [walktrap]
    - 'sbm_dl'        [sbm_dl]
    - 'sbm_dl_nstd'   [sbm_dl_nested]

  Partition evaluation methods (CDlib) <methods_list>       [cdlib.evaluation function] :
    - 'nmi'           [normalized_mutual_information]
    - 'ami'           [adjusted_mutual_information]
    - 'ari'           [adjusted_rand_index]
    - 'f1'            [f1]
    - 'nf1'           [nf1]
    - 'omega'         [omega]
    - 'onmi_lfk'      [overlapping_normalized_mutual_information_LFK]
    - 'onmi_mgh'      [overlapping_normalized_mutual_information_MGH]
    - 'vi'            [variation_of_information]

  Possible values for <benchmark_type>       [benchmark to evaluate] :
    - 'lfr_undir_unwei'
    - 'lfr_undir_wei'
    - 'lfr_dir_unwei'

  Possible values for <case_list>       [benchmark cases to evaluate] :
    for 'lfr_undir_unwei' and 'lfr_dir_unwei':
      - 'n1k_small', 'n1k_big', 'n5k_small', 'n5k_big'
    for 'lfr_undir_wei':
      - 'n5k_small_mut05', 'n5k_small_mut08', 'n5k_big_mut05', 'n5k_big_mut08'
    for any benchmark:
      - 'all'         [perform evaluation over every case of the selected benchmark]

    * The earlier notes gave n=1000, minc=10, maxc=50 for both 'n1k_*' cases
      and n=5000, minc=20, maxc=100 for both 'n5k_*' cases; what distinguishes
      'small' from 'big' is not recorded here -- TODO confirm.

  Returns
  -------
  pandas.DataFrame
      The frames returned by evaluate() for each case, concatenated, with
      columns Algorithm, Graph, Performance, Eval_Method, Benchmark, Case,
      Exec_Time, Status. The total wall-clock time is also printed.

  Raises
  ------
  TypeError
      <benchmark_type> is not a non-empty string, or a selection argument is
      empty or neither a string nor a list.
  ValueError
      Unknown benchmark, case, algorithm or evaluation method name.
  Download errors raised by urlopen are not caught here.
  '''

  start_time = time.time()

  result_columns = ['Algorithm', 'Graph', 'Performance', 'Eval_Method', 'Benchmark', 'Case', 'Exec_Time', 'Status']

  #CD Algorithms names list (Crisp Communities)
  algos_dict = {
    'gn'          : algorithms.girvan_newman,
    'greedy'      : algorithms.greedy_modularity,
    'louvain'     : algorithms.louvain,
    'cfinder'     : algorithms.kclique,
    'mcl'         : algorithms.markov_clustering,
    'infomap'     : algorithms.infomap,
    'em'          : algorithms.em,
    'leiden'      : algorithms.leiden,

    'agdl'        : algorithms.agdl,
    'aslpaw'      : algorithms.aslpaw,
    'async_fluid' : algorithms.async_fluid,
    'cpm'         : algorithms.cpm,
    'der'         : algorithms.der,
    'edmot'       : algorithms.edmot,
    'eigenvector' : algorithms.eigenvector,
    'gdmp2'       : algorithms.gdmp2,
    'label_prop'  : algorithms.label_propagation,
    'rber_pots'   : algorithms.rber_pots,
    'rb_pots'     : algorithms.rb_pots,
    'scan'        : algorithms.scan,
    'signif_com'  : algorithms.significance_communities,
    'spinglass'   : algorithms.spinglass,
    'surp_com'    : algorithms.surprise_communities,
    'walktrap'    : algorithms.walktrap,
    'sbm_dl'      : algorithms.sbm_dl,
    'sbm_dl_nstd' : algorithms.sbm_dl_nested
  }

  #Partition comparisons scores (CDlib)
  methods_dict = {
    'nmi'      : evaluation.normalized_mutual_information,
    'ami'      : evaluation.adjusted_mutual_information,
    'ari'      : evaluation.adjusted_rand_index,
    'f1'       : evaluation.f1,
    'nf1'      : evaluation.nf1,
    'omega'    : evaluation.omega,
    'onmi_lfk' : evaluation.overlapping_normalized_mutual_information_LFK,
    'onmi_mgh' : evaluation.overlapping_normalized_mutual_information_MGH,
    'vi'       : evaluation.variation_of_information
  }

  #Url base path (pinned to one commit of the benchmark repository)
  url = 'https://github.com/CarlosVargasF/LFR_benchmarks_for_testing/raw/933ec3b9736fcf1225d43606f5a2fb73fbeb9216/'

  #Folders for undirected and unweighted benchmarks
  src_undir_unwei = {
    'n1k_small' : 'lfr_n1000_small/',
    'n1k_big'   : 'lfr_n1000_big/',
    'n5k_small' : 'lfr_n5000_small/',
    'n5k_big'   : 'lfr_n5000_big/'
  }

  #Folders for undirected and weighted benchmarks
  src_undir_wei = {
    'n5k_small_mut05' : 'lfr_n5000_small_mut05/',
    'n5k_small_mut08' : 'lfr_n5000_small_mut08/',
    'n5k_big_mut05'   : 'lfr_n5000_big_mut05/',
    'n5k_big_mut08'   : 'lfr_n5000_big_mut08/'
  }

  #Folders for directed and unweighted benchmarks
  src_dir_unwei = {
    'n1k_small' : 'lfr_n1000_small/',
    'n1k_big'   : 'lfr_n1000_big/',
    'n5k_small' : 'lfr_n5000_small/',
    'n5k_big'   : 'lfr_n5000_big/'
  }

  #Graph type: benchmark name -> (url sub-folder, available cases)
  benchmark_sources = {
    'lfr_undir_unwei' : ('undirected_unweighted/', src_undir_unwei),
    'lfr_undir_wei'   : ('undirected_weighted/', src_undir_wei),
    'lfr_dir_unwei'   : ('directed_unweighted/', src_dir_unwei)
  }
  if isinstance(benchmark_type, str) and len(benchmark_type):
    if benchmark_type not in benchmark_sources:
      raise ValueError('benchmark_type not supported')
    folder, src = benchmark_sources[benchmark_type]
    url = url + folder
  else:
    raise TypeError('Insert a valide benchmark_type as a string')

  #Checking case parameter (string input is now validated too, so an unknown
  #case fails here instead of as a KeyError once the downloads start)
  if isinstance(case_list, str) and case_list == 'all':
    cases = list(src)
  else:
    cases = _select_names(
      case_list, src,
      'Invalid <case> option. Verify available cases for the selected benchmark.',
      'Invalid <case> input format. Please insert a list or a space-separated string of valide options or "all" for use all of them.')

  #Checking algos_list parameter
  wanted_algos = _select_names(
    algos_list, algos_dict,
    'Invalid algos_dict key. Verify available algorithm names.',
    'Invalid <algos_list> input format. Please insert a list or a space-separated string of valide algorithm names.')
  algos = dict((k, v) for k, v in algos_dict.items() if k in wanted_algos)

  #Checking methods_list parameter
  wanted_methods = _select_names(
    methods_list, methods_dict,
    'Invalid methods_dict key. Verify available evaluation method names.',
    'Invalid <methods_list> input format. Please insert a list or a space-separated string of evaluation method names.')
  methods = dict((k, v) for k, v in methods_dict.items() if k in wanted_methods)

  #Generate graphs according to cases
  frames = []
  for case in cases:
    graphs_comms = _load_benchmark_case(url + src[case])

    #----------INNER FUNC CALL--------------------------------------------------

    #Apply selected CD algorithms and evaluate them according to selected methods
    frames.append(evaluate(graphs_comms, algos, methods, benchmark_type, case))

    #----------INNER FUNC CALL--------------------------------------------------

  #Single concatenation at the end instead of growing a frame inside the loop
  if frames:
    results = pd.concat(frames, ignore_index=True)
  else:
    results = pd.DataFrame(columns=result_columns)

  total_time = (time.time() - start_time)
  print("--- Total execution time: %d min %d sec ---" % (total_time//60, total_time%60))

  return results


In [25]:
# Example run: Infomap scored with NMI on benchmark 'lfr_undir_wei', case 'n5k_small_mut08'.
# NOTE(review): the recorded output below shows Performance values around 1e-16
# for every displayed row; evaluate_and_compare reads edge lists with data=False,
# so edge weights are not loaded -- confirm whether that is related.
p=evaluate_and_compare('infomap', 'nmi', 'lfr_undir_wei', 'n5k_small_mut08')
# Bare expression as the last line so the notebook displays the returned DataFrame.
p

--- Total execution time: 1 min 41 sec ---


Unnamed: 0,Algorithm,Graph,Performance,Eval_Method,Benchmark,Case,Exec_Time,Status
0,infomap,lfr_mut0.8_0.1_1.nse,2.402004e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.968308,ok
1,infomap,lfr_mut0.8_0.1_10.nse,3.129524e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.918057,ok
2,infomap,lfr_mut0.8_0.1_2.nse,2.946352e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.970450,ok
3,infomap,lfr_mut0.8_0.1_3.nse,2.904458e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.945808,ok
4,infomap,lfr_mut0.8_0.1_4.nse,2.719752e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.980059,ok
...,...,...,...,...,...,...,...,...
75,infomap,lfr_mut0.8_0.8_5.nse,2.846815e-16,nmi,lfr_undir_wei,n5k_small_mut08,1.793720,ok
76,infomap,lfr_mut0.8_0.8_6.nse,3.403983e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.988506,ok
77,infomap,lfr_mut0.8_0.8_7.nse,3.222626e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.972161,ok
78,infomap,lfr_mut0.8_0.8_8.nse,2.972570e-16,nmi,lfr_undir_wei,n5k_small_mut08,0.981597,ok
