<a href="https://colab.research.google.com/github/CarlosVargasF/Stage_Liris_Vargas/blob/master/validation_func.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install third-party dependencies (shell commands run through the IPython `!` escape).
!pip install infomap
!pip install wurlitzer
!pip install leidenalg 
# cdlib is installed straight from its GitHub repository; pip's stdout is discarded.
!pip install git+https://github.com/GiulioRossetti/cdlib.git > /dev/null
# The cigram installation below is disabled.
#!pip install cigram 

Collecting infomap
[?25l  Downloading https://files.pythonhosted.org/packages/aa/eb/7033f55100c74385cfbf41762e903adcc0844ab8d2765f556f5d6f9e4d39/infomap-1.1.3.tar.gz (264kB)
[K     |█▎                              | 10kB 17.9MB/s eta 0:00:01[K     |██▌                             | 20kB 1.7MB/s eta 0:00:01[K     |███▊                            | 30kB 2.3MB/s eta 0:00:01[K     |█████                           | 40kB 2.5MB/s eta 0:00:01[K     |██████▏                         | 51kB 2.0MB/s eta 0:00:01[K     |███████▍                        | 61kB 2.3MB/s eta 0:00:01[K     |████████▋                       | 71kB 2.5MB/s eta 0:00:01[K     |██████████                      | 81kB 2.7MB/s eta 0:00:01[K     |███████████▏                    | 92kB 2.9MB/s eta 0:00:01[K     |████████████▍                   | 102kB 2.8MB/s eta 0:00:01[K     |█████████████▋                  | 112kB 2.8MB/s eta 0:00:01[K     |██████████████▉                 | 122kB 2.8MB/s eta 0:00:01[K 

In [0]:
from cdlib import algorithms
#from cdlib import viz
from cdlib import NodeClustering
from cdlib import evaluation
import networkx as nx
#from networkx.generators.community import LFR_benchmark_graph
import time
import pandas as pd
import numpy as np
#from cigram import lfr_benchmark_graph
import matplotlib.pyplot as plt
from urllib.request import urlopen

In [0]:
def _fetch_text_lines(file_url):
  '''
  Download the resource at <file_url> and return its content as a list of
  utf-8 decoded lines. The connection is closed before returning.
  '''
  with urlopen(file_url) as response:
    return response.read().decode('utf-8').splitlines()


def evaluate_and_compare(algos_list, case='all', num_reps=10):
  '''
  Run community detection algorithms over remote LFR benchmark graphs, score
  each result against the ground-truth communities with NMI, and plot the
  average NMI against the mixing parameter (one figure per algorithm, one
  curve per benchmark case).

  Possible values for <case> list       [benchmark cases to evaluate] :
    - 'n1k_small'     [n=1000, minc=10, maxc=50]
    - 'n1k_big'       [n=1000, minc=10, maxc=50] 
    - 'n5k_small'     [n=5000, minc=20, maxc=100]
    - 'n5k_big'       [n=5000, minc=20, maxc=100]
    - 'all'           [perform evaluation over all 4 cases]

    * It is possible to select multiple cases passing them as a list.
    * NOTE(review): the community-size bounds above are kept from the original
      documentation; the small/big variants list identical bounds, which looks
      inconsistent - confirm against the benchmark repository.
  
  Possible values for <algos_list> list       [algorithms to consider] :
    - 'gn'            [girvan_newman]
    - 'greedy'        [greedy_modularity (Clauset et al)]
    - 'louvain'       [louvain (Blondel et al)]
    - 'cfinder'       [kclique]
    - 'mcl'           [markov_clustering]
    - 'infomap'       [infomap]
    - 'em'            [expectation-maximization]
    - 'leiden'        [leiden]

  <num_reps> : number of benchmark realizations downloaded and averaged for
    each mixing parameter value (files numbered 1..num_reps). Must be >= 1.

  Returns a tuple (perfs, cases, algos) where:
    - perfs : dict {case: {algorithm: [average NMI for mu = 0.1, ..., 0.8]}}
    - cases : list of the benchmark cases that were evaluated
    - algos : list of the algorithm names that were evaluated

  Raises ValueError when <case>, <algos_list> or <num_reps> is invalid.
  '''
  
  start_time = time.time()

  #Mixing parameter steps: benchmark files are requested for mu = 0.1, ..., 0.8
  mu_steps = range(1, 9)

  #Url base path
  url = 'https://raw.githubusercontent.com/CarlosVargasF/LFR_benchmarks_for_testing/07c1b075f0e66945a4b586f4b7278e08064c3782/'
  
  #Specific folders for undirected and unweighted benchmarks
  src_undir_unwei = {
    'n1k_small' : 'LFR_n1000_small/',
    'n1k_big'   : 'LFR_n1000_big/',
    'n5k_small' : 'LFR_n5000_small/',
    'n5k_big'   : 'LFR_n5000_big/'
  }

  #Graph type
  src = src_undir_unwei

  #Checking case parameter  
  if case == 'all':
    cases = list(src)
  elif isinstance(case, list) and len(case): 
    for c in case:
      if c not in src:
        raise ValueError('Invalid <case> option. Verify available cases.')
    cases = case    
  else:
    raise ValueError('Invalid <case> input format. Please insert a list of valide options or "all" for use all of them.')

  #Checking num_reps parameter (with no repetition there is nothing to average)
  if num_reps < 1:
    raise ValueError('Invalid <num_reps> value. It must be a positive integer.')

  #Algorithms names list
  algos_names = {
    'gn'      : algorithms.girvan_newman,
    'greedy'  : algorithms.greedy_modularity,
    'louvain' : algorithms.louvain,
    'cfinder' : algorithms.kclique,
    'mcl'     : algorithms.markov_clustering,
    'infomap' : algorithms.infomap,
    'em'      : algorithms.em,
    'leiden'  : algorithms.leiden      
  }
  
  #Checking algos_list parameter
  if isinstance(algos_list, list) and len(algos_list): 
    for a in algos_list:
      if a not in algos_names:
        raise ValueError('Invalid <algos_list> name. Verify available algorithm names.')
    algos = algos_list    
  else:
    raise ValueError('Invalid <algos_list> input format. Please insert a list of valide algorithm names.') 

  perfs = {}
  for case_name in cases:
    base_path = url + src[case_name]  
    lfr_graphs = []
    lfr_comms_nc = []
    #Benchmarks are stored mu-major: all repetitions of mu=0.1, then mu=0.2, ...
    for mu_val in mu_steps:
      for rep in range(1, num_reps + 1): 
        file_id = 'lfr_' + str(mu_val/10) + '_' + str(rep)
        #Each file is fully read and its connection closed before the next
        #request, so no open sockets pile up while downloading
        comm_lines = _fetch_text_lines(base_path + 'community_files/' + file_id + '.cnl')
        edge_lines = _fetch_text_lines(base_path + 'edge_files/' + file_id + '.nse')

        #Construction of the lfr graph
        G = nx.parse_edgelist(edge_lines, nodetype=int, data=False)
        #Construction of the lfr communities (one list of node ids per line)
        C = [list(map(int, line.split())) for line in comm_lines]

        lfr_graphs.append(G)
        #Creation of the ground-truth NodeClustering object
        lfr_comms_nc.append(NodeClustering(C, G, 'Ground_truth'))

    #Application of algorithms over all the benchmarks
    perf_alg = {}
    for alg in algos:
      detect = algos_names[alg]
      calculated_comms = [detect(G) for G in lfr_graphs]
      
      #Performance evaluation (NMI)
      perf = [evaluation.normalized_mutual_information(lfr, calculated) for lfr, calculated in zip(lfr_comms_nc, calculated_comms)]
      scores = pd.DataFrame(perf)['score']
      #Average over consecutive windows of num_reps scores (one window per mu value)
      perf_alg[alg] = [scores.iloc[start:start + num_reps].mean() for start in range(0, len(scores), num_reps)]
    perfs[case_name] = perf_alg
  
  #Plotting NMI vs mixed parameter(mu)
  mu = [step / 10 for step in mu_steps]
  for alg in algos:
    fig, ax = plt.subplots()
    for case_name, perf_alg in perfs.items():
      ax.plot(mu, perf_alg[alg], '-o', label=case_name) 
    ax.set_xlabel('Mixing parameter')
    ax.set_ylabel('Normalized Mutual Information')
    ax.set_title(alg)
    ax.legend(loc='lower left')      
  
  total_time = (time.time() - start_time)
  print("--- Execution time: %d min %d sec ---" % (total_time//60, total_time%60))

  return perfs, cases, algos
  

In [5]:
# Evaluate Infomap with the function defaults (case='all', num_reps=10).
# p holds the returned (perfs, cases, algos) tuple.
p = evaluate_and_compare(['infomap'])

ConnectionResetError: ignored