# Exercício Aula 08 - Classificando Modelos de Redes

Autores:

- Gabriel Van Loon
- Mathias Fernandes 

Prof.:  Francisco Aparecido Rodrigues

Universidade de São Paulo, São Carlos, Brasil.

In [17]:
import collections
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import time

from scipy.stats import pearsonr, spearmanr

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from networkx.algorithms import community, centrality
from networkx.algorithms.community import greedy_modularity_communities
from community import community_louvain

from sklearn.decomposition import PCA

In [18]:
def momment_of_degree_distribution(G,m):
    M, N = 0, len(G)
    for i in G.nodes: M = M + G.degree(i)**m
    return M/N

In [19]:
def pre_proccess_network(G, verbose=False):
    if verbose: print("[ ] Pre-Proccessing network...")
    G = G.to_undirected()  # Grafo não direcionado
    G.remove_edges_from(nx.selfloop_edges(G)) # Remover auto-loops
    Gcc = sorted(nx.connected_components(G), key=len, reverse=True)  # Escolhe maior componente
    G = G.subgraph(Gcc[0])
    G = nx.convert_node_labels_to_integers(G, first_label=0)  # Renomeia os vértices
    return G

In [20]:
def load_network(dataset_path, delimiter=","):
    """
    Carrega uma network vinda de um dataset e realiza os seguintes pré-processamentos:
    - Extração do maior componente da rede;
    - Remoção de pesos das arestas e auto-loops;
    - Transformação p/ grafo não direcionado.
    
    Return:
    - G: loaded graph object
    - N: Number of nodes in G
    - k: average degree of the network
    """
    print("[ ] Loading network at", dataset_path, "...")
    G = nx.read_edgelist(dataset_path, nodetype=int, delimiter=delimiter)
    G = pre_proccess_network(G, verbose=True)
    return G, G.order(), (2*G.number_of_edges())/G.order()

### Medidas selecionadas

Para prever qual modelo se adequa mais a rede, escolhemos diversas medidas conhecidas, como conectividade e centralidade. Essa decisão é baseada na recomendação do artigo [1] "However, if there is little
information about the network structure, the best choice is to consider a large set of models
and measurements; otherwise, the classification may not be accurate."


Conectividade: vertex degree, average neighbor connectivity, clustering coefficient, and assortative coefficient.

Centralidade: betweenness centrality, degree centrality, closeness centrality, core number and others.

Posteriormente, as medidas de centralidade foram comantadas, devido ao enorme tempo de execução.

In [21]:
def feature_extraction(G, verbose=False):
    if verbose: print("[ ] Feature Extraction Start...")
    features = []
    
    # Network measures
    if verbose: print("[+]    Network measures...")
    k1, k2 = momment_of_degree_distribution(G,1), momment_of_degree_distribution(G,2)
    features.append(k1)
    features.append(k2)
    features.append(k1 - (k2**2))
    features.append(nx.average_clustering(G))
    features.append(nx.average_shortest_path_length(G))
    features.append(nx.degree_assortativity_coefficient(G))
    
    #features.append(nx.transitivity(G))
    #features.append(nx.diameter(G))
    
    # Centrality Measures (Need to fix)
    if verbose: print("[+]    Network centrality measures...")
    #features.append(np.mean(list(dict(nx.betweenness_centrality(G)).values())))
    #features.append(np.mean(list(dict(nx.degree_centrality(G)).values())))
    #features.append(np.mean(list(dict(nx.closeness_centrality(G)).values())))
    #features.append(np.mean(list(dict(nx.core_number(G)).values())))
    # features.append(np.mean(list(dict(nx.eigenvector_centrality(G, max_iter=100)).values())))
    
    return features

In [22]:
labels = {"ER": 1, "WS": 2, "BA": 3}
def generate_models_features(N, k, qtd=30):
    print("[ ] Generating models...")
    x, y = [], []
    
    # Generate Random Graphs (Erdos-Renyi)
    print("[+]    Erdos-Renyi (Random Graphs)...")
    for i in range(qtd):
        G = nx.gnp_random_graph(N, k/N, seed=None, directed=False)
        G = pre_proccess_network(G)
        x.append(feature_extraction(G))
        y.append(labels["ER"])
    
    # Generate Watts Strogatz Networks
    print("[+]    Watts Strogatz Networks...")
    for i in range(qtd):
        G = nx.watts_strogatz_graph(N, int(k), p=0.1, seed=None)
        G = pre_proccess_network(G)
        x.append(feature_extraction(G))
        y.append(labels["WS"])
        
    # Generate Barabasi Albert Networks
    print("[+]    Barabasi Albert Networks...")
    for i in range(qtd):
        G = nx.barabasi_albert_graph(N, int(k//2))
        G = pre_proccess_network(G)
        x.append(feature_extraction(G))
        y.append(labels["BA"])
        
    return np.array(x), np.array(y, dtype=np.uint8)

In [34]:
def model_predict(x_target, x, y, dim_reduction, neighbors=5):
    print("[ ] Starting classifier and scaler...")
    
    # Starting Scaller and scale 
    scaler = StandardScaler().fit(x)
    x      = scaler.transform(x)
    x_target = scaler.transform(x_target.reshape((1, -1)))
    
    # Apply Dimensionality Reduction
    if dim_reduction:
        pca = PCA(0.95)
        pca.fit(x)
        x = pca.transform(x)
        x_target = pca.transform(x_target)
        
        # Extract the most important variables
        n_pcs= pca.components_.shape[0]
        most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
        initial_feature_names = ['k1','k2','Var k','Avg Cluster','Avg SPL', 'Assortativity Coefficient']
        most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
        
        print("[+]    The importance of each variable for PC")
        for i in range(n_pcs):
            print("[+]    PC{0} - {1} - {2}.".format(i, most_important_names[i], round(pca.explained_variance_ratio_[i], 2)))

            
    # Use K-neighboors to classify x_target
    model = KNeighborsClassifier(n_neighbors=neighbors, metric='euclidean')
    model.fit(x,y)
    
    return model.predict(x_target),  model.predict_proba(x_target)

In [24]:
def pipeline(network_path, dim_reduction=False, delimiter=","):
    start = time.time()
    G, N, k = load_network(network_path, delimiter=delimiter)
    print("[+]    Network Order: ", N)
    print("[+]    Network Avg Degree: ",k)
    x_target = np.array(feature_extraction(G, verbose=True))
    x, y       = generate_models_features(N, k, qtd=30)
    y_pred, y_probs = model_predict(x_target, x, y, dim_reduction, neighbors=5)
    print("[+]    Network Class: ", y_pred)
    print("[+]    Distribution:  ", y_probs)
    print("[ ] Elapsed time:", time.time() - start)
    

In [9]:
pipeline("data_classify/bio_budapest_male_20k/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/bio_budapest_male_20k/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1015
[+]    Network Avg Degree:  105.58817733990148
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [2]
[+]    Distribution:   [[0. 1. 0.]]
[ ] Elapsed time: 352.15787744522095


In [10]:
pipeline("data_classify/bio_interactome/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/bio_interactome/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1458
[+]    Network Avg Degree:  2.672153635116598
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 188.6230330467224


In [11]:
pipeline("data_classify/bio_plant_pol/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/bio_plant_pol/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1882
[+]    Network Avg Degree:  16.210414452709884
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 444.24902963638306


In [12]:
pipeline("data_classify/social_crime/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/social_crime/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1263
[+]    Network Avg Degree:  2.1805225653206652
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 137.09188795089722


In [13]:
pipeline("data_classify/social_netscience/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/social_netscience/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  379
[+]    Network Avg Degree:  4.823218997361478
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [2]
[+]    Distribution:   [[0. 1. 0.]]
[ ] Elapsed time: 17.468687057495117


In [14]:
pipeline("data_classify/social_new_zealand_collab/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/social_new_zealand_collab/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1463
[+]    Network Avg Degree:  5.804511278195489
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 239.94393920898438


In [15]:
pipeline("data_classify/tec_internet_Kdl.csv/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/tec_internet_Kdl.csv/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  754
[+]    Network Avg Degree:  2.374005305039788
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 52.54865550994873


In [16]:
pipeline("data_classify/tec_jung/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/tec_jung/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  6120
[+]    Network Avg Degree:  16.434640522875817
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 4881.798941612244


In [9]:
pipeline("data_classify/tec_powergrid/edges2.csv", delimiter=",")

[ ] Loading network at data_classify/tec_powergrid/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  4941
[+]    Network Avg Degree:  2.66909532483303
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 2848.0157766342163


### Hipótese - Redes de mesma categoria são classificadas pelos mesmos modelos

Re: Como fazemos a comparação da apenas 3 redes de cada tipo e com um conjunto limitado de features não podemos tirar uma conclusão tão forte. Mas das redes utilizadas podemos atestar que utilizando Knn (k=5) as redes não corroboraram a hipótese ao serem classificadas em categorias distintas.

# Classifying Network - Test Sample (Jazz.txt)

In [9]:
pipeline("data/jazz.txt", delimiter=" ")

[ ] Loading network at data/jazz.txt ...
[ ] Pre-Proccessing network...
[+]    Network Order:  198
[+]    Network Avg Degree:  27.696969696969695
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 11.500862836837769


In [35]:
pipeline("data/jazz.txt", dim_reduction=True, delimiter=" ")

[ ] Loading network at data/jazz.txt ...
[ ] Pre-Proccessing network...
[+]    Network Order:  198
[+]    Network Avg Degree:  27.696969696969695
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.68.
[+]    PC1 - k1 - 0.25.
[+]    PC2 - Assortativity Coefficient - 0.05.
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 8.881659507751465


### Dimensionality Reduction

Como diversos métodos de medida são utilizados, talvez surga o problema devido ao alto grau de correlação entre diversas váriaveis, que pode levar a problemas conhecidos como "curse of dimensionality", do inglês, a maldição da dimensinalidade.

Para eliminar isso, foi usado o PCA (Principal Component analysis), que preserve a variância em 0.95%.

In [36]:
pipeline("data_classify/bio_budapest_male_20k/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/bio_budapest_male_20k/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1015
[+]    Network Avg Degree:  105.58817733990148
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.6.
[+]    PC1 - Avg Cluster - 0.24.
[+]    PC2 - Assortativity Coefficient - 0.16.
[+]    Network Class:  [2]
[+]    Distribution:   [[0. 1. 0.]]
[ ] Elapsed time: 297.17249059677124


In [37]:
pipeline("data_classify/bio_interactome/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/bio_interactome/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1458
[+]    Network Avg Degree:  2.672153635116598
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.48.
[+]    PC1 - k1 - 0.41.
[+]    PC2 - Avg Cluster - 0.06.
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 182.46490716934204


In [38]:
pipeline("data_classify/bio_plant_pol/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/bio_plant_pol/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1882
[+]    Network Avg Degree:  16.210414452709884
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.66.
[+]    PC1 - k1 - 0.29.
[+]    PC2 - Assortativity Coefficient - 0.04.
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 715.2855706214905


In [39]:
pipeline("data_classify/social_crime/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/social_crime/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1263
[+]    Network Avg Degree:  2.1805225653206652
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.51.
[+]    PC1 - k1 - 0.35.
[+]    PC2 - Assortativity Coefficient - 0.07.
[+]    PC3 - Avg Cluster - 0.05.
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 251.32137441635132


In [40]:
pipeline("data_classify/social_netscience/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/social_netscience/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  379
[+]    Network Avg Degree:  4.823218997361478
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.65.
[+]    PC1 - k1 - 0.31.
[+]    Network Class:  [2]
[+]    Distribution:   [[0. 1. 0.]]
[ ] Elapsed time: 28.56266474723816


In [41]:
pipeline("data_classify/social_new_zealand_collab/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/social_new_zealand_collab/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  1463
[+]    Network Avg Degree:  5.804511278195489
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - Avg SPL - 0.69.
[+]    PC1 - Assortativity Coefficient - 0.28.
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 448.9275770187378


In [42]:
pipeline("data_classify/tec_internet_Kdl.csv/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/tec_internet_Kdl.csv/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  754
[+]    Network Avg Degree:  2.374005305039788
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.5.
[+]    PC1 - k1 - 0.36.
[+]    PC2 - Assortativity Coefficient - 0.06.
[+]    PC3 - Avg Cluster - 0.06.
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 92.49473714828491


In [43]:
pipeline("data_classify/tec_jung/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/tec_jung/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  6120
[+]    Network Avg Degree:  16.434640522875817
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - k2 - 0.65.
[+]    PC1 - k1 - 0.33.
[+]    Network Class:  [3]
[+]    Distribution:   [[0. 0. 1.]]
[ ] Elapsed time: 7763.542801618576


In [45]:
pipeline("data_classify/tec_powergrid/edges2.csv", dim_reduction=True, delimiter=",")

[ ] Loading network at data_classify/tec_powergrid/edges2.csv ...
[ ] Pre-Proccessing network...
[+]    Network Order:  4941
[+]    Network Avg Degree:  2.66909532483303
[ ] Feature Extraction Start...
[+]    Network measures...
[+]    Network centrality measures...
[ ] Generating models...
[+]    Erdos-Renyi (Random Graphs)...
[+]    Watts Strogatz Networks...
[+]    Barabasi Albert Networks...
[ ] Starting classifier and scaler...
[+]    The importance of each variable for PC
[+]    PC0 - Var k - 0.49.
[+]    PC1 - k1 - 0.41.
[+]    PC2 - Avg Cluster - 0.06.
[+]    PC3 - Assortativity Coefficient - 0.04.
[+]    Network Class:  [1]
[+]    Distribution:   [[1. 0. 0.]]
[ ] Elapsed time: 1776.9076867103577


### Conclusão
A inclusão do PCA melhorou o tempo de execução, porém os resultados obtidos foram os mesmos - redes de mesma categoria não parecem seguir o mesmo modelo. Vale notar que o conjunto de testes usado ainda foi pequena e não foram consideradas medidas de centralidade, o que não confere confiança a essa conclusão.