In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.linalg as la
import pylab as pl

# Function definitions for global scale graph properties

In [2]:
def get_num_vertices(adj_mat):
    """Return the vertex count, taken as the size of the matrix's first axis."""
    num_rows = adj_mat.shape[0]
    return num_rows

def get_num_edges(adj_mat):
    """Count the nonzero entries strictly above the diagonal of adj_mat.

    Only the strict upper triangle is inspected, so every vertex pair is
    looked at once and diagonal entries are ignored.
    """
    size = get_num_vertices(adj_mat)
    rows, cols = np.triu_indices(size, 1)
    strict_upper = adj_mat[rows, cols]
    return np.count_nonzero(strict_upper)

def compute_graph_volume(adj_mat):
    """Return half of the sum of all entries of adj_mat, as a float."""
    entry_total = float(np.sum(adj_mat))
    return entry_total / 2.0

def compute_graph_density(adj_mat):
    """Return the edge density 2m / (n * (n - 1)) of the graph.

    n comes from get_num_vertices and m from get_num_edges.

    Args:
        adj_mat: adjacency matrix of the graph.

    Returns:
        The density as a float. Graphs with fewer than two vertices have no
        vertex pairs, so the formula's denominator would be zero; 0.0 is
        returned for them instead of raising ZeroDivisionError.
    """
    n = get_num_vertices(adj_mat)
    if n < 2:
        return 0.0

    m = get_num_edges(adj_mat)
    return float(2 * m) / float(n * (n - 1))

# Function definitions for local scale graph properties

In [3]:
def reject_outliers(data, m=2):
    """Keep the entries of data lying strictly within m standard deviations
    of the mean of data."""
    center = np.mean(data)
    cutoff = m * np.std(data)
    keep_mask = abs(data - center) < cutoff
    return data[keep_mask]

def compute_degree_distribution(adj_mat):
    """Return the sums of adj_mat taken along axis 0.

    For a simple graph, possibly with weighted edges, these sums are taken
    to be the degrees of the vertices.
    """
    return np.sum(adj_mat, axis=0)

def compute_median_degree(adj_mat):
    """Return the median of the values from compute_degree_distribution."""
    return np.median(compute_degree_distribution(adj_mat))

def compute_max_degree(adj_mat):
    """Return the largest vertex degree in the graph.

    Degrees are taken from compute_degree_distribution, the same source the
    median and average degree helpers use. Taking np.max of adj_mat directly
    would yield the largest matrix entry rather than the largest degree.

    Args:
        adj_mat: adjacency matrix of the graph.

    Returns:
        The maximum of the degree array.
    """
    deg_arr = compute_degree_distribution(adj_mat)
    return np.max(deg_arr)

def compute_unweighted_degree_distribution(adj_mat):
    """Return per-vertex degrees with every positive entry counted as 1.

    Args:
        adj_mat: adjacency matrix of the graph. It is left unmodified.

    Returns:
        The sums along axis 0 of a copy of adj_mat in which every entry
        greater than zero has been replaced by 1.
    """
    # Binarize a copy: assigning into adj_mat itself would overwrite the
    # caller's matrix entries.
    binary_adj = adj_mat.copy()
    binary_adj[binary_adj > 0] = 1
    return np.sum(binary_adj, axis=0)

def compute_average_degree(adj_mat):
    """Return the mean of the values from compute_degree_distribution."""
    return np.mean(compute_degree_distribution(adj_mat))

def display_degree_distribution(adj_mat, dataset_name, override_min=0, num_bins_log_log=50):
    """Plot the degree distribution with automatic and with log-spaced bins.

    Draws two stacked histograms of the output of
    compute_degree_distribution and shows the figure with plt.show().

    Args:
        adj_mat: adjacency matrix of the graph.
        dataset_name: label inserted into both subplot titles.
        override_min: only degrees strictly greater than this value are
            considered when choosing the lower edge of the log-spaced bins.
        num_bins_log_log: number of points requested from np.logspace, i.e.
            the number of bin edges of the second histogram.
    """
    deg_arr = compute_degree_distribution(adj_mat)

    # Display histogram of degree distribution
    plt.subplot(2, 1, 1)
    plt.hist(deg_arr, bins='auto')
    plt.title('Deg Dist of {}'.format(dataset_name))

    # Display histogram of degree distribution with log-spaced bins.
    # The bin range is derived from the degrees being plotted; the range of
    # the raw matrix entries does not describe the range of their sums.
    plt.subplot(2, 1, 2)
    max_val = np.amax(deg_arr)
    min_val = np.amin(deg_arr[deg_arr > override_min])
    # Echo the smallest degree used for the lower bin edge. The call form
    # works as both a Python 2 statement and a Python 3 function call.
    print(min_val)
    # Round the range outwards to whole powers of ten.
    # NOTE(review): the purpose of the +0.1 offset is not evident from this
    # function; it is kept unchanged -- confirm whether it is still wanted.
    min_exponent = np.floor(np.log10(min_val + 0.1))
    max_exponent = np.ceil(np.log10(max_val))
    plt.hist(deg_arr, bins=np.logspace(min_exponent, max_exponent, num_bins_log_log))
    plt.gca().set_xscale('log')
    plt.title('Log-Log Deg Dist of {}'.format(dataset_name))

    plt.show()
    
def get_neighborhood_set_of_node(adj_mat, node):
    """Return, in increasing order, the list of indices j for which
    adj_mat[node][j] is nonzero."""
    num_nodes = get_num_vertices(adj_mat)

    # Each nonzero entry in the node's row is an edge leaving the node.
    return [col for col in range(0, num_nodes) if adj_mat[node][col] != 0]
    
def compute_clustering_coefficient(adj_mat, node):
    """Return the local clustering coefficient of one node.

    The coefficient is the number of edges found among the node's neighbors
    divided by k * (k - 1) / 2, where k is the number of neighbors.

    Args:
        adj_mat: adjacency matrix of the graph. It is not modified.
        node: index of the node to evaluate.

    Returns:
        0 when the node has fewer than two neighbors (the coefficient is
        undefined there), otherwise the coefficient as a float.
    """
    # Get the neighborhood set of the node. Its size is used as the node's
    # unweighted degree, so the degree always matches the neighbor set and
    # no pass over the whole matrix is needed for a single node.
    nbrhd = get_neighborhood_set_of_node(adj_mat, node)
    node_deg = len(nbrhd)

    if node_deg <= 1:
        return 0

    # Get reduced neighborhood matrix of node. Indexing with a list yields
    # a new array, so adj_mat itself is left untouched.
    nbrhd_matrix = adj_mat[:, nbrhd]
    nbrhd_matrix = nbrhd_matrix[nbrhd, :]

    # Each edge between two neighbors closes a triangle through the node and
    # shows up as two nonzero entries of the reduced matrix.
    num_triangles = np.count_nonzero(nbrhd_matrix) / 2.0

    # Compute the clustering coefficient
    clustering_coefficient = float(2 * num_triangles) / float(node_deg * (node_deg - 1))
    return clustering_coefficient
    
def compute_all_clustering_coefficients(adj_mat):
    """Return an (n, 1) float array holding, in row i, the result of
    compute_clustering_coefficient for node i."""
    num_nodes = get_num_vertices(adj_mat)

    # Evaluate the nodes in index order, then stack the values as a column.
    per_node = [compute_clustering_coefficient(adj_mat, idx)
                for idx in range(0, num_nodes)]
    return np.array(per_node, dtype=float).reshape((num_nodes, 1))

def display_clustering_coefficient_dist(adj_mat, dataset_name, num_bins_log_log=50):
    """Plot the clustering coefficients with automatic and log-spaced bins.

    Draws two stacked histograms of the output of
    compute_all_clustering_coefficients and shows the figure with
    plt.show().

    Args:
        adj_mat: adjacency matrix of the graph.
        dataset_name: label inserted into both subplot titles.
        num_bins_log_log: number of points requested from np.logspace, i.e.
            the number of bin edges of the second histogram.
    """
    cluster_coeff_arr = compute_all_clustering_coefficients(adj_mat)

    # Display histogram of clustering coefficients
    plt.subplot(2, 1, 1)
    plt.hist(cluster_coeff_arr, bins='auto')
    plt.title('CC Dist of {}'.format(dataset_name))

    # Display histogram of clustering coefficients with log-spaced bins
    plt.subplot(2, 1, 2)
    max_val = np.amax(cluster_coeff_arr)
    min_val = np.amin(cluster_coeff_arr)
    # log10(0) is -inf, so finite log-spaced bin edges only exist when at
    # least one coefficient is positive; otherwise the panel stays empty.
    if max_val > 0:
        # NOTE(review): the +0.1 offset keeps log10 finite when the minimum
        # is 0, but for non-negative data it also means the lowest bin edge
        # is never below 0.1 -- confirm this is intended.
        min_exponent = np.floor(np.log10(min_val + 0.1))
        max_exponent = np.ceil(np.log10(max_val))
        # Plot the coefficients themselves; no degree array is defined in
        # this function.
        plt.hist(cluster_coeff_arr,
                 bins=np.logspace(min_exponent, max_exponent, num_bins_log_log))
    plt.gca().set_xscale('log')
    plt.title('Log-Log CC Dist of {}'.format(dataset_name))

    plt.show()

def compute_average_clustering_coefficient(adj_mat):
    """Return the sum of all per-node clustering coefficients divided by the
    number of vertices."""
    vertex_count = float(get_num_vertices(adj_mat))
    coeff_total = float(np.sum(compute_all_clustering_coefficients(adj_mat)))
    return coeff_total / vertex_count
   