In [1]:
import sys
import cudf
import cugraph
from cugraph import Graph, connected_components
from numba import cuda, jit, int32
import numpy as np 
import cupy as cp
from math import ceil, floor, log2
import seaborn as sns
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
from cugraph.generators import multi_rmat, rmat

# Memory budget for one batch matrix: 18 GiB expressed in decimal gigabytes
# (about 19.33), the unit compute_bounds() uses for its size estimate.
GPU_MEM_LIMIT = 18 * 1024 ** 3 / 1e9



In [None]:

class Analysis:
    """Bundle a cugraph ``Graph`` with a store of named, precomputed metrics.

    Only the names listed in ``graph_properties`` can be stored.
    """

    graph_properties = [
        'nodes_number',
        'edges_number',
        'avg_clustering_coefficient',
        'degree_distribution_tot',
        'degree_distribution_in',
        'degree_distribution_out'
    ]

    def __init__(self, direction=False):
        """Create an empty graph; ``direction`` is forwarded as ``directed``."""
        self.graph = Graph(directed=direction)
        # Name-mangled store holding every extra value attached to the graph.
        self.__properties = {}

    def new_property(self, **kwargs):
        """Store one or more metrics.

        Raises ``KeyError`` (before anything is stored) for the first keyword
        that is not listed in ``graph_properties``.
        """
        rejected = [name for name in kwargs if name not in self.graph_properties]
        if rejected:
            raise KeyError(f'Got an unexpected key "{rejected[0]}"')

        self.__properties.update(kwargs)

    def get_graph(self):
        """Return the wrapped graph object."""
        return self.graph

    def get_property(self, key):
        """Return the stored value for ``key``; exit via ``sys.exit`` if absent."""
        if key in self.__properties:
            return self.__properties[key]
        sys.exit(f'Invalid key {key}')

    def get_properties(self):
        """Return the internal property dictionary itself (not a copy)."""
        return self.__properties

In [2]:
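# CUDA kernels defined in this cell compute:
# - Degree distribution
# - Adjacency lists (directed and undirected)
# - Edges between neighbors (directed and undirected)
# - Clustering-coefficient accumulation, random adjacency rows, and source-id alignment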


@cuda.jit
def compute_degree_distribution(n, count, percentage):
    """
    One thread per entry: percentage[i] = count[i] / n * 100.

    - n                 number of vertices
    - count             input array (how many vertices have each degree)
    - percentage        output array, same length as count
    """
    idx = cuda.grid(1)

    # Threads past the end of the input have nothing to do.
    if idx >= count.size:
        return

    percentage[idx] = (count[idx] / n) * 100

@cuda.jit
def adj_list(nodes, src, dst, undirected, out):
    """
    Fill one row of ``out`` with the neighbours of one vertex.

    - nodes             vertex ids handled by this launch (one thread each)
    - src, dst          edge list endpoints, same length
    - undirected        when true, edges are also followed from dst to src
    - out               2D output, row ``tid`` receives the neighbours of
                        ``nodes[tid]``; unused slots keep whatever the caller
                        pre-filled (the caller in this file uses -1)

    Writes are bounded by the shape of ``out``: neighbours that do not fit in
    a row are dropped instead of being written past the end of the row.
    """
    tid = cuda.grid(1)

    # Extra threads, or vertices without a row in ``out``, do nothing.
    if tid >= len(nodes) or tid >= out.shape[0]:
        return

    capacity = out.shape[1]
    u_node = nodes[tid]
    pos = 0
    for j in range(len(src)):
        # Row is full: stop scanning instead of overrunning it.
        if pos >= capacity:
            break
        if u_node == src[j]:
            out[tid, pos] = dst[j]
            pos += 1
        # Re-check capacity: the branch above may have used the last slot.
        if undirected and u_node == dst[j] and pos < capacity:
            out[tid, pos] = src[j]
            pos += 1

@cuda.jit
def reciprocal_count(A, nodes, src, dst, M, N, out):
    """
    For every filled cell (row, column) of the M x N matrix ``A``, count the
    edges that go from the stored neighbour back to ``nodes[row]`` and add
    them atomically to ``out[row]``. Cells holding -1 are skipped.
    """
    column, row = cuda.grid(2)

    # Outside the matrix: nothing to inspect.
    if row >= M or column >= N:
        return

    v_node = A[row, column]
    if v_node == -1:
        return

    u_node = nodes[row]
    for j in range(len(src)):
        if src[j] == v_node and dst[j] == u_node:
            cuda.atomic.add(out, row, 1)


@cuda.jit
def find_uv_edges(A, src, dst, M, N, out):
    """
    Count edges between the neighbours listed in each row of ``A``.

    - A                 M x N matrix of neighbour ids, -1 marks an empty slot
    - src, dst          edge list endpoints
    - M, N              number of rows / columns of ``A`` to process
    - out               per-row counter; ``out[row]`` is increased by the
                        number of edges u -> v found for every ordered pair of
                        distinct neighbours (u, v) stored in that row

    Each thread owns one cell (row, column) and sums its matches locally, so
    only one atomic update per thread reaches ``out``.
    """
    column, row = cuda.grid(2)

    if row >= M or column >= N:
        return

    u_node = A[row, column]
    if u_node == -1:
        return

    found = 0
    for j in range(N):
        v_node = A[row, j]
        if v_node != -1 and u_node != v_node:
            found += explore_edges(u_node, v_node, src, dst)

    cuda.atomic.add(out, row, found)


@cuda.jit(device=True)
def explore_edges(u, v, src, dst):
    """Device helper: number of entries k with src[k] == u and dst[k] == v."""
    matches = 0
    for k in range(src.size):
        if src[k] == u and dst[k] == v:
            matches += 1

    return matches


@cuda.jit
def lcc(nodes, edges, df_degree, undirected, lcc_array):
    """
    Add the local clustering coefficient of every vertex to ``lcc_array[0]``.

    - nodes             vertices of the batch (only its length is used)
    - edges             edges[tid]: edges found between the neighbours of vertex tid
    - df_degree         df_degree[tid]: degree of vertex tid
    - undirected        doubles the ratio when true
    - lcc_array         one-element accumulator updated atomically

    A vertex contributes edges / (deg * (deg - 1)), times 2 when
    ``undirected``. Vertices with no neighbour edges, or with a degree of at
    most 1 (zero denominator), contribute 0 so they cannot push inf/NaN into
    the sum.
    """
    tid = cuda.grid(1)
    if tid >= len(nodes):
        return

    coefficient = 0.0
    links = edges[tid]
    if links > 0:
        node_deg = df_degree[tid]
        pairs = node_deg * (node_deg - 1)
        # pairs == 0 for degree 0 or 1: the ratio is undefined, keep 0.
        if pairs > 0:
            coefficient = links / pairs
            if undirected:
                coefficient = 2 * coefficient

    cuda.atomic.add(lcc_array, 0, coefficient)

@cuda.jit
def gnp_erdos_renyi(p, rng_states, M, N, matrix):
    """
    One thread per row: draw a uniform float32 for each of the N columns and
    set matrix[row, col] to 1 whenever the draw is <= p. Other cells are left
    untouched. Each thread uses its own entry of ``rng_states``.
    """
    row = cuda.grid(1)
    if row >= M:
        return

    for col in range(N):
        if xoroshiro128p_uniform_float32(rng_states, row) <= p:
            matrix[row, col] = 1


@cuda.jit
def align(src, const):
    """Add ``const`` to every element of ``src`` in place, one thread per element."""
    idx = cuda.grid(1)
    if idx >= len(src):
        return
    src[idx] += const
                

In [None]:

# Same value as the definition in the import cell: 18 GiB expressed in
# decimal gigabytes, compared by compute_bounds() against a batch size in GB.
GPU_MEM_LIMIT = 18 * 1024 ** 3 / 1e9

def load_data(filepath):
    """Read a header-less ``src,dst,wt`` CSV into a cuDF edge list.

    Rows repeating an earlier (src, dst) pair are dropped first, then any row
    that contains a missing value.
    """
    raw_edges = cudf.read_csv(
        filepath,
        delimiter=',',
        names=['src', 'dst', 'wt'],
        dtype=['int32', 'int32', 'float64'],
    )

    unique_edges = raw_edges.drop_duplicates(subset=['src', 'dst'])
    return unique_edges.dropna(axis=0, how='any')

def build_graph(
    graph,
    edges,
    source='src',
    destination='dst',
    edge_attr='wt',
    renumber=True,
    store_transposed=False
) -> "Graph":
    """Load an edge list into ``graph`` and return the same graph object.

    - graph             object exposing ``from_cudf_edgelist``
    - edges             edge list dataframe
    - source            name of the source column
    - destination       name of the destination column
    - edge_attr         name of the weight column, or None for no weights
    - renumber          forwarded to ``from_cudf_edgelist``
    - store_transposed  forwarded to ``from_cudf_edgelist``

    Every option is forwarded by keyword: the positional order of
    ``from_cudf_edgelist`` parameters is not the same in every cuGraph
    release, so positional forwarding can bind ``renumber`` to another
    parameter.
    """
    graph.from_cudf_edgelist(
        edges,
        source=source,
        destination=destination,
        edge_attr=edge_attr,
        renumber=renumber,
        store_transposed=store_transposed,
    )

    print('Building graph completed')
    return graph

def view_edgelist(graph):
    """Return whatever ``graph.view_edge_list()`` returns."""
    return graph.view_edge_list()

    
def nodes(graph):
    """Return the graph's vertex ids, sorted ascending, via ``to_cupy()``."""
    vertex_ids = graph.nodes()
    ordered_ids = vertex_ids.sort_values(ascending=True)
    return ordered_ids.to_cupy()

def number_of_vertices(graph):
    """Return ``graph.number_of_vertices()`` and log that it was computed."""
    vertex_count = graph.number_of_vertices()
    print('Number of nodes calculated')
    return vertex_count

def number_of_edges(graph):
    """Return ``graph.number_of_edges()`` and log that it was computed."""
    edge_count = graph.number_of_edges()
    print('Number of edges calculated')
    return edge_count

def degree(graph, mode='tot'):
    """Return the per-vertex degree dataframe, sorted by vertex.

    - graph        object exposing ``degree()``, ``in_degree()`` and ``out_degree()``
    - mode         'tot' (total degree), 'in' or 'out'

    Raises ``ValueError`` for any other mode. The mode is matched exactly:
    a substring test such as ``mode in 'tot'`` would also accept 'o', 't' or
    '' and silently return the total degree.
    """
    getters = {'tot': 'degree', 'in': 'in_degree', 'out': 'out_degree'}
    if mode not in getters:
        raise ValueError(
            f'Unknown degree mode "{mode}"; expected one of {sorted(getters)}'
        )

    df = getattr(graph, getters[mode])()
    df = df.sort_values(by='vertex', ignore_index=True)
    print(f'"{mode}" degree calculated')
    return df


def degree_distribution(n, df, mode='tot') -> cudf.DataFrame:
    """
    Build the degree distribution of a graph.

    - n            number of vertices, used as the denominator of the percentage
    - df           cudf dataframe with a 'degree' column (one row per vertex)
    - mode         label ('tot', 'in' or 'out') used only in the log message;
                   it does not affect the computation

    Returns a dataframe with one row per distinct degree value and the columns
    'degree', 'count' (vertices having that degree) and 'percentage'
    (count / n * 100, written by the compute_degree_distribution kernel).
    """
    
    # value_counts() yields the distinct degrees (index) and their frequencies.
    degree_series = df['degree'].value_counts()
    df_distribution = cudf.DataFrame({'degree': degree_series.index.to_cupy(),
                                      'count': degree_series.to_cupy(),
                                      'percentage': 0.0})

    # One GPU thread per distinct degree value.
    # NOTE(review): the kernel writes through df_distribution['percentage'];
    # this presumes the returned Series shares device memory with the
    # dataframe column - confirm for the cuDF version in use.
    size = len(df_distribution)
    compute_degree_distribution.forall(size)(n, df_distribution['count'],
                                                  df_distribution['percentage'])
    
    print(f'"{mode}" degree distribution calculated')
    return df_distribution


def build_main_weakly_connected_component_edges(graph, edges) -> cudf.DataFrame:
    """Return the rows of ``edges`` whose source lies in the largest weak component.

    The largest component is the most frequent label reported by
    ``connected_components(graph, connection='weak')``.
    """
    components = connected_components(graph, connection='weak')
    labels = components['labels']
    dominant_label = labels.mode()[0]
    member_vertices = components[labels == dominant_label]['vertex']

    source_in_component = edges['src'].isin(member_vertices)
    component_edges = edges.loc[source_in_component]

    print("Main component's edges calculated")
    return component_edges




def compute_bounds(x, y, bytes_, limit=None) -> int:
    """Halve ``x`` until an ``x * y`` matrix fits the memory budget.

    - x            number of rows wanted
    - y            number of columns
    - bytes_       size of one element in bytes
    - limit        budget in decimal gigabytes; defaults to GPU_MEM_LIMIT

    Returns the largest value in the sequence x, x // 2, x // 4, ... whose
    matrix size (x * y * bytes_ / 1e9 GB) does not exceed the limit. The
    result never drops below 1 for a positive ``x``: when even a single row
    is over budget, 1 is returned so callers dividing by the batch size do
    not hit a division by zero.
    """
    if limit is None:
        limit = GPU_MEM_LIMIT

    while x > 1 and (x * y * bytes_) / 1e9 > limit:
        # Integer halving; avoids the float round-trip of int(x / 2).
        x = int(x) // 2
    return x


def init_cc(n, batch_size, iteration, mod):
    """
    Compute the vertex range processed in one epoch.

    - n                 number of vertices of the graph
    - batch_size        range of nodes examined each epoch
    - iteration         current epoch (starting at 1)
    - mod               the margin of n / batch_size

    Returns (start, stop, M): the half-open range [start, stop) and its
    length M, which is ``batch_size`` while a full batch still fits in ``n``
    and ``mod`` otherwise. The range is also printed.
    """
    start = 0 + batch_size * (iteration - 1)
    full_batch_fits = batch_size * iteration <= n
    M = batch_size if full_batch_fits else mod
    stop = start + M

    print(start, stop)
    return start, stop, M
        
    
def avg_clustering_coefficient(
    n,
    src,
    dst,
    df_degree,
    N,
    nodes_cp=None,
    undirected=False

) -> float:
    """Average clustering coefficient, computed on the GPU in vertex batches.

    - n                 number of vertices
    - src, dst          edge list endpoints (device arrays / cudf series)
    - df_degree         degree of every vertex, in the same order as the
                        vertices are processed (position k belongs to the
                        k-th vertex)
    - N                 width of the per-batch neighbour matrix, i.e. the
                        maximum number of neighbours kept per vertex
    - nodes_cp          optional array of vertex ids; when omitted the ids
                        are assumed to be 0 .. n-1
    - undirected        forwarded to the adj_list and lcc kernels

    Vertices are processed in batches whose neighbour matrix fits the budget
    enforced by compute_bounds(). Each batch receives the matching slice of
    ``df_degree``, and the reported time is the sum over all batches.
    Returns 0.0 when there are no vertices.
    """
    local_ccs = cp.zeros((1,), dtype='float32')
    elapsed_ms = 0.0

    # With no vertices or no neighbour slots there is nothing to launch.
    if n > 0 and N > 0:
        # Positional device view of the degrees, so each batch can be sliced.
        degrees = cp.asarray(df_degree)

        # Guard against a zero batch size (a single row over the budget).
        batch_size = max(1, compute_bounds(n, N, cp.dtype(cp.int32).itemsize))
        epochs = ceil(n / batch_size)
        leftovers = n % batch_size

        threadsperblock = 1024
        threadsperblock_2D = (32, 32)

        for i in range(1, epochs + 1):
            start, stop, rows = init_cc(n, batch_size, i, leftovers)
            if nodes_cp is not None:
                batch_nodes = nodes_cp[start:stop]
            else:
                batch_nodes = cp.arange(start, stop, 1)

            # -1 marks an empty neighbour slot for find_uv_edges.
            matrix = cp.empty((rows, N), dtype='int32')
            matrix.fill(-1)
            edgespernode = cp.zeros(rows, dtype='int32')

            blockspergrid = (rows + (threadsperblock - 1)) // threadsperblock
            blockspergrid_x = (N + (threadsperblock_2D[1] - 1)) // threadsperblock_2D[1]
            blockspergrid_y = (rows + (threadsperblock_2D[0] - 1)) // threadsperblock_2D[0]
            # NOTE(review): rows are mapped to the grid's y dimension, which
            # CUDA limits to 65535 blocks - very large batches may fail to
            # launch; confirm against the batch sizes used in practice.
            blockspergrid_2D = (blockspergrid_x, blockspergrid_y)

            start_ev = cuda.event()
            stop_ev = cuda.event()

            start_ev.record()
            adj_list[blockspergrid, threadsperblock](batch_nodes, src, dst, undirected, matrix)
            find_uv_edges[blockspergrid_2D, threadsperblock_2D](matrix, src, dst, rows, N, edgespernode)
            # The lcc kernel indexes degrees by batch-local thread id, so it
            # must receive the slice belonging to this batch.
            lcc[blockspergrid, threadsperblock](batch_nodes, edgespernode, degrees[start:stop],
                                                undirected, local_ccs)
            stop_ev.record()
            cuda.synchronize()
            elapsed_ms += cuda.event_elapsed_time(start_ev, stop_ev)

    elapsed_t = (elapsed_ms / 1000) / 60
    result = local_ccs[0].get() / n if n > 0 else 0.0
    print(f'Average clustering coefficient calculcated. {result}')
    print('Elapsed time: %.6f minutes' % elapsed_t)
    return result


def random_graph_generator(n, edges) -> cudf.DataFrame:
    """Generate the edge list of a directed G(n, p) random graph.

    - n            number of vertices
    - edges        target number of edges; p = edges / (n * (n - 1))

    The n x n adjacency matrix is produced in row batches that fit the budget
    enforced by compute_bounds(). Each batch is turned into an edge list whose
    source ids are shifted by the batch's first row, then appended to the
    result. Every batch uses its own RNG seed (42 for the first batch, then
    43, 44, ...), so batches do not repeat the same random rows while the
    output stays reproducible.
    """
    print(f'Generating random graph with ER model...')
    L = Graph(directed=True)
    df = cudf.DataFrame({'src': None, 'dst': None})
    p = edges / (n * (n - 1))
    N = n
    # Guard against a zero batch size (a single row over the budget).
    batch_size = max(1, compute_bounds(n, N, cp.dtype(cp.int32).itemsize))
    epochs = ceil(n / batch_size)
    leftovers = n % batch_size
    threadsperblock = 1024

    for epoch in range(1, epochs + 1):
        L.clear()
        start, stop, rows = init_cc(n, batch_size, epoch, leftovers)
        matrix = cp.zeros((rows, N), dtype='int32')
        blockspergrid = (rows + (threadsperblock - 1)) // threadsperblock
        rng_states = create_xoroshiro128p_states(threadsperblock * blockspergrid,
                                                 seed=42 + (epoch - 1))
        gnp_erdos_renyi[blockspergrid, threadsperblock](p, rng_states, rows, N, matrix)

        # Move the batch to the host for Graph.from_numpy_array and release
        # the device copy as early as possible.
        a_matrix = cp.asnumpy(matrix)
        del matrix
        L.from_numpy_array(a_matrix)
        del a_matrix

        df_l = L.view_edge_list()
        df_l.pop('weights')
        # Row indices are batch-local: shift every source id of this batch by
        # the batch's first row.
        df_l['src'] = df_l['src'] + start
        df = cudf.concat([df, df_l], ignore_index=True)

    # Removes the all-null placeholder row the frame was initialised with.
    df.dropna(inplace=True)

    print(f'Random graph\'s edges list calculated')
    return df


In [None]:
# Edge list read by load_data() in the cells that follow.
input_file = '../graphs/ethereum/2020-01-01_2020-01-01/network.csv'

In [None]:
# --- Full directed graph --------------------------------------------------
df_edgelist = load_data(input_file)
analysis_ = Analysis(direction=True)
G = analysis_.get_graph()
build_graph(G, df_edgelist, renumber=False)

# Size of the graph.
vertex_number_ = number_of_vertices(G)
edges_number_ = number_of_edges(G)

# Per-vertex degrees.
df_total_deg = degree(G, mode='tot')
df_in_deg = degree(G, mode='in')
df_out_deg = degree(G, mode='out')

# Degree distributions.
df_total_dist = degree_distribution(vertex_number_, df_total_deg, mode='tot')
df_in_dist = degree_distribution(vertex_number_, df_in_deg, mode='in')
df_out_dist = degree_distribution(vertex_number_, df_out_deg, mode='out')

# Optional plots (disabled).
#totals = sns.displot(df_total_dist.to_pandas(), x='degree', kde=True)
#outs = sns.displot(df_out_dist.to_pandas(), x='degree', kde=True)
#ins = sns.displot(df_in_dist.to_pandas(), x='degree', kde=True)

# Average clustering coefficient; the neighbour matrix is as wide as the
# largest out-degree.
N = df_out_deg['degree'].max()
avg_cc = avg_clustering_coefficient(
    n=vertex_number_,
    src=df_edgelist['src'],
    dst=df_edgelist['dst'],
    df_degree=df_out_deg['degree'],
    N=N,
    undirected=False,
)


In [None]:
# --- Main weakly connected component of the directed graph ----------------
df_edgelist_mc = build_main_weakly_connected_component_edges(G, df_edgelist)
analysis_mc = Analysis(direction=True)
G = analysis_mc.get_graph()
build_graph(G, df_edgelist_mc, renumber=True)

# Size of the component.
vertex_number_mc = number_of_vertices(G)
edges_number_mc = number_of_edges(G)

# Per-vertex degrees.
df_total_deg_mc = degree(G, mode='tot')
df_in_deg_mc = degree(G, mode='in')
df_out_deg_mc = degree(G, mode='out')

# Degree distributions.
df_total_dist_mc = degree_distribution(vertex_number_mc, df_total_deg_mc, mode='tot')
df_in_dist_mc = degree_distribution(vertex_number_mc, df_in_deg_mc, mode='in')
df_out_dist_mc = degree_distribution(vertex_number_mc, df_out_deg_mc, mode='out')

# Optional plots (disabled).
#totals = sns.displot(df_total_dist_mc.to_pandas(), x='degree', kde=True)
#outs = sns.displot(df_out_dist_mc.to_pandas(), x='degree', kde=True)
#ins = sns.displot(df_in_dist_mc.to_pandas(), x='degree', kde=True)

# Average clustering coefficient over the component's own vertex ids.
vertices_mc = nodes(G)
N = df_out_deg_mc['degree'].max()
# NOTE(review): undirected=True makes adj_list follow edges in both
# directions, while the matrix width N and the degrees passed here are
# out-degrees of a directed graph. Either undirected=False or total-degree
# sizing looks intended - confirm.
avg_cc_mc = avg_clustering_coefficient(
    n=vertex_number_mc,
    src=df_edgelist_mc['src'],
    dst=df_edgelist_mc['dst'],
    df_degree=df_out_deg_mc['degree'],
    N=N,
    nodes_cp=vertices_mc,
    undirected=True,
)

In [None]:
# Clear the graph currently bound to G (the main-component graph when the
# cells run in order) before the random-graph cells rebind G.
G.clear()

In [None]:
# --- Random graph with the same number of vertices and edges --------------
df_edgelist_rnd = random_graph_generator(vertex_number_, edges_number_)
analysis_rnd = Analysis(direction=True)
G = analysis_rnd.get_graph()
build_graph(G, df_edgelist_rnd, edge_attr=None, renumber=False)

# Size of the graph.
vertex_number_rnd = number_of_vertices(G)
edges_number_rnd = number_of_edges(G)

# Per-vertex degrees.
df_total_deg_rnd = degree(G, mode='tot')
df_in_deg_rnd = degree(G, mode='in')
df_out_deg_rnd = degree(G, mode='out')

# Degree distributions.
df_total_dist_rnd = degree_distribution(vertex_number_rnd, df_total_deg_rnd, mode='tot')
df_in_dist_rnd = degree_distribution(vertex_number_rnd, df_in_deg_rnd, mode='in')
df_out_dist_rnd = degree_distribution(vertex_number_rnd, df_out_deg_rnd, mode='out')

# Optional plots (disabled).
#totals = sns.displot(df_total_dist_rnd.to_pandas(), x='degree', kde=True)
#outs = sns.displot(df_out_dist_rnd.to_pandas(), x='degree', kde=True)
#ins = sns.displot(df_in_dist_rnd.to_pandas(), x='degree', kde=True)

# Average clustering coefficient; the neighbour matrix is as wide as the
# largest out-degree.
N = df_out_deg_rnd['degree'].max()
avg_cc_rnd = avg_clustering_coefficient(
    n=vertex_number_rnd,
    src=df_edgelist_rnd['src'],
    dst=df_edgelist_rnd['dst'],
    df_degree=df_out_deg_rnd['degree'],
    N=N,
    undirected=False,
)


In [None]:
# --- Main weakly connected component of the random graph ------------------
df_edgelist_rnd_mc = build_main_weakly_connected_component_edges(G, df_edgelist_rnd)
analysis_rnd_mc = Analysis(direction=True)
G = analysis_rnd_mc.get_graph()
build_graph(G, df_edgelist_rnd_mc, edge_attr=None, renumber=True)

# Size of the component.
vertex_number_rnd_mc = number_of_vertices(G)
edges_number_rnd_mc = number_of_edges(G)

# Per-vertex degrees.
df_total_deg_rnd_mc = degree(G, mode='tot')
df_in_deg_rnd_mc = degree(G, mode='in')
df_out_deg_rnd_mc = degree(G, mode='out')

# Degree distributions.
df_total_dist_rnd_mc = degree_distribution(vertex_number_rnd_mc, df_total_deg_rnd_mc, mode='tot')
df_in_dist_rnd_mc = degree_distribution(vertex_number_rnd_mc, df_in_deg_rnd_mc, mode='in')
df_out_dist_rnd_mc = degree_distribution(vertex_number_rnd_mc, df_out_deg_rnd_mc, mode='out')

# Optional plots (disabled).
#totals = sns.displot(df_total_dist_rnd_mc.to_pandas(), x='degree', kde=True)
#outs = sns.displot(df_out_dist_rnd_mc.to_pandas(), x='degree', kde=True)
#ins = sns.displot(df_in_dist_rnd_mc.to_pandas(), x='degree', kde=True)

# Average clustering coefficient over the component's own vertex ids.
vertices_rnd_mc = nodes(G)
N = df_out_deg_rnd_mc['degree'].max()
avg_cc_rnd_mc = avg_clustering_coefficient(
    n=vertex_number_rnd_mc,
    src=df_edgelist_rnd_mc['src'],
    dst=df_edgelist_rnd_mc['dst'],
    df_degree=df_out_deg_rnd_mc['degree'],
    N=N,
    nodes_cp=vertices_rnd_mc,
    undirected=False,
)

In [None]:
# Edge list read by load_data() in the undirected-graph cell that follows.
input_file = '../graphs/litecoin/network.csv'

In [None]:
# --- Average clustering coefficient on an undirected graph ----------------
#
# To compute the average clustering coefficient on an undirected graph:
#   1. Build a directed graph from the downloaded edge list, with
#      edge_attr=None so the weights are ignored.
#   2. Convert that graph to an undirected graph.
#   3. Read the edge list back from the undirected graph.
df_edgelist_undirected = load_data(input_file)
analysis_undirected = Analysis(direction=True)
G = analysis_undirected.get_graph()
build_graph(G, df_edgelist_undirected, edge_attr=None, renumber=False)
G = G.to_undirected()
df_edgelist_undirected = view_edgelist(G)

# Size of the graph.
vertex_number_undirected = number_of_vertices(G)
edges_number_undirected = number_of_edges(G)

# Per-vertex degrees.
df_total_deg_undirected = degree(G, mode='tot')
df_in_deg_undirected = degree(G, mode='in')
df_out_deg_undirected = degree(G, mode='out')

# Degree distributions.
df_total_dist_undirected = degree_distribution(
    vertex_number_undirected, df_total_deg_undirected, mode='tot'
)
df_in_dist_undirected = degree_distribution(
    vertex_number_undirected, df_in_deg_undirected, mode='in'
)
df_out_dist_undirected = degree_distribution(
    vertex_number_undirected, df_out_deg_undirected, mode='out'
)

# Optional plots (disabled).
#totals = sns.displot(df_total_dist_undirected.to_pandas(), x='degree', kde=True)
#outs = sns.displot(df_out_dist_undirected.to_pandas(), x='degree', kde=True)
#ins = sns.displot(df_in_dist_undirected.to_pandas(), x='degree', kde=True)

# Average clustering coefficient, following edges in both directions.
N = df_out_deg_undirected['degree'].max()
avg_cc_undirected = avg_clustering_coefficient(
    n=vertex_number_undirected,
    src=df_edgelist_undirected['src'],
    dst=df_edgelist_undirected['dst'],
    df_degree=df_out_deg_undirected['degree'],
    N=N,
    undirected=True,
)