In [4]:
import numpy as np
import pickle
import networkx as nx
import pandas as pd

In [5]:
# Load the training graphs and their labels from the challenge pickle files.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this on files obtained from a trusted source.
with open("./data-challenge-kernel-methods-2022-2023/training_data.pkl", "rb") as f:
    train_data = pickle.load(f)

with open("./data-challenge-kernel-methods-2022-2023/training_labels.pkl", "rb") as f:
    train_labels = pickle.load(f)

In [6]:
## Labels: list the distinct class values present in train_labels
np.unique(train_labels)

array([0, 1])

In [7]:
## Number of training data points (one entry of train_data per graph)
len(train_data)

6000

In [8]:
# Pick one training graph as a running example and list its nodes.
g = train_data[10]
print(f"Nodes : {g.nodes()}")
# NOTE(review): help(g) prints the full class documentation of the graph object;
# consider dropping it from the final notebook to keep the output short.
help(g)

Nodes : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
Help on Graph in module networkx.classes.graph object:

class Graph(builtins.object)
 |  Graph(incoming_graph_data=None, **attr)
 |  
 |  Base class for undirected graphs.
 |  
 |  A Graph stores nodes and edges with optional data, or attributes.
 |  
 |  Graphs hold undirected edges.  Self loops are allowed but multiple
 |  (parallel) edges are not.
 |  
 |  Nodes can be arbitrary (hashable) Python objects with optional
 |  key/value attributes, except that `None` is not allowed as a node.
 |  
 |  Edges are represented as links between nodes with optional
 |  key/value attributes.
 |  
 |  Parameters
 |  ----------
 |  incoming_graph_data : input graph (optional, default: None)
 |      Data to initialize graph. If None (default) an empty
 |      graph is created.  The data can be any format that is supported
 |      by the to_networkx_graph() function, currently including edge l

In [9]:
# Edges of the example graph together with their attribute dicts.
g.edges.data()

EdgeDataView([(0, 1, {'labels': [0]}), (1, 2, {'labels': [0]}), (2, 3, {'labels': [0]}), (2, 5, {'labels': [0]}), (3, 4, {'labels': [0]}), (5, 6, {'labels': [1]}), (5, 7, {'labels': [0]}), (7, 8, {'labels': [0]}), (7, 13, {'labels': [0]}), (8, 9, {'labels': [0]}), (9, 10, {'labels': [0]}), (10, 11, {'labels': [0]}), (10, 12, {'labels': [0]}), (12, 13, {'labels': [0]}), (14, 15, {'labels': [0]}), (15, 16, {'labels': [1]}), (15, 17, {'labels': [0]}), (17, 18, {'labels': [0]}), (18, 19, {'labels': [0]}), (18, 20, {'labels': [0]}), (18, 24, {'labels': [0]}), (20, 21, {'labels': [0]}), (21, 22, {'labels': [0]}), (21, 23, {'labels': [1]}), (24, 25, {'labels': [0]}), (24, 26, {'labels': [1]})])

In [10]:
# Nodes of the example graph together with their attribute dicts.
g.nodes.data()

NodeDataView({0: {'labels': [1]}, 1: {'labels': [1]}, 2: {'labels': [2]}, 3: {'labels': [1]}, 4: {'labels': [1]}, 5: {'labels': [1]}, 6: {'labels': [0]}, 7: {'labels': [2]}, 8: {'labels': [1]}, 9: {'labels': [1]}, 10: {'labels': [2]}, 11: {'labels': [1]}, 12: {'labels': [1]}, 13: {'labels': [1]}, 14: {'labels': [0]}, 15: {'labels': [1]}, 16: {'labels': [0]}, 17: {'labels': [1]}, 18: {'labels': [1]}, 19: {'labels': [0]}, 20: {'labels': [1]}, 21: {'labels': [1]}, 22: {'labels': [0]}, 23: {'labels': [0]}, 24: {'labels': [1]}, 25: {'labels': [0]}, 26: {'labels': [0]}})

In [11]:
# Number of nodes in the example graph.
len(g.nodes)

27

In [8]:
def perform_random_walk(edge_list, g, rdm_node, l_walk):
    """Sample one labelled random walk and encode it as a string.

    Starting from ``rdm_node``, the walk performs up to ``l_walk + 1`` steps.
    At each step the label of the current node is emitted, one incident edge
    is drawn uniformly at random from ``edge_list``, the label of that edge is
    emitted, and the walk moves to the other endpoint of the edge.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        One row per undirected edge, with columns ``"source"``, ``"target"``
        and ``"labels"`` (a sequence whose first item is the edge label).
    g : graph-like
        Object for which ``g.nodes[node]["labels"][0]`` is the node label.
    rdm_node : hashable
        Node at which the walk starts.
    l_walk : int
        Walk length; ``l_walk + 1`` (node label, edge label) pairs are emitted
        unless the walk gets stuck earlier.

    Returns
    -------
    str
        Concatenated labels ``node, edge, node, edge, ...``. If the walk
        reaches a node without any incident edge it stops there, so the
        string then ends with that node's label instead of raising.
    """
    curr_node = rdm_node
    parts = []
    for _ in range(l_walk + 1):
        parts.append(str(g.nodes[curr_node]['labels'][0]))

        # Undirected graph: the current node may appear in either column.
        touches_node = (edge_list['source'] == curr_node) | (edge_list['target'] == curr_node)
        incident = edge_list.loc[touches_node, :]
        if incident.empty:
            # Isolated node (or edgeless graph): nothing to sample from, so
            # end the walk here rather than letting .sample() raise.
            break

        step = incident.sample()
        source = step['source'].values[0]
        target = step['target'].values[0]
        # Move to the endpoint that is not the current node (a self-loop
        # keeps the walk on the same node).
        next_node = target if source == curr_node else source

        parts.append(str(step['labels'].values[0][0]))
        curr_node = next_node
    return ''.join(parts)

In [6]:
def compute_kernel(g0, g1, l_walk=5, n_walks=100, seed=None):
    """Estimate a random-walk similarity between two labelled graphs.

    ``n_walks`` random walks are sampled in each graph (uniformly chosen start
    node, see ``perform_random_walk``). Each walk is encoded as a label string.
    The kernel value is the size of the multiset intersection of the two
    collections of strings, divided by ``n_walks``.

    Parameters
    ----------
    g0, g1 : networkx.Graph
        Graphs whose nodes and edges carry a ``"labels"`` attribute.
    l_walk : int, default 5
        Length passed to ``perform_random_walk``.
    n_walks : int, default 100
        Number of walks sampled per graph; must be positive.
    seed : int or None, default None
        If given, ``np.random.seed(seed)`` is called first so the sampled
        walks (and therefore the result) are reproducible. This reseeds
        numpy's global generator. ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    float
        Value in ``[0, 1]``; a stochastic estimate unless ``seed`` is set.

    Raises
    ------
    ValueError
        If ``n_walks`` is not positive or one of the graphs has no node.
    """
    from collections import Counter

    if n_walks <= 0:
        # Previously n_walks == 0 surfaced as an opaque ZeroDivisionError.
        raise ValueError("n_walks must be a positive integer")
    if seed is not None:
        np.random.seed(seed)

    # Materialise the node views once; sampling an index works for any
    # hashable node type and does not rely on numpy coercing a NodeView.
    nodes_g0 = list(g0.nodes)
    nodes_g1 = list(g1.nodes)
    if not nodes_g0 or not nodes_g1:
        raise ValueError("both graphs must contain at least one node")

    # Store edge list
    edges_g0 = nx.to_pandas_edgelist(g0)
    edges_g1 = nx.to_pandas_edgelist(g1)

    # Run random walks
    seq_g0 = []
    seq_g1 = []
    for _ in range(n_walks):
        start_g0 = nodes_g0[np.random.randint(len(nodes_g0))]
        seq_g0.append(perform_random_walk(edges_g0, g0, start_g0, l_walk))

        start_g1 = nodes_g1[np.random.randint(len(nodes_g1))]
        seq_g1.append(perform_random_walk(edges_g1, g1, start_g1, l_walk))

    # Multiset intersection: for every sequence seen in both graphs keep the
    # smaller of its two occurrence counts.
    shared = Counter(seq_g0) & Counter(seq_g1)
    return sum(shared.values()) / n_walks

In [59]:
# Smoke-test the random-walk kernel on two training graphs.
# NOTE(review): the cell defining compute_kernel must have been run in the
# current kernel session before this one, otherwise this raises NameError.
g0 = train_data[10]
g1 = train_data[100]
compute_kernel(g0, g1, l_walk=5, n_walks=100)

NameError: name 'compute_kernel' is not defined

### Try the Weisfeiler-Lehman test

In [31]:
from hashlib import blake2b
from collections import Counter, defaultdict

In [32]:
def _hash_label(label, digest_size):
    return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()

In [33]:
def _neighborhood_aggregate(G, node, node_labels, edge_attr=None):
    """
    Compute new labels for given node by aggregating
    the labels of each node's neighbors.
    """
    label_list = []
    for nbr in G.neighbors(node):
        prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
        label_list.append(prefix + node_labels[nbr])
    return node_labels[node] + "".join(sorted(label_list))

In [117]:
def weisfeiler_lehman_graph_hash(G, edge_attr=None, node_attr="labels", iterations=3, digest_size=16):
    """Compute per-iteration Weisfeiler-Lehman label histograms of ``G``.

    Despite the name, no single graph hash is returned. Node labels start as
    ``str(node_data[node_attr])``. On each iteration every node's label is
    replaced by the hash of its aggregated neighborhood label, and the
    relative frequency of each resulting hash is recorded.

    Returns
    -------
    dict
        Maps the iteration index ``0 .. iterations - 1`` to a list of
        ``(hash, frequency)`` pairs sorted by hash, where the frequencies of
        one iteration sum to 1.
    """
    def refine(current):
        # One relabelling round: hash each node's aggregated neighborhood.
        return {
            node: _hash_label(
                _neighborhood_aggregate(G, node, current, edge_attr=edge_attr),
                digest_size,
            )
            for node in G.nodes()
        }

    # Initial labels come straight from the node attribute.
    node_labels = {}
    for u, dd in G.nodes(data=True):
        node_labels[u] = str(dd[node_attr])

    histograms = {}
    for it in range(iterations):
        node_labels = refine(node_labels)
        counts = Counter(node_labels.values())
        total = np.sum(list(counts.values()))
        # Normalise counts to frequencies and order the pairs by hash.
        histograms[it] = sorted(
            ((digest, n / total) for digest, n in counts.items()),
            key=lambda pair: pair[0],
        )

    return histograms

In [118]:
# Two training graphs used to try the Weisfeiler-Lehman kernel below.
g1 = train_data[200]
g2 = train_data[1000]

In [129]:
def kernel(g1, g2, iterations=5, node_attr="labels", edge_attr="labels"):
    """Weisfeiler-Lehman style kernel between two labelled graphs.

    For each of the ``iterations`` refinement rounds, the normalised label
    histograms of both graphs (from ``weisfeiler_lehman_graph_hash``) are
    compared with a dot product over the hashes they share. The kernel value
    is the sum of these dot products.

    Parameters
    ----------
    g1, g2 : graph
        Graphs accepted by ``weisfeiler_lehman_graph_hash``.
    iterations : int, default 5
        Number of refinement rounds.
    node_attr : str, default "labels"
        Node attribute used as the initial label.
    edge_attr : str or None, default "labels"
        Edge attribute mixed into the neighborhood labels (``None`` ignores
        edges' attributes).

    Returns
    -------
    float
        Sum over iterations of the histogram dot products (``0`` when
        ``iterations`` is 0).
    """
    wl1 = weisfeiler_lehman_graph_hash(g1, iterations=iterations, node_attr=node_attr, edge_attr=edge_attr)
    wl2 = weisfeiler_lehman_graph_hash(g2, iterations=iterations, node_attr=node_attr, edge_attr=edge_attr)

    k = 0
    for i in range(iterations):
        hist1 = dict(wl1[i])
        hist2 = dict(wl2[i])
        # Only hashes present in both graphs contribute to the scalar product.
        # Sorting fixes the summation order so the floating-point result does
        # not depend on set iteration order.
        common_keys = sorted(hist1.keys() & hist2.keys())
        k += np.sum([hist1[c] * hist2[c] for c in common_keys])
    return k

In [130]:
%%time
# Time one evaluation of the Weisfeiler-Lehman kernel on the pair (g1, g2).
k = kernel(g1, g2, iterations=5, node_attr="labels", edge_attr="labels")

label = [1][0][1]
label = [1][0][1][0][1][0][2]
label = [2][0][1][0][1][0][1]
label = [1][0][1][0][2]
label = [1][0][1][2][1][2][1]
label = [1][0][4][2][1][2][1]
label = [4][0][1]
label = [1][2][1][2][1]
label = [1][2][1][2][1]
label = [1][2][1][2][1]
label = [1][0][2][2][1][2][1]
label = [2][0][1][0][1]
label = [1][0][2][0][2][1][2]
label = [2][0][1][1][1]
label = [1][0][1][0][2][1][0]
label = [0][1][1]
label = a0e059b611fa6e963b850e139eb1ad9e[0]8d8fe8ff31572bdd8eb88f0e5dd06ae0
label = 8d8fe8ff31572bdd8eb88f0e5dd06ae0[0]1de042e241799e1e6f51e6c57a7f2228[0]89a591b2a5cfa60689912dfb6abc9d13[0]a0e059b611fa6e963b850e139eb1ad9e
label = 1de042e241799e1e6f51e6c57a7f2228[0]5f78d83547fb87e3614278c619fb30bc[0]8d8fe8ff31572bdd8eb88f0e5dd06ae0[0]e210ba2c5866b83a8f028f0b23d4c874
label = e210ba2c5866b83a8f028f0b23d4c874[0]1de042e241799e1e6f51e6c57a7f2228[0]7b07eed3e2a4241d83f17a5329b864da
label = 7b07eed3e2a4241d83f17a5329b864da[0]e210ba2c5866b83a8f028f0b23d4c874[2]179b6479b79e03598e27990bbfa52909[2]

In [128]:
# Display the kernel value stored in k.
k

0.02403846153846154

In [91]:
# Take the entry for iteration index 0 from each graph's WL result.
# NOTE(review): in the visible cells wl1 / wl2 are only assigned inside
# kernel(); presumably they were defined at top level in a cell that is no
# longer here. Confirm, otherwise this raises NameError on a fresh kernel.
test1 = wl1[0]
test2 = wl2[0]

In [92]:
# Turn the first graph's pairs into a dict for key lookup, then display it.
dict1 = dict(test1)
dict1

{'05e0c4d27845a63c86c32e272f49f836': 0.1111111111111111,
 '1de042e241799e1e6f51e6c57a7f2228': 0.1111111111111111,
 '3028121fc5f783573b813b694050af7c': 0.037037037037037035,
 '419cba2ce4d595e3cfc8a0ca80b30fc1': 0.037037037037037035,
 '586a4e57cc8cd0892350f3986ebd7ccc': 0.037037037037037035,
 '87021bbf424a2c34864aed245ebd7baa': 0.14814814814814814,
 'a0e059b611fa6e963b850e139eb1ad9e': 0.07407407407407407,
 'b271c00c65ccc50cc584efa76ed13329': 0.07407407407407407,
 'e210ba2c5866b83a8f028f0b23d4c874': 0.2222222222222222,
 'f1f51066f08cd36f3430f8dabe09fd2b': 0.14814814814814814}

In [93]:
# Same conversion for the second graph.
dict2 = dict(test2)

In [94]:
# Keys present in both dictionaries.
common_keys = set(dict1.keys()).intersection(set(dict2.keys()))

In [95]:
# Display the shared keys.
common_keys

{'a0e059b611fa6e963b850e139eb1ad9e', 'f1f51066f08cd36f3430f8dabe09fd2b'}

In [96]:
# Sum of products of the two dictionaries' values over the shared keys
# (the same expression kernel() accumulates for each iteration).
res = np.sum([dict1[k]*dict2[k] for k in common_keys])

In [121]:
# Re-display the node attributes of the example graph g for reference.
g.nodes.data()

NodeDataView({0: {'labels': [1]}, 1: {'labels': [1]}, 2: {'labels': [2]}, 3: {'labels': [1]}, 4: {'labels': [1]}, 5: {'labels': [1]}, 6: {'labels': [0]}, 7: {'labels': [2]}, 8: {'labels': [1]}, 9: {'labels': [1]}, 10: {'labels': [2]}, 11: {'labels': [1]}, 12: {'labels': [1]}, 13: {'labels': [1]}, 14: {'labels': [0]}, 15: {'labels': [1]}, 16: {'labels': [0]}, 17: {'labels': [1]}, 18: {'labels': [1]}, 19: {'labels': [0]}, 20: {'labels': [1]}, 21: {'labels': [1]}, 22: {'labels': [0]}, 23: {'labels': [0]}, 24: {'labels': [1]}, 25: {'labels': [0]}, 26: {'labels': [0]}})