# Exercises on Frequent Subgraph Mining

In [None]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from tabulate import tabulate

## Exercise 1 - DFS codes
### Part A:

Consider the graph illustrated in Figure 1 and its DFS traversal:


![image-4.png](attachment:image-4.png)

The figure already lists some precedence relationships among edges in this DFS traversal, along with the ordering rule by which they hold. As a reminder, for $e_1 = (i_1, j_1), e_2 = (i_2, j_2)$, the following ordering rules hold:

1. If $i_1 = i_2$ and $j_1 < j_2 \Rightarrow e_1 < e_2$ 
2. If $i_1 < j_1$ and $j_1 = i_2 \Rightarrow e_1 < e_2$
3. If $e_1 < e_2$ and $e_2 < e_3 \Rightarrow e_1 < e_3$  (transitive property)

Complete the list of precedence relationships and rules by which they hold.

### Part B:

Apart from the DFS traversal examined above and its corresponding DFS code, the same graph has other DFS codes as well. Figure 2 below shows two other cases.

![image-5.png](attachment:image-5.png)

Write the DFS code of the two rightmost graphs in the Figure.

### Part C:

Find the minimal DFS code of the same graph by lexicographic ordering. (hint: see slides)

### Part D:

Consider the following mutation of labels in the above graph: all ‘a’ labels are turned to ‘b’ and ‘b’ labels are turned to ‘a’ (i.e., ‘a’ and ‘b’ labels swap values). Find the minimal DFS code of the new, mutated graph.

## Exercise 2 (min DFS code generation).

Implement an algorithm that finds the minimum DFS-code of a given graph. Hint: You may extract such an algorithm from this [gSpan code](https://github.com/LasseRegin/gSpan).

Note that we only need to generate a minimum DFS-code for the whole graph, rather than exhaustively generating all the DFS codes for its subgraphs, as gSpan does.

Let's build the graph corresponding to Figure 1 and Figure 2, which we can use to test our implementation.

First, we define the graph and some helper functions.

In [None]:
# Defining the graph (with labels)
# Nodes and edges are added in bulk, listed in the same order as the
# individual add_node/add_edge calls they replace.
G = nx.Graph()
G.add_nodes_from([
    (4, {'label': 'Z'}),
    (0, {'label': 'X'}),
    (2, {'label': 'X'}),
    (1, {'label': 'Y'}),
    (3, {'label': 'Z'}),
])
G.add_edges_from([
    (0, 1, {'label': 'a'}),
    (2, 3, {'label': 'c'}),
    (0, 2, {'label': 'a'}),
    (1, 3, {'label': 'b'}),
    (1, 4, {'label': 'd'}),
    (1, 2, {'label': 'b'}),
])

# Layout for plotting: node id -> (x, y) coordinate
pos = {
    0: (0., 1.),
    1: (0.08, 0.75),
    2: (0., 0.5),
    3: (0.08, 0.25),
    4: (0.25, 0.5),
}

# Plotting the graph
def plot_graph_with_labels(G, pos, ax, label_key='label'):
    """Draw G together with its node and edge labels on the axes ``ax``.

    Input:
        G:          A networkx graph whose nodes and edges carry the attribute ``label_key``.
        pos:        Dict mapping each node to its (x, y) position.
        ax:         Matplotlib axes that receives the nodes, edges and both kinds of labels.
        label_key:  Name of the node/edge attribute used as the label text.
    """
    # Collect the labels first; the loop variables deliberately avoid the name
    # `pos` so the layout argument is not confused with an edge's endpoints.
    node_labels = {node: text for node, text in G.nodes(data=label_key)}
    edge_labels = {tuple(endpoints): text for *endpoints, text in G.edges(data=label_key)}

    # Draw nodes and edges (without labels)
    nx.draw(G, pos=pos, ax=ax)
    # Draw node labels -- ax must be passed explicitly, otherwise networkx
    # draws on the current axes, which need not be the one we were given.
    nx.draw_networkx_labels(G, pos=pos, labels=node_labels, font_color='white', ax=ax)
    # Draw edge labels on the same axes
    nx.draw_networkx_edge_labels(G, pos=pos, edge_labels=edge_labels, font_color='black', ax=ax)
    
# Render the example graph on a fresh figure; the x-range is fixed before drawing.
fig, ax = plt.subplots()
ax.set_xlim(-0.5, 0.5)
plot_graph_with_labels(G, pos=pos, ax=ax)

In [None]:
# A couple of python tricks:
print("1. You can get the label of a node as follows:")
print("Label for node 1: ", G.nodes[1]['label'])

print("---")
print("2. You can get the nodes and associated labels as follows:")
for node, node_label in G.nodes(data='label'):
    print(node, node_label)

print("---")
print("3. You can get the edge labels as follows:")
print("Label for edge (1, 3): ", G.edges[(1, 3)]['label'])

print("---")
print("4. You can get all edges and their labels as follows: ")
for i, j, edge_label in G.edges(data='label'):
    # (i, j) is an edge, so the message names it as such.
    print("Edge (%d, %d) has label %s" % (i, j, edge_label))

print("---")
print("5. Neighbors of a node n can be found as follows:")
# Named `neighbor_ids` so the list does not share a name with the
# `neighbors` helper function defined right below.
neighbor_ids = list(G.neighbors(1))
neighbors_with_labels = [(n, G.nodes[n]['label']) for n in neighbor_ids]
print(neighbor_ids, neighbors_with_labels)

print("You could even define a function that sorts them")
def neighbors(G, n):
    """ List the edges leaving node n, ordered by their labels.

        Each entry is a tuple (node_idx, neighbor_idx, 'edge_label', 'neighbor_label').
        Entries are sorted by (edge_label, neighbor_label); entries with equal
        labels keep the order in which G.neighbors(n) yields them.
    """
    edge_view = G.edges
    node_view = G.nodes

    incident = []
    for other in G.neighbors(n):
        edge_label = edge_view[(n, other)]['label']
        other_label = node_view[other]['label']
        incident.append((n, other, edge_label, other_label))

    # list.sort is stable, so ties stay in discovery order.
    incident.sort(key=lambda entry: (entry[2], entry[3]))
    return incident

# Demo: edges leaving node 1 as (node, neighbor, edge label, neighbor label),
# ordered by (edge label, neighbor label).
print(neighbors(G, 1))

In [None]:
def dfs(G, v): # Performs dfs search on G starting in node v. Builds and returns dfs code
    """
        Walk the edges of G depth-first from node v and record them as a dfs code.

        Input:
            G:   A networkx graph whose nodes and edges carry a 'label' attribute.
            v:   The node to start from; it receives discovery index 0.
        Output:
            code: list of tuples [(i, j, L_i, L_{ij}, L_j), ...], one per edge taken
                  off the stack, where i and j are the discovery indices of the
                  edge's endpoints and L_i, L_{ij}, L_j are the labels.

        Only a single traversal per start node is produced. Note that Exercise 2.1
        of this notebook states that this procedure is not correct for every graph.
    """
    node_idx      = dict.fromkeys(G.nodes(), -1)      # Map that keeps order of discovery of each node (-1 = not discovered yet).
    nodes_visited = dict.fromkeys(G.nodes(), False)   # Map that keeps visited nodes to not recurse forever.
    edges_visited = dict.fromkeys(G.edges(), False)   # Map that keeps handled edges (already output).
    
    node_idx[v]         = 0     # First node gets idx 0
    nodes_visited[v]    = True  # Mark first node visited
     
    # Function to produce a code tuple from two nodes, i.e., (i, j, Li, Lij, Lj).
    # node_idx is read when the lambda is called, so indices assigned later are picked up.
    tup = lambda fr, to: (node_idx[fr], node_idx[to], G.nodes[fr]['label'], G.edges[(fr, to)]['label'], G.nodes[to]['label'])
    
    # Shorthand for marking and reading marked edges. Solves issue with edges whose endpoints can be ordered arbitrarily, (i, j) or (j, i)
    def mark_edge(v1, v2):
        if (v1, v2) in edges_visited: edges_visited[(v1, v2)] = True
        else:                         edges_visited[(v2, v1)] = True
    edge_marked = lambda v1, v2: edges_visited.get((v1, v2), False) or edges_visited.get((v2, v1), False)
    
    # Initialize edge queue by edges from starting node in lexicographic order.
    # The list is used as a stack: pop() below always takes the last element.
    queue    = neighbors(G, v)[::-1] # Reverse order to take lexicographically smallest first
    # [e1, e2, .... ei-1] + [ v.edges (sorted) ]
    code     = [] # Output code
    
    k = 1 # Counter for node idx
    while len(queue) > 0: # Visit all edges DFS
        node_from, curr_node, *_ = queue.pop() # Take last element from queue
        
        if edge_marked(node_from, curr_node): continue # Don't use the same edge multiple times
        mark_edge(node_from, curr_node) # Mark edge as visited
        
        if nodes_visited[curr_node]:                  # Backward edge (target was already discovered)
            code.append(tup(node_from, curr_node))
            
        else:                                         # Forward edge => we need to add curr_node's edges to the queue
            node_idx[curr_node] = k
            nodes_visited[curr_node]  = True
            k += 1
            
            code.append(tup(node_from, curr_node))
            
            edges = [ (not nodes_visited[t[1]], *t) for t in neighbors(G, curr_node) ] # Add `visited` to tuple, to prioritize backward edges first
            # sorted() compares whole tuples: the flag first (False, i.e. already visited, sorts first),
            # ties are then broken by the remaining tuple fields in order.
            edges = sorted( edges )
            queue += [t[1:] for t in edges][::-1] # Remove `visited` again; reversed so the smallest entry is popped first
        
    return code
    

# print dfs codes for different starting nodes
# One dfs code per start node; looping over the graph's own nodes (ascending id)
# replaces the hand-copied call per node and keeps working if the graph changes.
for start_node in sorted(G.nodes):
    print("Node %s\n" % start_node, tabulate(dfs(G, start_node)))



Then, we need to implement ordering rules.

In [None]:
def compare(c1, c2): # Check if c1 < c2 for dfs codes c1, c2
    """
        Function for comparing 2 dfs codes lexicographically. Defines sorting of dfs codes.
        Input:
            c1:   list of tuples [(i, j, L_i, L_{ij}, L_j), ...]
            c2:   list of tuples [(i, j, L_i, L_{ij}, L_j), ...]
        Output:
            1 if c1 > c2, -1 if c1 < c2, 0 if c1 == c2
        Raises:
            ValueError if c1 and c2 do not have the same length.

        Note: until the TODO section below is filled in, every pair of
        equal-length codes is reported as identical (0).
    """
    # ValueError is the built-in exception for bad argument values; the name
    # `ValueException` does not exist in Python and would surface as a NameError.
    if len(c1) != len(c2): raise ValueError("Codes should be of same length")

    # TODO Your code here
    # Hint: Check slides for definition of comparison of dfs codes.

    # TODO Your code here

    return 0 # codes are identical
 

Finally, compare all the DFS codes given by *dfs()*.

In [None]:
def min_dfs_code(G):
    """
        Find the minimum dfs code of a graph (exercise stub).

        Input:
            G:   A networkx graph.
        Output:
            min_dfs: A list of tuples [(i, j, L_i, L_{ij}, L_j), ...]
            min_node: Index of node with minimum dfs code

        Until the TODO section is filled in, this returns the placeholder
        values ([], 0) regardless of G.
    """
    # Placeholders returned unchanged until the search below is implemented.
    min_dfs = []
    min_node = 0
    # TODO Your code here

    # TODO Your code here
    return min_dfs, min_node

In [None]:
# Run the search on the example graph, then report the start node and its code as a table.
min_code, min_node = min_dfs_code(G)

print("Min node is: ", min_node)
print("Code: ", tabulate(min_code, headers=['i', 'j', 'Li', 'Lij', 'Lj']), sep="\n")

#### Exercise 2.1
Unfortunately, *dfs()* is incorrect (although it works fine with the above graph). Can you give a counterexample and point out why it is wrong?