# Matcher

In [23]:
#TODO: Go over all comments and make sure they are correct and up to date

In [24]:
#| default_exp matcher

In [25]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
#| export
from itertools import product, permutations
from typing import Tuple, Iterator

### Overview
In the previous modules, the LHS Parser received an LHS string, describing some graph pattern, and parsed it to an equivalent NetworkX DiGraph. In the following module, we search for **matches** to this pattern in our input graph - That is, find all subgraphs of our input graph, which have the same structure as the pattern in terms of nodes, the edges connecting them and the attributes they all have. 

Each match is basically a mapping from a subset of the input graph nodes to a the pattern nodes (as we can deduce what are the matched edges and attributes accordingly).

The **Matcher** in this module does the following:
* **Searches for matches** in the input graph, according to some LHS pattern graph.
* **Filters matches** based on an explicit boolean function, and / or constraints given to the matcher by the parser to be handled later.
* **Constructs a list of Match objects**, each corresponds to one of the filtered matches we've found. The list will be used in later modules, and will also allow users to view the matches and to use them imperatively.

The final, filtered list of Match objects is returned from this module's main function, **find_matches**.

### Requirements

In [27]:
#| export
from typing import *
from networkx import DiGraph
from networkx.algorithms import isomorphism # check subgraph's isom.
import itertools # iterating over all nodes\edges combinations

from graph_rewrite.core import NodeName, _create_graph, draw
from graph_rewrite.lhs import lhs_to_graph
from graph_rewrite.match_class import Match, mapping_to_match, is_anonymous_node,draw_match

### Find Matches
Given an input graph and a pattern graph, we want to find the list of matches from pattern nodes to input-graph nodes, each constructs a corresponding Match object. 

#### Checking for Attribute Existence and Constant Values
A crucial aspect of finding matches is comparing the attributes of nodes to determine whether they are compatible. Nodes that match based on either the existence of specific attributes or constant attribute values are considered candidates. These candidates are then further refined to identify all valid matches for the given pattern.

In [28]:
#| export
# TODO: Ensure we separate between constant attributes and existence checks (constants).
# a[id] -> existence check (can be checked before combinatorics)
# a[id=Constant(3)] -> constant value check (can be checked before combinatorics)

# TODO: Email Dean regarding the parser ability to support constant values in the pattern graph - it is currently not supported, and so all constant values will still
#  result in a None value in the pattern graph.
class Constant:
    def __init__(self, value):
        self.value = value


def _attributes_match(pattern_attrs: dict, input_attrs: dict) -> bool:
    """
    Check if the input attributes match the pattern attributes.

    This function supports both:
    - Existence checks (ensures that required attributes exist).
    - Constant value checks (ensures that constant values match).

    Args:
        pattern_attrs (dict): Attributes of the pattern (node or edge).
        input_attrs (dict): Attributes of the input (node or edge).

    Returns:
        bool: True if the input attributes match the pattern attributes, False otherwise.
    """
    for attr_name, attr_value in pattern_attrs.items():
        if attr_name not in input_attrs:  # If the attribute does not exist, return False
            return False
        
        if attr_value is None: # If the attribute exists, but the value is None, continue to the next attribute
            continue

        # TODO: This is not supported yet due to the parser not supporting constant values in the pattern graph - we will never reach this point, and it is implemented for future use, 
        # once the parser supports it.
        if isinstance(attr_value, Constant):  # If the attribute exists, and the value is a constant, check if the value matches
            if input_attrs[attr_name] != attr_value.value:
                return False

    return True

#### Narrow Down Search Space
Using the functions presented thus far, the search for matches might take a lot of time if the graph has a high number of nodes / edges. Nodes which are no real candidate to match any pattern node (do not share attributes with any pattern node) are checked eitherway, which is extermely inefficient. 

Therefore, before we search for matches in our input graph, we will reduce it to only contain the nodes that might match any of the pattern nodes (and their connected edges as well). This might improve the whole matching performance. The following function is used in order to do exactly that:

In [None]:
#| export
def _find_input_nodes_candidates(pattern_node: NodeName, pattern: DiGraph, input_graph: DiGraph) -> set[NodeName]:
    """
    Given a pattern node and an input graph, return a set of input graph nodes that:
    - Contain the required attributes of the pattern node, including constant value checks (if specified) and existence checks (if no value is specified / no constant value).
    - Have at least one edge with matching attributes for each edge of the pattern node that has attributes specified.

    Args:
        pattern_node (NodeName): The pattern node.
        pattern (DiGraph): The pattern graph.
        input_graph (DiGraph): The input graph.

    Returns:
        set[NodeName]: A set of input graph nodes that match the required attributes and have at least one matching edge.
    """

    pattern_node_attrs = pattern.nodes[pattern_node]

    if "_id" in pattern_node_attrs: #TODO: understand why this is here (_id)
        input_node_id = pattern_node_attrs.pop("_id")
        input_nodes_to_check = [input_node_id]
    else:
        input_nodes_to_check = list(input_graph.nodes)

    # Filter nodes by attributes first
    candidate_nodes = {
        input_node
        for input_node in input_nodes_to_check
        if _attributes_match(pattern_node_attrs, input_graph.nodes[input_node])
    }

    return candidate_nodes

In [None]:
#| export
def _filter_edge_candidates(input_graph: DiGraph, pattern: DiGraph, src_pattern_node: NodeName, dst_pattern_node: NodeName, 
                               src_candidates: Set[NodeName], dst_candidates: Set[NodeName]) -> Set[Tuple[NodeName, NodeName]]:
    """
    Filter the input node candidates for two pattern nodes by checking if the edges between them in the input graph exist
    and match the pattern edge attributes.

    This function reduces the number of candidate pairs before generating assignments in _find_pattern_based_matches.

    Args:
        input_graph (DiGraph): The input graph.
        pattern (DiGraph): The pattern graph (provides the edge attributes).
        src_pattern_node (NodeName): The source pattern node.
        dst_pattern_node (NodeName): The destination pattern node.
        src_candidates (Set[NodeName]): Current candidates for the source pattern node.
        dst_candidates (Set[NodeName]): Current candidates for the destination pattern node.

    Returns:
        Set[Tuple[NodeName, NodeName]]: A set of valid candidate edge assignments (source, destination).
    """
    pattern_edge_attrs = pattern.get_edge_data(src_pattern_node, dst_pattern_node, default={})

    # Filter input edge candidates for the pattern edge by checking if the input edge exists and matches the pattern edge attributes (if specified)
    valid_edge_candidates = {
        (src_candidate, dst_candidate)
        for src_candidate, dst_candidate in product(src_candidates, dst_candidates)
        if (src_candidate, dst_candidate) in input_graph.edges and
           _attributes_match(pattern_edge_attrs, input_graph.get_edge_data(src_candidate, dst_candidate, default={}))
    }

    return valid_edge_candidates

#### Find Pattern Based Matches
NetworkX provides an out-of-the-box isomorphism matcher, which compares the structure of two graphs and tells whether they are isomorphic (have the same nodes and edges). We utilize this isomorphism matcher by beginning our matching process with structural matches, after filtering out candidates that do not match based on existance of attributes or constant values of attributes:

In [None]:
#| export
def _add_candidates_to_assignment(src_candidate: NodeName, dst_candidate: NodeName, partial_assignment: Dict[NodeName, NodeName], 
                                  src_pattern_node: NodeName, dst_pattern_node: NodeName) -> Optional[frozenset]:
    """
    Helper function to handle the case of adding src and dst candidates to the partial assignment
    based on different conditions (both unassigned, one already assigned correctly, etc.).

    Args:
        src_candidate: The candidate for the source pattern node.
        dst_candidate: The candidate for the destination pattern node.
        partial_assignment: The current partial assignment being considered.
        src_pattern_node: The source pattern node.
        dst_pattern_node: The destination pattern node.

    Returns:
        A frozen set of the new assignment if valid, or None if it doesn't apply.
    """
    new_assignment = partial_assignment.copy()
    src_assigned = src_candidate in partial_assignment.values()
    dst_assigned = dst_candidate in partial_assignment.values()

    # Case 1: Neither src nor dst are assigned, add both
    if not src_assigned and not dst_assigned:
        new_assignment[src_pattern_node] = src_candidate
        new_assignment[dst_pattern_node] = dst_candidate
        return frozenset(new_assignment.items())

    # Case 2: src is already correctly assigned, add dst
    elif src_assigned and partial_assignment[src_pattern_node] == src_candidate and not dst_assigned:
        new_assignment[dst_pattern_node] = dst_candidate
        return frozenset(new_assignment.items())

    # Case 3: dst is already correctly assigned, add src
    elif dst_assigned and partial_assignment[dst_pattern_node] == dst_candidate and not src_assigned:
        new_assignment[src_pattern_node] = src_candidate
        return frozenset(new_assignment.items())

    return None  # No valid assignment if none of the cases match

In [None]:
#| export
def _find_pattern_based_matches(graph: DiGraph, pattern: DiGraph) -> Iterator[Tuple[DiGraph, Dict[NodeName, NodeName]]]:
    """
    Find all subgraphs in the input graph that match the given pattern graph based on both structure (nodes and edges)
    and attributes (existence of attributes or constant value checks).

    A subgraph is considered isomorphic if it has the same structure (nodes and edges) as the pattern graph
    and the attributes of the nodes and edges match the specified attributes in the pattern graph.

    Args:
        graph (DiGraph): The graph to search for matches.
        pattern (DiGraph): The pattern graph representing the structure and attributes to match.

    Yields:
        Iterator[Tuple[DiGraph, Dict[NodeName, NodeName]]]: Tuples of (subgraph, mapping),
        where subgraph is the matched subgraph, and mapping is a dictionary mapping nodes in the
        subgraph to nodes in the pattern.
    """

    #  Identify lonely pattern nodes (nodes with no edges) and their candidates in the input graph
    lonely_pattern_nodes = [n for n in pattern.nodes if pattern.in_degree(n) == 0 and pattern.out_degree(n) == 0]
    lonely_pattern_nodes_to_input_candidates = {n: _find_input_nodes_candidates(n, pattern, graph) for n in lonely_pattern_nodes}

    # For pattern edges, gather valid edge candidates (pairs of nodes with matching attributes)
    edge_candidates = {}
    for src_pattern_node, dst_pattern_node in pattern.edges:
        src_candidates = _find_input_nodes_candidates(src_pattern_node, pattern, graph)
        dst_candidates = _find_input_nodes_candidates(dst_pattern_node, pattern, graph)
        edge_candidates[(src_pattern_node, dst_pattern_node)] = _filter_edge_candidates(
            graph, pattern, src_pattern_node, dst_pattern_node, src_candidates, dst_candidates)

    # Initialize partial assignments based on valid edge candidates
    partial_assignments = set()

    for (src_pattern_node, dst_pattern_node), valid_edge_candidates in edge_candidates.items():
        new_assignments = set()

        for src_candidate, dst_candidate in valid_edge_candidates:
            for partial_assignment in partial_assignments or [{}]:
                new_assignment = _add_candidates_to_assignment(src_candidate, dst_candidate, partial_assignment, src_pattern_node, dst_pattern_node)
                if new_assignment:
                    new_assignments.add(new_assignment)

        if not new_assignments:  # If no new assignments are found for a pair of pattern nodes, the pattern cannot be matched
            return
        partial_assignments = new_assignments

    # Add lonely node candidates (nodes without edges) to the assignments
    for pattern_node in lonely_pattern_nodes:
        lonely_node_candidates = lonely_pattern_nodes_to_input_candidates[pattern_node]
        new_assignments = set()

        for candidate in lonely_node_candidates:
            for partial_assignment in partial_assignments or [{}]:
                if candidate not in dict(partial_assignment).values():
                    new_assignment = dict(partial_assignment).copy()
                    new_assignment[pattern_node] = candidate
                    new_assignments.add(frozenset(new_assignment.items()))  # Ensuring the assignment remains immutable and unique

        partial_assignments = new_assignments

    # Filter and yield valid subgraphs that match the pattern (structurally and by attributes)
    for assignment in partial_assignments:
        assignment_dict = dict(assignment)  # Convert frozen set back to a dict
        subgraph = graph.subgraph(assignment_dict.values())

        # Validate the subgraph for isomorphism against the pattern
        if isomorphism.is_isomorphic(subgraph, pattern, node_match=_attributes_match, edge_match=_attributes_match):
            yield subgraph, assignment_dict


In [None]:

#OLD    
'''
    # We find all possible candidates for each node in the pattern, by checking the attributes of the nodes in the graph
    pattern_to_input_candidates = {pattern_node: _find_input_nodes_with_pattern_attributes(pattern.nodes[pattern_node], graph) for pattern_node in pattern.nodes}
    candidate_assignments = itertools.product(*(pattern_to_input_candidates[pattern_node] for pattern_node in list(pattern.nodes)))
    for assignment in candidate_assignments:
        # Make sure the sub_nodes are unique - we don't want to have multiple nodes in the subgraph that are mapped to the same node in the pattern
        if len(set(assignment)) != len(assignment): 
            continue
        assignment_mapping = dict(zip(list(pattern.nodes), assignment))
        subg = DiGraph()
        subg.add_nodes_from(list(assignment))
        for pattern_edge in list(pattern.edges):
            graph_edge = (assignment_mapping[pattern_edge[0]], assignment_mapping[pattern_edge[1]])
            if graph_edge in graph.edges and _input_node_has_pattern_node_attributes(graph.edges[graph_edge], pattern.edges[edge]):
              subg.add_edge(graph_edge[0], graph_edge[1])
            else: # In that case we don't need to check the rest of the isomorphism
                break
                             
        # We only yield mappings for subgraphs that have the same amount of edges as the pattern - otherwise the subgraph won't be an isomorphism
        if len(subg.edges) == len(pattern.edges):
            yield assignment_mapping
'''


#### Filtering Matches
The only thing we ignored up until now is attribute values. As metioned above, the LHS Parser does not include required attribute values in the pattern graph. Instead, it constructs a boolean function which receives a Match object and checks whether the match it represents has the required attribute values (if there are any). 

This boolean function is further extended by the user of the library, which can pass as parameter a function of the same format, which filteres a list of Match objects based on any condition it wishes to apply. The LHS Parser, in addition to the pattern graph, provides the extended filtering function, that mixes both the user and the parser constraints which were not handled by the matcher so far.

Later in this module, we will use the extended function to filter the list of Match objects we get from the structural and attribute-existence-based matchers. The signature of that function will be as follows:

In [None]:
#| export
FilterFunc = Callable[[Match], bool]

#### Putting It All Togehter
Given our ability to find matches (both structural and in terms of attribute existence) between two graphs, as well as filtering matches according to desired conditions and constraints, we can finally find complete matches of the pattern in our input graph. 

We define one last auxiliary function, which removes duplicated matches based on their mappings:

In [None]:
#| export
def _filter_duplicated_matches(matches: list[Match]) -> Iterator[Match]:
    """Remove duplicates from a list of Matches, based on their mappings. Return an iterator of the matches without duplications.

    Args:
        matches (list[Match]): list of Match objects

    Yields:
        Iterator[list[Match]]: Iterator of the matches without duplications.
    """

    # We avoid using set here to avoid instantiation of all matches - so this is the most efficient way to do it
    seen = set()
    for match in matches:
        if match not in seen:  # O(1) average time complexity for set lookup
            seen.add(match)  # O(1) average time complexity for adding to set
            yield match

In [None]:
#| export
def _find_intersecting_pattern_nodes(exact_match_pattern: DiGraph, collection_pattern: DiGraph) -> set:
    """
    Find the intersecting pattern nodes between the exact match pattern and the collection pattern.

    The intersecting pattern nodes are those that appear in both the exact match pattern 
    (i.e., pattern nodes that aim to match a single, unique input node) and the collection pattern 
    (i.e., pattern nodes that aim to match multiple input nodes).

    Args:
        exact_match_pattern (DiGraph): The pattern graph representing nodes that match exactly one input node.
        collection_pattern (DiGraph): The pattern graph representing nodes that match multiple input nodes.

    Returns:
        set: A set of pattern nodes that are present in both the exact match pattern and the collection pattern.
    """
    intersecting_pattern_nodes = set(exact_match_pattern.nodes) & set(collection_pattern.nodes)
    return intersecting_pattern_nodes


#| export
def _add_collections_to_exact_matches(input_graph: DiGraph, collection_pattern: DiGraph, 
                                      exact_matches: Set[Dict[NodeName, NodeName]], intersecting_pattern_nodes: Set[NodeName]
                                      ) -> Iterator[Dict[NodeName, Set[NodeName]]]:
    """
    Add collection matches to the existing exact matches by finding subgraph matches for collection pattern nodes
    and merging them with the given exact match mapping.

    This function finds matches in the input graph that satisfy both the exact match pattern (pattern nodes 
    that aim to match exactly one input node) and the collection pattern (pattern nodes that aim to match 
    multiple input nodes).

    Args:
        input_graph (DiGraph): The input graph where collection matches are searched.
        collection_pattern (DiGraph): The pattern graph representing nodes that match multiple input nodes.
        exact_matches (Set[Dict[NodeName, NodeName]]): The set of exact matches, where each pattern node 
            is mapped to a single input node.
        intersecting_pattern_nodes (Set[NodeName]): The set of pattern nodes that intersect between the 
            exact match pattern and collection pattern.

    Yields:
        Iterator[Dict[NodeName, Set[NodeName]]]: An iterator over the updated mappings, where each includes both 
        the previous exact match mapping and the newly found collection matches for this exact match.
    """
    input_graph_copy = input_graph.copy()

    # Enrich the exact match mapping with the corresponding collection matches.
    # This involves moving to set semantics for exact matches nodes and adding collection matches.
    for exact_match in exact_matches:
        updated_mapping = {pattern_node: {input_node} for pattern_node, input_node in exact_match.items()}
        non_intersecting_collection_pattern_nodes = set(collection_pattern.nodes) - intersecting_pattern_nodes
        updated_mapping.update({pattern_node: set() for pattern_node in non_intersecting_collection_pattern_nodes})

        # Lock intersecting pattern nodes to their corresponding input node in the exact match
        collection_pattern_copy = collection_pattern.copy()
        for intersecting_pattern_node in intersecting_pattern_nodes:
            collection_pattern_copy.nodes[intersecting_pattern_node]['_id'] = exact_match[intersecting_pattern_node]

        # Find collection matches using the locked pattern
        collection_matches = list(_find_pattern_based_matches(input_graph_copy, collection_pattern_copy))

        # Add matches for collection pattern nodes
        for collection_match in collection_matches:
            for collection_pattern_node, matched_input_nodes in collection_match.items():
                if collection_pattern_node not in intersecting_pattern_nodes: # We already have the exact match for these nodes
                    updated_mapping[collection_pattern_node].add(matched_input_nodes)  # Add the matched input node

        yield updated_mapping

We are now combining everything we saw in order to find the matches of a pattern in our input graph. The matches are returned as a (filtered) list of Match objects:

In [None]:
#| export
def find_matches(input_graph: DiGraph, exact_match_pattern: DiGraph, collections_pattern: DiGraph = None, 
                 condition: FilterFunc = lambda match: True) -> Iterator[Match]:
    """
    Find all matches of a pattern graph in an input graph, satisfying a certain condition.

    This function identifies subgraphs of the input graph that match the exact match pattern 
    and the collections pattern (if provided) based on structure, attributes, and additional conditions 
    specified by the user.

    Args:
        input_graph (DiGraph): A graph where matches are searched.
        exact_match_pattern (DiGraph): The pattern graph representing exact matches (nodes mapped to one input node).
        collections_pattern (DiGraph, optional): A pattern graph representing nodes that can map to multiple input nodes. Defaults to None.
        condition (FilterFunc, optional): A function that receives a Match object and checks if a condition holds. 
                                          Defaults to a function that always returns True.

    Yields:
        Iterator[Match]: Iterator of Match objects, each representing a match of the pattern in the input graph.
    """

    # Find all exact matches (isomorphisms) based on structure and attributes.
    exact_matches = {frozenset(mapping.items()) for _, mapping in _find_pattern_based_matches(input_graph, exact_match_pattern)}

    # If a collections pattern is provided, enrich exact matches by adding matching collections.
    if collections_pattern:
        intersecting_pattern_nodes = _find_intersecting_pattern_nodes(exact_match_pattern, collections_pattern)
        updated_matches_with_collections = _add_collections_to_exact_matches(input_graph, collections_pattern, exact_matches, intersecting_pattern_nodes)
    else:
        #TODO: verify this
        # If no collections, we proceed with exact matches only, we just move it to the correct format of the updated_matches_with_collections
        updated_matches_with_collections = [{pattern_node: {input_node} for pattern_node, input_node in exact_match.items()} 
                                            for exact_match in exact_matches]

    exact_pattern_nodes = set(exact_match_pattern.nodes)

    # TODO: verify this
    pattern_nodes = set(exact_pattern_nodes) | set(collections_pattern.nodes)
    pattern_edges = set(collections_pattern.edges) | set(exact_match_pattern.edges)

    # Generate matches with and without filtering out anonymous nodes.
    # TODO: ask Dean about the anonymous nodes
    matches_with_filtered_versions = [(mapping_to_match(input_graph, pattern_nodes, pattern_edges, mapping, exact_pattern_nodes, filter=False),
                                       mapping_to_match(input_graph, pattern_nodes, pattern_edges, mapping, exact_pattern_nodes))
                                       for mapping in updated_matches_with_collections]
    
    # Filter matches using the provided condition function, based on the unfiltered match.
    filtered_matches = [filtered_match for (unfiltered_match, filtered_match) in matches_with_filtered_versions 
                        if condition(unfiltered_match)]

    # Remove any duplicate matches (duplicates might be introduced due to filtering out anonymous nodes).
    yield from _filter_duplicated_matches(filtered_matches)

### Tests

#### Test Utils

In [None]:
def _assert_match(input_graph: DiGraph, LHS: str, expected: list[dict], condition=lambda x: True, plot=True):
    """Match the pattern in the input graph, and validate that the list of matches
    is equal to the expected list of matches. Also allows plotting the first match instance.

    Args:
        input_graph (DiGraph): A graph
        LHS (str): A pattern string
        expected (list[dict]): The list of expected matches (as mappings from pattern nodes to input graph nodes)
        collection_expected (list[dict]): The list of expected collection matches (as mappings from pattern nodes to corresponding collections)
        plot (bool, optional): If True, plots the first match instance on the input graph (in red). Defaults to False.
    """
    # Convert the pattern to a NetworkX graph + an extended condition function
    pattern, collections_pattern, condition = lhs_to_graph(LHS, condition)
    matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
    assert all([match.mapping in expected for match in matches]) and len(matches) == len(expected)
    if plot and len(matches) > 0:
        match = matches[0]
        mapping = match.mapping
        hl_nodes = {mapping[node] for node in pattern.nodes() if not is_anonymous_node(node)}
        hl_edges = {(mapping[s], mapping[t]) for s, t in pattern.edges() if not (is_anonymous_node(s) or is_anonymous_node(t))}
        print(f"Plotting the match: {mapping}")
        draw_match(input_graph,match)

#### Basic Test Cases

We begin with simple cases, which do not take attributes into account at all. Consider the following quite-generic input graph (in which we will try to find matches for different patterns):

In [None]:
input_graph = _create_graph(
    ['A','B','C','D'], 
    [
        ('A', 'B'),
        ('A', 'C'),
        ('A', 'A'),
        ('C', 'C'),
    ]
)
draw(input_graph)

In the following tests, we try to match different patterns and make sure that the matcher found all of the possible matches. The first match in the list will be highlighted in the input graph (nodes and edges are colored in red), and printed above the plot:

In [None]:
# Match all nodes 1 with self loops (both A and C)
_assert_match(input_graph, "1->1", [{'1': 'A'}, {'1': 'C'}])

Plotting the match: {'1': 'C'}


In [None]:
# Find all pairs of nodes 1, 2 where 1 has a self loop 
_assert_match(input_graph, '1->1, 2', [{'1': 'A', '2': 'B'}, {'1': 'A', '2': 'C'}, {'1': 'A', '2': 'D'}, 
                                    {'1': 'C', '2': 'A'}, {'1': 'C', '2': 'B'}, {'1': 'C', '2': 'D'}])

Plotting the match: {'1': 'C', '2': 'B'}


In [None]:
_assert_match(input_graph, '1->1, 1->2', [{'1': 'A', '2': 'B'}, {'1': 'A', '2': 'C'}]) 

Plotting the match: {'1': 'A', '2': 'C'}


In [None]:
_assert_match(input_graph, '1->1, 2->1', [{'1': 'C', '2': 'A'}])

Plotting the match: {'1': 'C', '2': 'A'}


In [None]:
# Find a circle in the graph + self loop. There is no such match in the graph
_assert_match(input_graph, '1->1, 2->1->2', []) 

In [None]:
# Find five different nodes (the different pattern names enforce it).
# There are only 4 nodes in the input graph, and so there are no matches.
_assert_match(input_graph, '1,2,3,4,5', []) 

#### Advanced Test Cases

Now, we want to check more advanced features of both the parser and the matcher:
* Checking for attributes (existance only)
* Checking for attributes (match the values as well, using the parser-generated condition function)
* Add user conditions
* Anonymous nodes

We will work with a new input graph, which shows the connections between students and the courses they took throughout their degree:
* Each node in the graph is associated with either a student or a course (and has an attribute "type" to denote which is which).
* A student is defined by his/her name. Some students (not all of them) also metion their faculty.
* A course is defined by its name. Some courses mention their associated number of units.
* An edge from a student to a course denotes that the student took the course. It mentions the semester in which the student took the course.
* An edge from a course to another course denotes that the first must be taken before the latter.

The graph looks like this:

In [None]:
input_graph = _create_graph(
    [
        # Names
        ('John', {'type': 'student', 'faculty': 'Biology'}),
        ('Lucy', {'type': 'student', 'fauclty': 'CS'}),
        ('Amy', {'type': 'student'}),
        # Courses
        ('Algo', {'type': 'course', 'units': 3}),
        ('AI', {'type': 'course', 'units': 3}),
        ('NLP', {'type': 'course', 'units': 5}),
        ('DB', {'type': 'course'}),
        ('Bio', {'type': 'course'})
    ], 
    [
        # Students take
        ('John', 'Bio', {'sem': 3}),
        ('Lucy', 'Algo', {'sem': 5}),
        ('Lucy', 'AI', {'sem': 7}),
        ('Amy', 'Algo', {'sem': 5}),
        # KDAM
        ('Algo', 'AI'),
        ('AI', 'NLP'),
    ]
)
draw(input_graph)

We will now run some useful queries by matching patterns in the graph:

In [None]:
# Find all students
_assert_match(input_graph, 's[type="student"]', [{'s': student} for student in ['Amy', 'John', 'Lucy']])

Plotting the match: {'s': 'Amy'}


In [None]:
# Find all courses
_assert_match(input_graph, 'c[type="course"]', [{'c': course} for course in ['DB','NLP','AI','Algo','Bio']])

Plotting the match: {'c': 'DB'}


In [None]:
# Find all students that took some course (all of them)
_assert_match(input_graph, 's[type="student"]->_[type="course"]', [{'s': 'Amy'}, {'s': 'John'}, {'s': 'Lucy'}])

Plotting the match: {'s': 'Amy'}


In [None]:
# Find all students that took some 3-units course
_assert_match(input_graph, 's[type="student"]->_[type="course", units=3]', [{'s': 'Amy'}, {'s': 'Lucy'}])

Plotting the match: {'s': 'Amy'}


In [None]:
# Find all students that took some 3-units course, and the associated courses
_assert_match(input_graph, 's[type="student"]->c[type="course", units=3]', [
    {'s': 'Amy', 'c': 'Algo'}, {'s': 'Lucy', 'c': 'Algo'}, {'s': 'Lucy', 'c': 'AI'}
])

Plotting the match: {'s': 'Amy', 'c': 'Algo'}


In [None]:
# Find all students which took two courses (and the courses) 
_assert_match(input_graph, 's[type="student"]->c1[type="course"], s->c2[type="course"]', [ 
    {'s': 'Lucy', 'c1': 'AI', 'c2': 'Algo'},
    {'s': 'Lucy', 'c1': 'Algo', 'c2': 'AI'}
])

Plotting the match: {'s': 'Lucy', 'c1': 'Algo', 'c2': 'AI'}


In [None]:
# Find all tripltes c1, c2, c3 of courses such that c1 is a prerequisite of c2, and the same for c2 and c3
_assert_match(input_graph, 'c1[type="course"]->c2[type="course"]->c3[type="course"]', [
    {'c1': 'Algo', 'c2': 'AI', 'c3': 'NLP'}
])

Plotting the match: {'c1': 'Algo', 'c2': 'AI', 'c3': 'NLP'}


In [None]:
# Find all students that took a course in their 7th semester
_assert_match(input_graph, 's[type="student"]-[sem=7]->_[type="course"]', [ 
    {'s': 'Lucy'}
])

Plotting the match: {'s': 'Lucy'}


In [None]:
# Find all students that took a course before their 5th semester (use user-defined condition)
_assert_match(input_graph, 's[type="student"]-[sem]->c[type="course"]', [
    {'s': 'John', 'c': 'Bio'}
], condition=lambda match: match['s->c']['sem'] < 5)

Plotting the match: {'s': 'John', 'c': 'Bio'}


#### POC For Large Graphs

In [None]:
# POC: High number of nodes, solved with attribute filtering
num_nodes = 100000

input_graph = _create_graph(
    [n for n in range(num_nodes)] + [(num_nodes+1, {'attr': 15}), (num_nodes+2, {'attr': 15})], 
    [
        (num_nodes+1, num_nodes+2),
        (2,4),
        (3,1)
    ]
)

_assert_match(input_graph, 'X[attr]->Y[attr]', [{'X': num_nodes+1, 'Y': num_nodes+2}], plot=False)

# Export

In [None]:
#|hide
import nbdev; nbdev.nbdev_export()
     

Collections Feature - Matcher Tests

In [None]:
g = _create_graph(
    [('1', {'val': 1}), ('2', {'val': 2}), ('3', {'val': 3}), ('4',{'val': 4}), ('5',{'val': 5})],
    [('1','2'), ('1','3'), ('3','4'), ('3','5')]
)
draw(g)

In [None]:
""" 
Ensuring that in a case where we match all modes, and collect all nodes (with no specific connection to the match), 
there's actually will be a collection of all nodes for each match 
"""

input_graph = g.copy()
pattern, collections_pattern, condition = lhs_to_graph('x;y')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
last_x = []
for match in matches:    
    assert match['x'] not in last_x
    assert match.collection_mapping == {'y': {'4', '5', '1', '3', '2'}}
    last_x.append(match['x'])
assert len(matches) == 5



In [None]:
""" 
Ensuring that:
1.⁠ ⁠We get four matches based on the pattern
2.⁠ ⁠We ensure the collection for each is correct
"""
pattern, collections_pattern, condition = lhs_to_graph('x->y;x->z')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
for match in matches:
    assert match.mapping in [{'x': '1', 'y': '3'}, {'x': '1', 'y': '2'}, {'x': '3', 'y': '5'}, {'x': '3', 'y': '4'}]
    if (match['x']['val'] == 1):
        assert match.collection_mapping == {'z': {'3', '2'}}
    else:
        assert match['x']['val'] == 3
        assert match.collection_mapping == {'z': {'5', '4'}}
assert len(matches) == 4


In [None]:

""" 
Ensuring that:
1.⁠ ⁠We get four matches based on the pattern
2.⁠ ⁠For each match, there is an empty collection for (x,z), z, (z,y)
#TODO: No matches at all! no empty collections
"""

pattern, collections_pattern, condition = lhs_to_graph('x->y;x->z->y')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
#for match in matches:
#        assert match.mapping in [{'x': '1', 'y': '3'}, {'x': '1', 'y': '2'}, {'x': '3', 'y': '5'}, {'x': '3', 'y': '4'}]
#        assert match.collection_mapping == {}
assert len(matches) == 0

In [None]:
""" 
Ensuring that:
1.⁠ ⁠We get two matches based on the pattern
2.⁠ ⁠For each match, the collections are created correctly - they all should be empty
#TODO: No matches at all! no empty collections
"""

pattern, collections_pattern, condition = lhs_to_graph('x->y->z;x->a->b->c')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
#for match in matches:
#        assert match.mapping in [{'x': '1', 'y': '3', 'z': '5'}, {'x': '1', 'y': '3', 'z': '4'}]
#        assert match.collection_mapping == {}
assert len(matches) == 0



In [None]:
""" 
Ensuring that:
1.⁠ ⁠We get two matches based on the pattern
2.⁠ ⁠For each match, the collections are created correctly - they all should be empty
#TODO: No matches at all! no empty collections
"""

pattern, collections_pattern, condition = lhs_to_graph('x->y->z;x->a,a->b,b->c')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
#for match in matches:
#        assert match.mapping in [{'x': '1', 'y': '3', 'z': '5'}, {'x': '1', 'y': '3', 'z': '4'}]
#        assert match.collection_mapping == {}
assert len(matches) == 0


In [None]:
""" 
Ensuring that:
1.⁠ ⁠We get two matches based on the pattern
2.⁠ ⁠For each match, the collections are created correctly - they all should be empty
#TODO: No matches at all! no empty collections
"""

pattern, collections_pattern, condition = lhs_to_graph('x->y->z;x->a,a->b,z->c')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
#for match in matches:
#        assert match.mapping in [{'x': '1', 'y': '3', 'z': '5'}, {'x': '1', 'y': '3', 'z': '4'}]
#        assert match.collection_mapping == {}
assert len(matches) == 0



In [None]:
g = _create_graph(
    [('1', {'val': 1}), ('2', {'val': 2}), ('3', {'val': 3}), ('4',{'val': 4}), ('5',{'val': 5})],
    [('1','2'), ('1','3'), ('3','4'), ('3','5')]
)
draw(g)

In [None]:
pattern, collections_pattern, condition = lhs_to_graph('x[val];x[val]->y[val]->z[val]')
matches = [match for match in find_matches(input_graph, pattern, collections_pattern, condition=condition)]
assert len(matches) == 1
assert matches[0]['x']['val'] == 1
assert matches[0].collection_mapping == {'y': {'3'}, 'z': {'4', '5'}}
print(match.mapping, match.collection_mapping)


{'x': '3', 'y': '5'} {'z': {'4', '5'}}
