In [1]:
# There is a bug I cannot explain at the moment.
# To reproduce it, uncomment the triple-quoted debugging blocks
# and remove the line that checks that a page is valid.

import pywikibot
from util import *
from tqdm import tqdm

In [2]:
# Store things as pairs: (stack, dict), where stack is
# the stack associated with a DFS, and dict is
# a dictionary sending each neighbor of a previous element of the stack
# to its parent that showed up earliest in the list.
# A possible future update is to also include the distance
# from the origin in the dict, so the image of the dict
# would be an ordered pair (parent, distance).
# Another potential improvement is to expand the depth-first search
# on the side whose dictionary is smaller, rather than just alternating.

"""
page3 = PageNode(pywikibot.Page(site,"Module talk:Authority control/testcases"))
"""

'\npage3 = PageNode(pywikibot.Page(site,"Module talk:Authority control/testcases"))\n'

In [3]:
# Updates a pair and returns 1 if it finds a path,
# 0 if it does not find a path but updates successfully,
# and -1 if there is nowhere to expand.
# By default, the first pair is the one updated.
# careful_bookkeeping means that the pairs will be updated correctly
# even if a path is found.
def expand_one_node(pair1, pair2, expand_first = True, pair1_to_pair2 = True, careful_bookkeeping=True):
    if not expand_first:
        return expand_one_node(pair2, pair1, pair1_to_pair2 = not pair1_to_pair2, careful_bookkeeping = careful_bookkeeping)

    try:
        item_to_expand, path = pair1[0].pop()

        if pair1_to_pair2:
            potential_expansions = item_to_expand.get_successors()
        else:
            potential_expansions = item_to_expand.get_predecessors()
        # good potential expansions are those that have not been seen before
        # in either dictionary.  It is not completely obvious, but not hard to show
        # that if we get stuck by avoiding words in this way, then there is no path
        # from start to end.
        good_potential_expansions = []
        for item in potential_expansions:
            """
            if str(item) == str(page3):
                the_item = item.page
                the_page3 = page3.page
                print(["LOOK HERE!"]*1000)
                print("item:",item)
                print("title:",the_item.title())
                print("site:",the_item.site)
                print("namespace:",the_item.namespace())
                print("hash:",hash(item))
                print("page3:",page3)
                print("title:",the_page3.title())
                print("site:",the_page3.site)
                print("namespace:",the_page3.namespace())
                print("hash:",hash(page3))
            
            # On cmd, printing page.namespace() gives you:
            # Namespace(id=0, custom_name='', canonical_name='', aliases=[], case='first-letter', content=True, nonincludable=False, subpages=False)
            # But I don't see how to get that here...
            """
            
            if valid_page(item.title()):
                if not careful_bookkeeping:
                    if item in pair2[1].keys():
                        # if we have seen this neighbor when extending the other pair
                        pair1[0].append((item, path + [item]))
                        return 1
                    elif item not in pair1[1].keys():
                        good_potential_expansions.append((item, path + [item]))
                        pair1[1][item] = item_to_expand
                else:
                    pass #TODO

        # The dictionary has already been updated during the loop
        # so we only have to update the stack now
        pair1[0].extend(good_potential_expansions)
        
    except:
        # If the stack is empty
        return -1


In [4]:
# use_first means that the path works by
# travelling down the stack of pair1
# followed by going backwards down pair2;
# not use_first means the opposite.
def generate_path(pair1,pair2,use_first):
    if use_first:
        endpoint, path = pair1[0].pop()
        dict = pair2[1]
        while endpoint != None:
            endpoint = dict[endpoint]
            path.append(endpoint)
        path = path[:-1]
    else:
        # This is equivalent to the following:
        # path = generate_path(pair2,pair1,not use_first)
        # path.reverse()
        endpoint, path = pair2[0].pop()
        dict = pair1[1]
        while endpoint != None:
            endpoint = dict[endpoint]
            path.append(endpoint)
        path = path[:-1]
        path.reverse()
    return path

In [5]:
def set_use_first(previous_use_first, pair1, pair2):
    return not previous_use_first

In [6]:
# If start == end then this returns
# a cycle in the graph, not the empty path.
#
# if return_expansions == False, the output is the path as a list
# if return_expansions == True, the output is a tuple (path, num_expansions)
# where num_epansions is the number of nodes expanded.
# In either case, the number of nodes expanded is counted;
# return_expansions only affects whether it is returned
def bidirectional_DFS_graph(start, end, return_expansions=False):
    stack1 = [(start,[start])]
    stack2 = [(end,[end])]
    dict1 = {start:None}
    dict2 = {end:None}
    num_expansions = 0

    use_first = True
    while True:
        """
        print(page3 in dict1.keys())
        print(hash(page3) in [hash(word) for word in dict1.keys()])
        print(str(page3) in [str(word) for word in dict1.keys()])
        print(page3 in stack1[-1][1])
        print(str(page3) in [str(word) for word in stack1[-1][1]])
        print("STACKS")
        for word in stack1[-1][1]:
            print(word)
        #print("DICTS")
        #for word in dict1.keys():
            #print(word)
        """
            
        result = expand_one_node((stack1, dict1), (stack2, dict2), expand_first = use_first, careful_bookkeeping = False)
        num_expansions += 1
        if result == 1:
            path = generate_path((stack1,dict1),(stack2,dict2),use_first)
            if return_expansions:
                return (path,num_expansions)
            else:
                return path
        elif result == -1:
            if return_expansions:
                return (False,num_expansions)
            else:
                return (False,num_expansions)
        
        use_first = set_use_first(use_first, (stack1, dict1), (stack2, dict2))

In [7]:
class PageNode:
    def __init__(self,page):
        self.page = page
        
    def __str__(self):
        return str(self.page)
    
    def __repr__(self):
        return repr(self.page)
    
    def __eq__(self, other):
        try:
            return self.page == other.page
        except:
            # If other does not have a page attribute
            # (Main intended use case: If other == None)
            return False
    
    def __hash__(self):
        return hash(self.page)
    
    def title(self):
        return self.page.title()
        
    def get_successors(self):
        return [PageNode(page) for page in list(self.page.linkedPages())]
    
    def get_predecessors(self):
        return [PageNode(page) for page in list(self.page.backlinks())]

In [8]:
# Module-level site handle ("en" code, "wikipedia" family);
# PageNode_from_title passes it to pywikibot.Page for every title lookup.
site = pywikibot.Site("en", "wikipedia")

In [9]:
def PageNode_from_title(title):
    return PageNode(pywikibot.Page(site,title))

In [10]:
def bidirectional_DFS(start, end):
    return bidirectional_DFS_graph(PageNode_from_title(start),PageNode_from_title(end), return_expansions = True)

In [11]:
# Benchmark inputs for run_search, which unpacks each element as (start, goal).
# NOTE(review): get_samples is not defined in this notebook; presumably it
# comes from the `from util import *` import, with 100 being the sample count — confirm.
examples = get_samples(100)

In [12]:
def run_search(search_method, search_tuples, **kwargs):
    results = {}
    for start, goal in tqdm(search_tuples):
        results[(start, goal)] = search_method(start, goal, **kwargs)
    return results

In [13]:
# Run the bidirectional search on every example pair; the resulting dict is
# keyed by (start, goal).  The recorded run below took about 26 minutes.
output = run_search(bidirectional_DFS,examples)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [26:19<00:00, 10.38s/it]


In [14]:
output

{('Transhumanism',
  'Saturn'): ([Page('Transhumanism'),
   Page('Space colonization'),
   Page('Saturn')], 2),
 ('Vacuum',
  'Jim Henson'): ([Page('Vacuum'),
   Page('Windscreen wipers'),
   Page('Windscreen wiper'),
   Page('Wing mirror'),
   Page('United States'),
   Page('George Lucas'),
   Page('Jim Henson')], 11),
 ('Earth',
  'Chimpanzee'): ([Page('Earth'),
   Page('Human evolution'),
   Page('Chimpanzee')], 2),
 ('Renaissance',
  'Dancing with the Stars'): ([Page('Renaissance'),
   Page('Íñigo López de Mendoza, marqués de Santillana'),
   Page('Íñigo López de Mendoza, 1st Marquis of Santillana'),
   Page('Íñigo López de Mendoza, 1st count of Tendilla'),
   Page('Íñigo López de Mendoza y Quiñones'),
   Page('Toledo, Spain'),
   Page('Úbeda'),
   Page('Vázquez de Molina Square'),
   Page('Temple'),
   Page('Zoroastrian'),
   Page('Zoroastrianism'),
   Page('Ẓāhirī'),
   Page('Zubair Ali Zai'),
   Page('Yasir Qadhi'),
   Page('Abu Ammaar Yasir Qadhi'),
   Page('Yale University'),
