In [None]:
from bs4 import BeautifulSoup
import requests
from collections import deque

def valid_wiki_link(link):
    media_file_extensions = [".ogg","jpg"]
    if( 
        ( link == None) or
        (len(link) < 6) or
        (link[:6] != "/wiki/") or
        ( ":" in link ) or
        (link[6:6+5] == "File:") or
        (link[6:6+5] == "Help:") or
        (link[6:6+5] == "Talk:") or
        (link[6:6+7] == "Portal:") or
        (link[6:6+8] == "Special:") or
        (link[6:6+9] == "Category:") or
        (link[6:6+9] == "Template:") or
        (link[6:6+10] == "Wikipedia:") or
        (link[-4:] in media_file_extensions) or
        ( "/w/index.php" in link )
    ):
        return False
    else:
        return True

'''
    DUM BFS
'''
def crawl_to_philosophy_bfs(starting_wiki_link):
    has_been_queued = {}
    queue=deque([starting_wiki_link])
    
    pages_visited = 0 
    while ( (queue[0].lower() != "/wiki/philosophy" ) and (len(queue)>0) ):
        pages_visited += 1
        page_url = "https://en.wikipedia.org" + queue.popleft()
        html_page = requests.get( page_url ).content
        soup = BeautifulSoup( html_page, 'html.parser')
        out_links =  [ link.get("href") for link in soup.find_all('a') if valid_wiki_link(link.get("href")) ]
        for link in out_links:
            if link not in has_been_queued:
                queue.append( link )
                has_been_queued[link] = True
        
        if ( pages_visited % 50 == 0 ):
            print( "Number of pages visited: {}, {}".format( pages_visited, len(queue) ) )
    
    if ((len(queue)>0) and (queue[0]=="/wiki/philosophy")):
        print("Traversed from page '{}' to Philosophy after taking {} links.".format( starting_page_tile, pages_visited ) )
    else:
        print("We reached a dead end page, congratulations. You proved the hypothesis false!")
        
    return pages_visited

def get_valid_wiki_out_links(wiki_link):
        page_url = "https://en.wikipedia.org" + wiki_link
        html_page = requests.get( page_url ).content
        soup = BeautifulSoup( html_page, 'html.parser')
        out_links = [ link.get("href").lower() for link in soup.find_all('a') if valid_wiki_link(link.get("href")) ][::-1]
        return out_links
    
'''
    What I would like this thing to do:
    * get all the links on the page with links at the top of the page having higher priority.
    * travel through the highest priority link always until we reach philosophy
'''

def crawl_to_philosophy(starting_wiki_link, filter_out=set()):
    visited = {}
    has_been_queued = { starting_wiki_link.lower(): True }
    queue=[[starting_wiki_link]]
    
    reaching_goal_possible = (len(queue)>0)
    goal_reached =  "/wiki/philosophy" in has_been_queued #(len(queue[-1])>0) and (queue[-1][-1].lower() == "/wiki/philosophy")
    
    while ( not(goal_reached) and reaching_goal_possible ):
        wiki_link = queue[-1].pop()
        if ((wiki_link.lower() in visited) or (wiki_link.lower() in filter_out)):
            continue
        else:
            visited[wiki_link.lower()] = True
        
        print("{} https://en.wikipedia.org{}".format(len(visited), wiki_link))
        
        _out_links = get_valid_wiki_out_links(wiki_link)[::-1] #! reversing bad for performance, but enforces intented priority
        out_links = []
        for link in _out_links:
            if link not in has_been_queued:
                out_links.append( link )
                has_been_queued[ link.lower() ] = True
        
        if (len(queue[-1]) == 0):
            queue.pop()
        if (len(out_links) > 0): 
            queue.append( out_links )
        
        reaching_goal_possible = (len(queue)>0)
        goal_reached =  "/wiki/philosophy" in has_been_queued #(len(queue[-1])>0) and (queue[-1][-1].lower() != "/wiki/philosophy")
    
    if ("/wiki/philosophy" in has_been_queued):
        print("\nPhilosphy has been found starting from page '{}' after expanding {} links.".format( starting_wiki_link, len(visited) ) )
    else:
        print("We reached a dead end page, congratulations. You proved the hypothesis false!")
        
    return len(has_been_queued)

In [7]:
# It works, could 
filter_out=set(['/wiki/main_page'])

crawl_to_philosophy("/wiki/Albert_Hoffman", filter_out=filter_out) 

1 /wiki/Albert_Hoffman
2 /wiki/abby_hoffman
3 /wiki/colleen_quigley
4 /wiki/case_sensitivity
5 /wiki/oracle_corporation
6 /wiki/geographic_coordinate_system
7 /wiki/45%c3%9790_points
8 /wiki/the_sydney_morning_herald
9 /wiki/bbc_news
10 /wiki/bbc_newsline
11 /wiki/wales_today
12 /wiki/the_nine_(bbc_scotland)
13 /wiki/an_l%c3%a0
14 /wiki/reporting_scotland
15 /wiki/spotlight_(bbc_news)
16 /wiki/south_today
17 /wiki/south_east_today
18 /wiki/bbc_london_news
19 /wiki/bbc_points_west
20 /wiki/bbc_look_east
21 /wiki/midlands_today
22 /wiki/east_midlands_today
23 /wiki/bbc_look_north_(east_yorkshire_and_lincolnshire)
24 /wiki/bbc_look_north_(yorkshire_and_north_midlands)
25 /wiki/bbc_north_west_tonight
26 /wiki/bbc_look_north_(north_east_and_cumbria)
27 /wiki/white_house_correspondents%27_association
28 /wiki/financial_times
29 /wiki/notoav
30 /wiki/yes!_to_fairer_votes
31 /wiki/ulster_unionist_party
32 /wiki/traditional_unionist_voice
33 /wiki/socialist_party_(england_and_wales)
34 /wiki/re

267 /wiki/hampton_catlin
268 /wiki/esra%27a_al_shafei
269 /wiki/rosie_stephenson-goodknight
270 /wiki/raju_narisetti
271 /wiki/rebecca_mackinnon
272 /wiki/dariusz_jemielniak
273 /wiki/american_library_association
274 /wiki/cornell_university_press
275 /wiki/mit_press
276 /wiki/mitx
277 /wiki/mit_opencourseware
278 /wiki/mit_app_inventor
279 /wiki/tech_dinghy
280 /wiki/mit_engineers
281 /wiki/history_of_the_massachusetts_institute_of_technology
282 /wiki/l._rafael_reif
283 /wiki/howard_wesley_johnson
284 /wiki/julius_adams_stratton
285 /wiki/james_rhyne_killian
286 /wiki/samuel_wesley_stratton
287 /wiki/ernest_fox_nichols
288 /wiki/elihu_thomson
289 /wiki/richard_cockburn_maclaurin
290 /wiki/arthur_amos_noyes
291 /wiki/henry_smith_pritchett
292 /wiki/james_crafts
293 /wiki/harvard_division_of_engineering_and_applied_sciences
294 /wiki/ol_(identifier)
295 /wiki/social_distancing
296 /wiki/covid-19_pandemic_in_the_solomon_islands#covid_chart
297 /wiki/covid-19_pandemic_in_papua_new_guinea

435 /wiki/covid-19_pandemic_in_libya#covid_chart
436 /wiki/covid-19_pandemic_in_kenya#covid_chart
437 /wiki/covid-19_pandemic_in_ivory_coast#covid_chart
438 /wiki/covid-19_pandemic_in_ghana#covid_chart
439 /wiki/covid-19_pandemic_in_ethiopia#covid_chart
440 /wiki/covid-19_pandemic_in_eswatini#covid_chart
441 /wiki/covid-19_pandemic_in_egypt#covid_chart
442 /wiki/covid-19_pandemic_in_the_comoros#covid_chart
443 /wiki/covid-19_pandemic_in_cameroon#covid_chart
444 /wiki/covid-19_pandemic_in_burundi#covid_chart
445 /wiki/covid-19_pandemic_in_burkina_faso#covid_chart
446 /wiki/covid-19_pandemic_in_botswana#covid_chart
447 /wiki/covid-19_pandemic_in_benin#covid_chart
448 /wiki/covid-19_pandemic_in_angola#covid_chart
449 /wiki/covid-19_pandemic_in_algeria#covid_chart
450 /wiki/covid-19_lockdowns#table_of_pandemic_lockdowns
451 /wiki/list_of_countries_by_hospital_beds#numbers
452 /wiki/list_of_countries_by_hospital_beds
453 /wiki/list_of_hospitals_in_south_georgia_and_the_south_sandwich_island

KeyboardInterrupt: 

In [None]:
class PathFinderGame:
    def __init__(self):
        self.starting_page_title = None
        return None
    
    '''
    !!
    '''
    def start(self):
        while ( self.starting_link == None ):
            proposed_starting_page_title = input("Enter the title of the wiki page you'd like to start from: ")
            
            if ( self.verify_wiki_page_title( proposed_starting_link ) == True ):
                self.starting_page_title = proposed_starting_link
            else:
                print("There is no wiki page that exists with that title")
        
        # dfs stuff
        
        return None
    
        
    '''
        bool : 
        
        Given a page title as a String,
        Checks whether a Wikipedia Page with that title exists  
    '''
    def verify_wiki_page_title(self, title):
        return True
        