## Example

In [6]:
from crawler import Crawler, Queue

keywords = []
def findKeyword(url, data):
  """Crawler callback: remember `url` when its content mentions "internet".

  Args:
    url: Identifier of the page being inspected (a URL string in this notebook).
    data: Content of that page; presumably the fetched text. May be None,
      in which case the page is skipped.

  Returns:
    None. Matching URLs are appended to the module-level `keywords` list.
  """
  # Case-insensitive substring test; a page without content never matches.
  if data is not None and "internet" in data.lower():
    keywords.append(url)

def begin(soup):
  """Return the "See also" heading that marks where the crawl section starts.

  Args:
    soup: Parsed page exposing a `find` method (presumably a BeautifulSoup
      document - confirm against the crawler).

  Returns:
    Whatever `soup.find` yields for an <h2> whose id is "See_also".
  """
  see_also_heading = soup.find("h2", id="See_also")
  return see_also_heading

def end(sibling):
  """Report whether the element right after `sibling` is an <h2> heading.

  Args:
    sibling: Node exposing a `next_element` attribute.

  Returns:
    True when `sibling.next_element` has a `name` equal to "h2"; False
    otherwise, including when the next element has no `name` at all.
  """
  following = sibling.next_element
  # Objects without a `name` attribute fall back to None, which is not "h2".
  tag_name = getattr(following, "name", None)
  return tag_name == "h2"

# Crawl starting at the Wikipedia "Internet" article with a Queue as the
# pending-links structure, capped at 500 iterations. `begin`/`end` are handed
# to the crawler as section delimiters and `findKeyword` as the `get` callback.
crawler = Crawler()
crawler.search(url="https://en.wikipedia.org/wiki/Internet", structure=Queue(),
               begin=begin, end=end,
               max_iter=500, get=findKeyword)

Searching: 100%|██████████| 500/500 [02:47<00:00,  2.98it/s]


In [7]:
# Show only the last path segment (the article title) of every visited URL.
for visited_url in crawler.getVisited():
  page_title = visited_url.rsplit("/", 1)[-1]
  print(page_title)

Internet
File:Crystal_Clear_app_linneighborhood.svg
Portal:Internet
File:The_Blue_Marble,_AS17-148-22727.jpg
Portal:World
Crowdfunding
Crowdsourcing
Cyberspace
Darknet
Deep_web
Hyphanet
Internet_industry_jargon
Index_of_Internet-related_articles
Internet_metaphors
Internet_video
Internets
Outline_of_the_Internet
Timeline_of_the_history_of_the_Internet
Angel_investor
Assurance_contract
Business_models_for_open-source_software
Comparison_of_crowdfunding_services
Crowdfunding_in_video_games
Cooperative_banking
Equity_crowdfunding
Fan-funded_music
Group_buying
Internet_begging
List_of_highest-funded_crowdfunding_projects
List_of_highest-funded_equity_crowdfunding_projects
Microfinance
Participatory_budgeting
Peer-to-peer_lending
Platform_cooperative
Private_equity
Revenue-based_financing
Seed_money
Threshold_pledge_system
Chronolog
Citizen_science
Clickworkers
Collaborative_innovation_network
Collaborative_mapping
Collective_consciousness
Collective_intelligence
Collective_problem_solving


In [8]:
# Report the crawl statistics; each getter is called right before its line is
# printed, in the same order as the labels appear.
for label, read_stat in (
  ("Iterations:", crawler.maxIterations),
  ("Max depth:", crawler.maxDepth),
):
  print(label, read_stat())

Iterations: 500
Max depth: 3


In [9]:
# List the title segment of every URL collected in `keywords`.
print("This url's contains keyword `internet`")
for matched_url in keywords:
  matched_title = matched_url.rsplit("/", 1)[-1]
  print(matched_title)

This url's contains keyword `internet`
Internet
File:Crystal_Clear_app_linneighborhood.svg
Portal:Internet
File:The_Blue_Marble,_AS17-148-22727.jpg
Portal:World
Crowdfunding
Crowdsourcing
Cyberspace
Darknet
Deep_web
Hyphanet
Internet_industry_jargon
Index_of_Internet-related_articles
Internet_metaphors
Internet_video
Internets
Outline_of_the_Internet
Timeline_of_the_history_of_the_Internet
Angel_investor
Business_models_for_open-source_software
Comparison_of_crowdfunding_services
Equity_crowdfunding
Fan-funded_music
Group_buying
Internet_begging
Microfinance
Participatory_budgeting
Peer-to-peer_lending
Platform_cooperative
Threshold_pledge_system
Citizen_science
Collaborative_innovation_network
Collective_intelligence
Collective_problem_solving
Commons-based_peer_production
Crowd_computing
Crowdcasting
Crowdsourcing_software_development
Distributed_thinking
Distributed_Proofreaders
Flash_mob
Folksonomy
Government_crowdsourcing
List_of_crowdsourcing_projects
Microcredit
Participatory_de

## Compare Stack vs Queue

In [10]:
from crawler import Crawler, Queue, Stack

def begin(soup):
  """Find the start marker for the comparison crawls: the "See also" heading.

  Args:
    soup: Parsed page exposing a `find` method (presumably a BeautifulSoup
      document - confirm against the crawler).

  Returns:
    The result of looking up an <h2> with id "See_also" via `soup.find`.
  """
  start_marker = soup.find("h2", id="See_also")
  return start_marker

def end(sibling):
  """Stop condition: is the element following `sibling` an <h2> heading?

  Args:
    sibling: Node exposing a `next_element` attribute.

  Returns:
    True only if `sibling.next_element` carries a `name` equal to "h2".
  """
  upcoming = sibling.next_element
  # A missing `name` attribute is treated as None, so the result is False.
  upcoming_name = getattr(upcoming, "name", None)
  return upcoming_name == "h2"

In [11]:
keywords_queue = []
def findKeywordQueue(url, data):
  """Callback for the Queue crawl: collect URLs whose content mentions "internet".

  Args:
    url: Identifier of the page being inspected (a URL string in this notebook).
    data: Content of that page; presumably the fetched text. None means the
      page is skipped.

  Returns:
    None. Matches are appended to the module-level `keywords_queue` list.
  """
  page_text = data
  if page_text is None:
    return

  # Lower-casing makes the keyword match case-insensitive.
  mentions_keyword = "internet" in page_text.lower()
  if mentions_keyword:
    keywords_queue.append(url)


# Queue-based crawl for the comparison: same start page and delimiters as the
# Stack run, capped at 1000 iterations, collecting hits via findKeywordQueue.
queue = Crawler()
queue.search(url="https://en.wikipedia.org/wiki/Internet", structure=Queue(),
             begin=begin, end=end,
             max_iter=1000, get=findKeywordQueue)

Searching:  56%|█████▌    | 555/1000 [03:37<04:31,  1.64it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Collective_influence_algorithm&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Collective_influence_algorithm


Searching:  63%|██████▎   | 627/1000 [04:18<03:37,  1.71it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Industrial_internet&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Industrial_internet


Searching:  76%|███████▋  | 764/1000 [05:49<01:34,  2.50it/s]

Failed to fetch http://i.dell.com/sites/doccontent/business/solutions/power/de/Documents/ps2q05-20040179-Saify-OE_de.pdf: 403 Client Error: Forbidden for url: https://i.dell.com/sites/doccontent/business/solutions/power/de/Documents/ps2q05-20040179-Saify-OE_de.pdf


Searching:  77%|███████▋  | 769/1000 [06:11<20:04,  5.21s/it]

Failed to fetch http://www.uobabylon.edu.iq/download/M.S%202013-2014/Operating_System_Concepts,_8th_Edition%5BA4%5D.pdf: 404 Client Error: Not Found for url: https://www.uobabylon.edu.iq:443/download/M.S%202013-2014/Operating_System_Concepts,_8th_Edition%5BA4%5D.pdf


Searching: 100%|██████████| 1000/1000 [08:15<00:00,  2.02it/s]


In [12]:
keywords_stack = []
def findKeywordStack(url, data):
  """Callback for the Stack crawl: collect URLs whose content mentions "internet".

  Args:
    url: Identifier of the page being inspected (a URL string in this notebook).
    data: Content of that page; presumably the fetched text. None means the
      page is skipped.

  Returns:
    None. Matches are appended to the module-level `keywords_stack` list.
  """
  if data is not None:
    # Compare against the lower-cased content so the match ignores case.
    lowered = data.lower()
    if "internet" in lowered:
      keywords_stack.append(url)


# Stack-based crawl for the comparison: same start page and delimiters as the
# Queue run, capped at 1000 iterations, collecting hits via findKeywordStack.
stack = Crawler()
stack.search(url="https://en.wikipedia.org/wiki/Internet", structure=Stack(),
             begin=begin, end=end,
             max_iter=1000, get=findKeywordStack)

Searching:   6%|▋         | 64/1000 [00:15<07:16,  2.15it/s]

Failed to fetch https://stats.wikimedia.org/EN/Sitemap: 404 Client Error: Not Found for url: https://stats.wikimedia.org/EN/Sitemap


Searching:  16%|█▌        | 156/1000 [00:50<05:17,  2.66it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Arbitration_Committee/Discretionary_sanctions/2021_review&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Arbitration_Committee/Discretionary_sanctions/2021_review


Searching:  16%|█▌        | 157/1000 [00:51<07:17,  1.93it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Arbitration_Committee/Discretionary_sanctions/2013_review&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Arbitration_Committee/Discretionary_sanctions/2013_review


Searching:  50%|█████     | 502/1000 [03:03<01:48,  4.60it/s]

Failed to fetch https://www.nli.org.il/en/authorities/987007558512305171: 403 Client Error: Forbidden for url: https://www.nli.org.il/en/authorities/987007558512305171


Searching:  74%|███████▍  | 740/1000 [04:29<01:12,  3.59it/s]

Failed to fetch https://www.nli.org.il/en/authorities/987007530447705171: 403 Client Error: Forbidden for url: https://www.nli.org.il/en/authorities/987007530447705171


Searching:  84%|████████▍ | 840/1000 [05:09<01:20,  1.99it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Index_of_Vietnam-related_articles&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Index_of_Vietnam-related_articles


Searching:  84%|████████▍ | 843/1000 [05:14<03:42,  1.42s/it]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Index_of_Turkmenistan-related_articles&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Index_of_Turkmenistan-related_articles


Searching:  86%|████████▌ | 858/1000 [05:32<01:33,  1.51it/s]

Failed to fetch https://www.nli.org.il/en/authorities/987007566060505171: 403 Client Error: Forbidden for url: https://www.nli.org.il/en/authorities/987007566060505171


Searching:  97%|█████████▋| 968/1000 [06:09<00:14,  2.24it/s]

Failed to fetch https://en.wikipedia.org/w/index.php?title=Hugo_(programming_language)&action=edit&redlink=1: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Hugo_(programming_language)


Searching: 100%|██████████| 1000/1000 [06:23<00:00,  2.61it/s]


In [13]:
# Summarise both crawls: how many visited pages mentioned the keyword and how
# deep each crawl went, then name the structure that collected more hits.
queue_hits = len(keywords_queue)
stack_hits = len(keywords_stack)

print("Links with keyword for Queue:", queue_hits)
print("Max Depth for Queue:", queue.maxDepth())
print("------------------------")
print("Links with keyword for Stack:", stack_hits)
print("Max Depth for Stack:", stack.maxDepth())
print("------------------------")
# A tie used to fall into the "Queue found more links" branch; report it
# explicitly so the verdict is never wrong when the counts are equal.
if queue_hits < stack_hits:
  print("Stack found more links")
elif queue_hits > stack_hits:
  print("Queue found more links")
else:
  print("Queue and Stack found the same number of links")

Links with keyword for Queue: 545
Max Depth for Queue: 3
------------------------
Links with keyword for Stack: 289
Max Depth for Stack: 483
------------------------
Queue found more links
