[Reference](https://python.plainenglish.io/methodology-build-web-crawler-with-bypassing-anti-crawler-technology-python-beautifulsoup-4-74a27243890f)

# Method 1: Simple Program to get all URLs (Apply to all websites)


In [1]:
import requests
from bs4 import BeautifulSoup

domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
response = requests.get(domain+search)
soup = BeautifulSoup(response.content, 'html.parser')

elm = [x['href'][x['href'].find('https'):] for x in soup.select('a') if '/url?q=' in x['href']]

for e in elm:
    print('Main URL',e)
    response = requests.get(e)
    soup = BeautifulSoup(response.content, 'html.parser')

    url = [x['href'] for x in soup.select('a') if x.has_attr('href') and 'https' in x['href']]
    print('Sub URL',url)

Main URL https://en.wikipedia.org/wiki/Web_scraping&sa=U&ved=2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0&usg=AOvVaw3YYUp-LaqUg8XICkAaxwuR
Sub URL ['https://en.wiktionary.org/wiki/Special:Search/Web_scraping%26sa%3DU%26ved%3D2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0%26usg%3DAOvVaw3YYUp-LaqUg8XICkAaxwuR', 'https://en.wikibooks.org/wiki/Special:Search/Web_scraping%26sa%3DU%26ved%3D2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0%26usg%3DAOvVaw3YYUp-LaqUg8XICkAaxwuR', 'https://en.wikiquote.org/wiki/Special:Search/Web_scraping%26sa%3DU%26ved%3D2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0%26usg%3DAOvVaw3YYUp-LaqUg8XICkAaxwuR', 'https://en.wikisource.org/wiki/Special:Search/Web_scraping%26sa%3DU%26ved%3D2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0%26usg%3DAOvVaw3YYUp-LaqUg8XICkAaxwuR', 'https://en.wikiversity.org/wiki/Special:Search/Web_scraping%26sa%3DU%26ved%3D2ahUKEwi2tbKc58_4AhWHs4QIHU4-AhsQmhN6BAgMEA0%26usg%3DAOvVaw3YYUp-LaqUg8XICkAaxwuR', 'https://commons.wikimedia.org/wiki/Special:Sear

# Method 2: Build a selenium app to crawl all URLs (Apply to all websites)


In [2]:
from selenium import webdriver
import numpy as np
import time

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
driver.get(domain+search)
time.sleep(5)

elm = [x.get_attribute('href') for x in driver.find_elements_by_tag_name('a') if x.get_attribute('href') != None]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    url = np.unique([x.get_attribute('href') for x in driver.find_elements_by_tag_name('a') if x.get_attribute('href') != None and x.get_attribute('href').startswith('https')]).tolist()
    print('Sub URL', url)

driver.quit()

# Method 3: Build a selenium app with BS4 to crawl all URLs (Apply to all websites)

In [3]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
driver.get(domain+search)
time.sleep(5)

DOM = driver.page_source
soup = BeautifulSoup(DOM, 'html.parser')

elm = [x['href'] for x in soup.select('a') if x.has_attr('href') and x['href'].startswith('https')]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    DOM = driver.page_source
    soup = BeautifulSoup(DOM, 'html.parser')
    url = [x['href'] for x in soup.select('a') if x.has_attr('href') and 'https' in x['href']]
    print('Sub URL', url)

driver.quit()