In [1]:
import time
from urllib.parse import quote_plus

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Configure Chrome to send its traffic through the proxy at localhost:9919.
# NOTE(review): assumes a proxy is already listening on that port — confirm before running.
chrome_options = Options()
chrome_options.add_argument("--proxy-server=http://localhost:9919")

# Start the Chrome session; the scraping functions below read this global `driver`.
# No ChromeDriver path is passed here, so locating the driver binary is left to Selenium.
driver = webdriver.Chrome(options=chrome_options)

In [5]:
# phrase search
def get_search_information(search_text:str) -> list:
    """Run a Wikipedia search and return one metadata dict per result.

    The global ``driver`` is navigated to the Special:Search page (up to 500
    results per page, as set by ``limit=500`` in the URL) and each result list
    item is read individually.

    Parameters
    ----------
    search_text : str
        The phrase to search for. It is URL-encoded before being placed in the
        query string, so characters such as ``&``, ``#`` or ``+`` are searched
        for literally instead of corrupting the URL.

    Returns
    -------
    list of dict
        One dict per result with the keys ``'title'``, ``'link'``,
        ``'content'`` and ``'created_at'``. ``'content'`` and ``'created_at'``
        are ``''`` when the result has no such element.
        NOTE(review): ``'created_at'`` is whatever follows the first ``' - '``
        in the result's metadata line; on Wikipedia that looks like the
        last-edit timestamp rather than a creation date — confirm.
    """
    # quote_plus turns spaces into '+' (as before) and percent-encodes the rest
    query = quote_plus(search_text)

    # Get to the link with limit search set to 500
    driver.get(f'https://en.wikipedia.org/w/index.php?title=Special:Search&limit=500&offset=0&ns0=1&search={query}')

    # One <li> per search result. Reading title/snippet/metadata relative to
    # each <li> keeps the fields of a result together even when a result lacks
    # one of them (three independent page-wide lists would drift out of step).
    result_items = driver.find_elements(by = By.XPATH, value = '//*[@id="mw-content-text"]/div[2]/div[4]/ul/li')

    metadata = []
    for item in result_items:
        anchors = item.find_elements(by = By.XPATH, value = './div/div[2]/div[1]/a')
        if not anchors:
            # No title link means there is nothing to identify the result by
            continue
        anchor = anchors[0]

        snippets = item.find_elements(by = By.XPATH, value = './div/div[2]/div[2]')
        content = snippets[0].get_attribute('innerText') if snippets else ''

        details = item.find_elements(by = By.XPATH, value = './div/div[2]/div[3]')
        detail_text = (details[0].get_attribute('innerText') if details else '') or ''
        # The metadata line is expected to look like '<size info> - <timestamp>';
        # fall back to '' instead of raising when the separator is missing.
        detail_parts = detail_text.split(' - ')
        created_at = detail_parts[1] if len(detail_parts) > 1 else ''

        metadata.append(
            {
                'title': anchor.get_attribute('title'),
                'link': anchor.get_attribute('href'),
                'content': content,
                'created_at': created_at
            }
        )

    return metadata

In [6]:
# Search Wikipedia for the phrase 'Indonesia Raya' and keep the per-result dicts in `metadata`
metadata = get_search_information('Indonesia Raya')

In [8]:
def unique_elements(list1, list2):
    """Return the distinct elements of ``list1`` and ``list2`` as one list.

    Elements appear in the order they are first seen (all of ``list1``, then
    anything new from ``list2``), so the result is the same on every run.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        The collections to merge. Any iterable is accepted, not only lists.

    Returns
    -------
    list
        Each distinct element exactly once.

    Raises
    ------
    TypeError
        If an element is unhashable.
    """
    # dict keys are unique and keep insertion order, which de-duplicates
    # without the arbitrary ordering of list(set(...)).
    return list(dict.fromkeys([*list1, *list2]))

In [10]:
def _get_created_at():
    """Read the oldest-revision link text from the history view of the current page.

    Expects the global ``driver`` to be on an article page. Opens the history
    tab, tries the "oldest" sort link and reads the first entry; if that fails
    it falls back to the last entry of the list as shown. Afterwards it tries
    to click back to the article tab.

    Returns
    -------
    str
        The link text found, or ``''`` when the history tab or its entries
        cannot be located.
    """
    try:
        # Click View History
        driver.find_element(by = By.XPATH, value = '//*[@id="ca-history"]/a').click()

        try:
            # Sort by oldest update, then read the first record
            driver.find_element(by = By.XPATH, value = '//*[@id="mw-content-text"]/div[3]/a[1]').click()
            created_at = driver.find_element(by = By.XPATH, value = '//*[@id="pagehistory"]/ul[1]/li/a').text
        except Exception:
            # No sort link (or no first record): use the last record listed.
            # `.text` is taken immediately so a string is stored, not a
            # WebElement that goes stale after the next navigation.
            created_at = driver.find_elements(by = By.XPATH, value = '//*[@id="pagehistory"]/ul/li/a')[-1].text
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still propagates
        return ''

    try:
        # Go back to the page
        driver.find_element(by = By.XPATH, value = '//*[@id="ca-nstab-main"]/a').click()
    except Exception:
        # Best effort only: everything needed from the article was read before
        # the history view was opened, and the next page is loaded by URL.
        pass

    return created_at


def get_page_information(link_list, result_list:list=None, visited_link_list:list=None, max_depth=None):
    """Scrape each page in ``link_list`` and, depth-first, the pages they link to.

    For every page not yet visited, the global ``driver`` loads it (after a
    two-second pause) and its title, body text, categories, reference links
    and history-derived ``created_at`` text are collected. The reference links
    of a page are crawled before the next link of the current list.

    Parameters
    ----------
    link_list : iterable of str
        URLs to start from.
    result_list : list, optional
        List that receives the result dicts. A new list is created when
        omitted; a list passed in is appended to in place.
    visited_link_list : list, optional
        URLs to treat as already scraped. A new list is created when omitted;
        a list passed in is appended to in place as pages are visited.
    max_depth : int, optional
        How many link hops away from ``link_list`` to follow. ``0`` scrapes
        only ``link_list`` itself; ``None`` (default) places no limit.

    Returns
    -------
    list of dict
        ``result_list``, holding one dict per scraped page with the keys
        ``'title'``, ``'content'``, ``'categories'`` (each name followed by
        ``', '``), ``'see_also'`` (list of hrefs) and ``'created_at'``.
        On ``KeyboardInterrupt`` the whole crawl stops and whatever was
        collected so far is returned.
    """
    # Fresh containers per call: mutable defaults would be shared between calls
    if result_list is None:
        result_list = []
    if visited_link_list is None:
        visited_link_list = []

    # Set mirror of visited_link_list for constant-time membership checks
    seen = set(visited_link_list)

    # Explicit stack of link iterators instead of recursion: same depth-first
    # order, but no RecursionError on long link chains, and a single
    # KeyboardInterrupt ends the whole crawl rather than one nesting level.
    exhausted = object()
    pending = [iter(link_list)]

    try:
        while pending:
            link = next(pending[-1], exhausted)
            if link is exhausted:
                pending.pop()
                continue

            # Check if link already in the visited link list
            if link in seen:
                print(f'Skipping {link} as this link is already scrapped')
                continue
            seen.add(link)
            visited_link_list.append(link)

            # Go to the wikipedia
            print(f'Scraping {link}')
            time.sleep(2)
            driver.get(link)

            # Get title
            title = driver.find_element(by = By.XPATH, value = '//*[@id="firstHeading"]').get_attribute("innerText")

            # Get content
            content = driver.find_element(by = By.XPATH, value = '//*[@id="mw-content-text"]/div[1]').text

            # Get categories and reference links while still on the article
            # page, before the history view is opened, so a failed return
            # navigation cannot make them come from the wrong page.
            cat_list = driver.find_elements(by = By.XPATH, value = '//*[@id="mw-normal-catlinks"]/ul/li/a')
            categories = ''.join(cat.text + ', ' for cat in cat_list)

            anchors = driver.find_elements(by = By.XPATH, value = '//*[@id="mw-content-text"]/div[1]/div/ul/li/a')
            hrefs = (anchor.get_attribute('href') for anchor in anchors)
            # Anchors without an href yield None, which cannot be navigated to
            reference_link_list = [href for href in hrefs if href]

            created_at = _get_created_at()

            # Store the result in dictionary
            result_list.append({
                'title': title,
                'content': content,
                'categories': categories,
                'see_also': reference_link_list,
                'created_at': created_at,
            })

            # len(pending) - 1 is the depth of the page just scraped; descend
            # only while that is still below max_depth.
            if max_depth is None or len(pending) <= max_depth:
                pending.append(iter(reference_link_list))

    except KeyboardInterrupt:
        # Return the partial results collected so far
        pass

    return result_list

In [11]:
# Start the crawl from the Proxy_server article. The function also follows the reference
# links it finds on each page, so this can run for a long time; the log below lists the pages visited.
result_list = get_page_information(['https://en.wikipedia.org/wiki/Proxy_server'])

Scraping https://en.wikipedia.org/wiki/Proxy_server
Scraping https://en.wikipedia.org/wiki/Application_firewall
Scraping https://en.wikipedia.org/wiki/ModSecurity
Scraping https://en.wikipedia.org/wiki/Captive_portal
Scraping https://en.wikipedia.org/wiki/Darknet
Scraping https://en.wikipedia.org/wiki/Crypto-anarchism
Scraping https://en.wikipedia.org/wiki/Cryptocurrency
Scraping https://en.wikipedia.org/wiki/Darknet_market
Scraping https://en.wikipedia.org/wiki/Dark_web
