# Data collection

This part contains code for scraping Wikipedia biographies of scientists and philosophers.

In [7]:
import requests
from bs4 import BeautifulSoup

In [8]:
BASE_URL = "https://en.wikipedia.org"

In [6]:
def get_philo_list_links():
    """Returns a list of links which are list of philosophers by period."""
    url = "https://en.wikipedia.org/wiki/Category:Lists_of_philosophers_by_period"

    # Get base page
    page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get all links in the ul
    pages_link = soup.select(".mw-category-group ul li a")

    # Extract the urls
    pages_url = [BASE_URL + link.get('href') for link in pages_link]

    return pages_url

philo_list_urls = get_philo_list_links()
philo_list_urls

['https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_centuries_BC',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_1st_through_10th_centuries',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_11th_through_14th_centuries',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_15th_and_16th_centuries',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_17th_century',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_18th_century',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_19th_century',
 'https://en.wikipedia.org/wiki/List_of_philosophers_born_in_the_20th_century']

In [13]:
def get_philo_links():
    """Return a list of links which are about philosophers."""
    urls = []
    for url in philo_list_urls:
        # Get list page
        page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(page.content, 'html.parser')

        # Get all links in the ul
        # Exclude a tags with class new to avoid red links
        # Exclude last ul which is see also
        pages_link = soup.select(".mw-parser-output > ul:not(:last-of-type) > li > a:not(.new)")

        # Extract the urls
        pages_url = [BASE_URL + link.get('href') for link in pages_link]
        urls.extend(pages_url)

    return urls

philo_urls = get_philo_links()
philo_urls


['https://en.wikipedia.org/wiki/Aenesidemus',
 'https://en.wikipedia.org/wiki/Agastya',
 'https://en.wikipedia.org/wiki/Alcibiades',
 'https://en.wikipedia.org/wiki/Alcmaeon_of_Croton',
 'https://en.wikipedia.org/wiki/Anacharsis',
 'https://en.wikipedia.org/wiki/Anaxagoras',
 'https://en.wikipedia.org/wiki/Anaxarchus',
 'https://en.wikipedia.org/wiki/Floruit',
 'https://en.wikipedia.org/wiki/Anaxilaus',
 'https://en.wikipedia.org/wiki/Anaximander',
 'https://en.wikipedia.org/wiki/Anaximenes_of_Miletus',
 'https://en.wikipedia.org/wiki/Andronicus_of_Rhodes',
 'https://en.wikipedia.org/wiki/Angiras_(sage)',
 'https://en.wikipedia.org/wiki/Anniceris',
 'https://en.wikipedia.org/wiki/Ny%C4%81ya_S%C5%ABtras',
 'https://en.wikipedia.org/wiki/Antiochus_of_Ascalon',
 'https://en.wikipedia.org/wiki/Antiphon_(person)',
 'https://en.wikipedia.org/wiki/Antisthenes',
 'https://en.wikipedia.org/wiki/Arcesilaus',
 'https://en.wikipedia.org/wiki/Archimedes',
 'https://en.wikipedia.org/wiki/Archytas',
