# Data collection

This part contains code for scraping Wikipedia biographies of physicists and philosophers.

In [52]:
import requests, re
from bs4 import BeautifulSoup
import pandas as pd
import os

In [7]:
BASE_URL = "https://en.wikipedia.org"

In [44]:
def get_philo_list_links():
    """Return the URLs of the "lists of philosophers by period" pages.

    Fetches the Wikipedia category page "Lists of philosophers by period"
    and collects every link found inside its ``.mw-category-group`` lists.

    Returns:
        list[str]: Absolute URLs, built as ``BASE_URL`` + the link's ``href``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://en.wikipedia.org/wiki/Category:Lists_of_philosophers_by_period"

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # raise_for_status() avoids silently parsing an error page into [].
    page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Every entry of the category listing is an <a> nested in a <ul>/<li>
    philo_pages_link = soup.select(".mw-category-group ul li a")

    # hrefs are site-relative, so prefix them with the site root
    return [BASE_URL + link.get('href') for link in philo_pages_link]


In [45]:
def get_philo_links(list_urls):
    """Return the URLs of the philosopher pages linked from the list pages.

    Args:
        list_urls: Iterable of URLs of "list of philosophers" pages, as
            produced by ``get_philo_list_links``.

    Returns:
        list[str]: Absolute URLs (``BASE_URL`` + ``href``) of every selected
        link, in page order; duplicates are not removed.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    urls = []
    for url in list_urls:
        # A timeout prevents one stalled page from blocking the whole crawl;
        # raise_for_status() surfaces HTTP errors instead of parsing them.
        page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Direct <li> links of the top-level <ul> lists:
        #   - a:not(.new) skips red links (pages that do not exist)
        #   - ul:not(:last-of-type) skips the last list, which is "See also"
        pages_link = soup.select(".mw-parser-output > ul:not(:last-of-type) > li > a:not(.new)")

        # hrefs are site-relative, so prefix them with the site root
        urls.extend(BASE_URL + link.get('href') for link in pages_link)

    return urls

In [46]:
def get_physi_list_links():
    """Return the URLs of the second-level "physicists by century" categories.

    Fetches the Wikipedia category "Physicists by century", follows each
    category-tree link found in the last ``.mw-category-group`` of that page,
    and collects the category-tree links found on each of those pages.

    Returns:
        list[str]: Absolute URLs (``BASE_URL`` + ``href``) of the
        second-level links.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://en.wikipedia.org/wiki/Category:Physicists_by_century"
    headers = {'User-Agent': 'Mozilla/5.0'}

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # raise_for_status() avoids silently parsing an error page into [].
    page = requests.get(url=url, headers=headers, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # First level: category-tree entries of the last group on the base page
    pages_link = soup.select(".mw-category-group:last-of-type > ul > li > .CategoryTreeSection > .CategoryTreeItem > a")
    pages_url = [BASE_URL + link.get('href') for link in pages_link]

    # Second level: category-tree entries of every group on each first-level page
    physi_list_urls = []
    for page_url in pages_url:
        page = requests.get(url=page_url, headers=headers, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        pages_link = soup.select(".mw-category-group > ul > li > .CategoryTreeSection > .CategoryTreeItem > a")
        physi_list_urls.extend(BASE_URL + link.get('href') for link in pages_link)

    return physi_list_urls

In [47]:
def get_physi_links(list_urls):
    """Return the URLs of the physicist pages listed on the category pages.

    Args:
        list_urls: Iterable of category-page URLs, as produced by
            ``get_physi_list_links``.

    Returns:
        list[str]: Absolute URLs (``BASE_URL`` + ``href``) of every selected
        link, in page order; duplicates are not removed.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    urls = []
    for url in list_urls:
        # A timeout prevents one stalled page from blocking the whole crawl;
        # raise_for_status() surfaces HTTP errors instead of parsing them.
        page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        # Direct <li> links of the category listing; a:not(.new) skips red links.
        # NOTE(review): only the fetched page is parsed. If a category listing
        # spans several pages ("next page" link), later pages are not visited
        # -- confirm whether that matters for the dataset.
        pages_link = soup.select(".mw-category > .mw-category-group > ul > li > a:not(.new)")

        # hrefs are site-relative, so prefix them with the site root
        urls.extend(BASE_URL + link.get('href') for link in pages_link)

    return urls

In [63]:
def get_page_title_content(url):
    """Return the title and the cleaned paragraph text of a Wikipedia page.

    The text is built from the ``<p>`` elements that are direct children of
    ``#mw-content-text > div.mw-parser-output``. Paragraphs are joined with a
    space, newlines are dropped, non-breaking spaces become regular spaces,
    ``[digits]`` citation markers are removed and ``{...}`` groups (style /
    markup residue) are stripped.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        tuple[str, str]: ``(title, text)``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page has no ``#firstHeading`` element.
    """
    # A timeout prevents one stalled page from blocking the whole crawl;
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    page = requests.get(url=url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get the title of the page
    title = soup.select("#firstHeading")[0].text

    # Get the top-level paragraphs of the article body
    content = soup.select("#mw-content-text > div.mw-parser-output > p")
    text = [p.text for p in content]

    # Join with space, remove \n and \xa0, remove [digits]
    text = " ".join(text).replace("\n", "").replace("\xa0", " ").strip()
    text = re.sub(r"\[\d+\]", "", text)

    # Remove {style settings}. The text is a single line at this point, so a
    # greedy r"\{.*\}" would delete everything between the first "{" and the
    # last "}" of the article. Instead, strip innermost groups repeatedly,
    # which also copes with nested braces.
    brace_group = re.compile(r"\{[^{}]*\}")
    previous = None
    while previous != text:
        previous = text
        text = brace_group.sub("", text)

    return title, text

In [66]:
def write_txt(title, category, text):
    """Write ``text`` to ``./<category>/<Title>_<category>.txt``.

    The file name is derived from ``title``: it is title-cased, spaces are
    removed, a parenthesised part (e.g. a disambiguation suffix) is dropped
    and path separators are replaced by ``_``.

    Args:
        title: Page title used to build the file name.
        category: Label used both as directory name and file-name suffix.
        text: Content written to the file (UTF-8).
    """
    # Format the title
    title = title.title().replace(" ", "")
    title = re.sub(r"\(.*\)", "", title)

    # A "/" or "\" in the title would be read as a sub-directory that does
    # not exist and make open() fail, so neutralise path separators.
    title = re.sub(r"[\\/]", "_", title)

    # Create the directory if it doesn't exist
    os.makedirs(f"./{category}", exist_ok=True)

    with open(f"./{category}/{title}_{category}.txt", "w", encoding="utf8") as f:
        f.write(text)

In [65]:
def get_texts(list_urls, category):
    """
    Write a txt file for each url.
    Return a set of texts.
    """
    texts = set()
    for url in list_urls:
        # Get title and page content
        title, text = get_page_title_content(url)

        # Add to the set
        texts.add(text)

        # Write a txt file
        write_txt(title, category, text)

    return texts

In [61]:
# Scraping: run the full pipeline (performs network requests and writes files)

# Philosophers: list pages -> individual page URLs
philo_list_urls = get_philo_list_links()
philo_urls = get_philo_links(philo_list_urls)

# Physicists: category pages -> individual page URLs
physi_list_urls = get_physi_list_links()
physi_urls = get_physi_links(physi_list_urls)

# Download every page, write one txt file per page into ./<category>/
# and keep the set of texts for each category
philo_texts = get_texts(philo_urls, "Philosopher")
physi_texts = get_texts(physi_urls, "Physicist")

In [None]:
# Pair every scraped text with its category label and collect the pairs
# in a two-column dataframe (philosophers first, then physicists)
labelled_sets = (
    (philo_texts, "Philosopher"),
    (physi_texts, "Physicist"),
)
data = [(text, label) for texts, label in labelled_sets for text in texts]

df = pd.DataFrame(data=data, columns=["text", "category"])