# Scrapers

## Imports & Other Setup

In [None]:
# The kernel may be started from inside the notebook's own "utils" folder; when
# that happens, step up one level so the remaining cells run from the repo directory.
import os

_startup_directory = os.getcwd()
if _startup_directory.endswith("utils"):
    os.chdir("..")

In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict
import glob
import html2text
import random
import re
import requests
import undetected_chromedriver.v2 as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, NoSuchElementException
import shutil
import time
from unidecode import unidecode

from utils.profanity import ProfanityFilter

In [None]:
### Helper variables/functions setup

# Directory that holds this notebook: the first glob hit for "scrapers.ipynb" below
# the current working directory, or "./scrapers.ipynb" when the glob finds nothing.
# NOTE(review): iglob is called without recursive=True, so "**" presumably spans only
# one directory level here - confirm that is deep enough for the repo layout.
notebook_path = os.path.dirname(os.path.realpath(next(
    glob.iglob(os.path.join(os.getcwd(), "**", "scrapers.ipynb")), "./scrapers.ipynb"
)))

# Output folders located one level above the notebook directory; created if missing
data_path = os.path.realpath(os.path.join(notebook_path, "..", "data"))
os.makedirs(data_path, exist_ok=True)
corpus_path = os.path.realpath(os.path.join(notebook_path, "..", "corpus"))
os.makedirs(corpus_path, exist_ok=True)

# Profanity filter used by prepare_scraped_text, configured with its censor characters
profanity_filter = ProfanityFilter(censor_pool="@#%^*")
# NOTE(review): assuming censor_pool is the string passed above, re.escape turns it
# into one literal pattern, so this matches the whole sequence "@#%^*" rather than any
# single pool character - confirm whether a character class was intended.
profanity_pool_regex = re.compile(re.escape(profanity_filter.censor_pool), flags=re.IGNORECASE)
# Runs of carriage returns / newlines (callers replace each run with a single "\n")
whitespace_shrink_regex = re.compile(r"[\r\n]+")
# Punctuation characters stripped from page titles when extracting character names
punctuation_regex = re.compile(r"""[\.,!?:;"'\(\)\[\]]""")
# URLs with an http(s) scheme and/or a leading "www.", matched case-insensitively
url_regex = re.compile(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", flags=re.IGNORECASE)

def prepare_scraped_text(text):
    """Normalize text retrieved by a scraper and censor it for model usage.

    Steps, in order: strip leading/trailing newlines, pass the text through
    unidecode, collapse every run of CR/LF characters into one newline, remove
    matches of profanity_pool_regex, and finally run the profanity filter.
    Returns whatever profanity_filter.censor returns for the cleaned text.
    """
    transliterated = unidecode(text.strip("\n"))
    single_newlines = whitespace_shrink_regex.sub("\n", transliterated)
    # Remove pre-existing censor-pool matches before the filter adds its own
    pool_free = profanity_pool_regex.sub("", single_newlines)
    return profanity_filter.censor(pool_free)

def backup_file(filepath):
    """Move *filepath* into a "bak" folder next to it, keeping the file name.

    The original path no longer exists afterwards, so the caller can write a
    fresh file in its place. A previous backup with the same name is overwritten.
    """
    file_directory = os.path.dirname(os.path.realpath(filepath))
    backup_directory = os.path.join(file_directory, "bak")
    os.makedirs(backup_directory, exist_ok=True)
    backup_filepath = os.path.join(backup_directory, os.path.basename(filepath))
    # os.replace overwrites an existing backup on every platform, whereas
    # os.rename raises FileExistsError on Windows when the target already exists.
    os.replace(filepath, backup_filepath)

def cleanup_scraped_text(filepath):
    """Clean a scraped corpus file in place, backing up the original first.

    The file is read, run through a fixed sequence of substitutions, moved to
    the "bak" folder via backup_file, and the cleaned text is then written to
    the original path.
    """
    with open(filepath) as source:
        cleaned = source.read()

    # Drop URLs and tildes, trim every line, then squeeze blank lines and space runs
    cleaned = url_regex.sub("", cleaned).replace("~", "")
    cleaned = "\n".join(map(str.strip, cleaned.split("\n")))
    cleaned = whitespace_shrink_regex.sub("\n", cleaned)
    cleaned = re.sub(r" +", " ", cleaned)

    # Tighten whitespace sitting directly inside quotation marks, line by line
    quote_fixes = (
        (r'^"\s+', '"'),
        (r'\s+"$', '"'),
        (r'(^[^"]+"[^"]+)\s+"', r'\1"'),
        (r'"\s+(?=[^"]+"[^"]+$)', '"'),
    )
    for pattern, replacement in quote_fixes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.MULTILINE)

    # Undo backslash escapes for ".", "-" and "\"
    cleaned = cleaned.replace("\\.", ".").replace("\\-", "-").replace("\\\\", "\\")

    # Replacing names with pronouns can leave runs of consecutive pronouns;
    # each run is collapsed to its last pronoun.
    cleaned = re.sub(
        r'(\b(?:he|him|his|she|her|hers|they|them|their|theirs)\b\s*)+',
        r"\1",
        cleaned,
        flags=re.IGNORECASE,
    )

    backup_file(filepath)

    with open(filepath, "w") as destination:
        destination.write(cleaned)

# Text whose presence in a page source means the site's blocker page is showing
blocker_regex = re.compile("DDoS protection", flags=re.IGNORECASE)

# Shared HTML-to-text converter: no links, no images, no emphasis/strong markers,
# and a body width large enough that lines are effectively never wrapped.
html2text_handler = html2text.HTML2Text()
html2text_handler.ignore_links = html2text_handler.ignore_images = True
html2text_handler.emphasis_mark = html2text_handler.strong_mark = ""
html2text_handler.body_width = 2147483647

In [None]:
### Webdriver Setup ###

# Scratch folder next to the notebook for downloaded helper files
scraperfiles_path = os.path.join(notebook_path, "scraper-files")
os.makedirs(scraperfiles_path, exist_ok=True)

# Get UBlock Origin and add to Chrome to speed up page load times
ublock_origin_directory = os.path.join(scraperfiles_path, "ublock_origin")
if not os.path.exists(ublock_origin_directory):
    ublock_origin_crx_file = os.path.join(scraperfiles_path, "cjpalhdlnbpafiamejdnhcphjbkeiagm.zip")
    if not os.path.exists(ublock_origin_crx_file):
        chrome_version = "100.0.4896.127"
        extension_id = "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        ublock_download_url = f"https://clients2.google.com/service/update2/crx?response=redirect&prodversion={chrome_version}&acceptformat=crx2,crx3&x=id%3D{extension_id}%26uc"
        # The extension is optional: bound the wait and treat any network failure
        # as "not downloaded" instead of aborting the whole setup cell.
        try:
            extension_response = requests.get(ublock_download_url, timeout=60)
        except requests.RequestException as download_error:
            print(f"Could not download UBlock Origin, continuing without it: {download_error}")
        else:
            if extension_response.ok:
                with open(ublock_origin_crx_file, "wb") as f:
                    f.write(extension_response.content)
    # Make sure file exists from above, otherwise just ignore it
    if os.path.exists(ublock_origin_crx_file):
        os.makedirs(ublock_origin_directory, exist_ok=True)
        shutil.unpack_archive(ublock_origin_crx_file, ublock_origin_directory)

options = uc.ChromeOptions()
options.add_argument("--user-data-dir=/tmp/scrape-chromium-profile")
# One switch per add_argument call: a single space-joined string is handed to
# Chrome as one argument, so the individual switches would not be recognized.
options.add_argument("--no-first-run")
options.add_argument("--no-service-autorun")
options.add_argument("--password-store=basic")
if os.path.exists(ublock_origin_directory):
    options.add_argument(f"--load-extension={ublock_origin_directory}")
driver = uc.Chrome(options=options)

## Names Scraper

In [None]:
# Taken from the nice work done here: https://dlsun.github.io/pods/data/names/
base_url = "https://dlsun.github.io/pods/data/names/yob{}.txt"

# Occurrence counters per name; turned into probabilities further down
names_man = defaultdict(int)
names_woman = defaultdict(int)

for year in range(1980, 2011):
    driver.get(base_url.format(year))
    # Convert the rendered page to plain text so only the raw list is left
    page_text = html2text_handler.handle(driver.page_source)
    names_list = whitespace_shrink_regex.sub("\n", page_text).replace(" ", "").strip(" \n")
    # Every non-empty row is "name,gender,count"
    for row in names_list.split("\n"):
        if not row.strip():
            continue
        name, gender, n_times = row.split(",")
        # "F" rows count as woman names, every other gender value as man names
        counter = names_woman if gender == "F" else names_man
        counter[name] += int(n_times)
    time.sleep(1.5)

# Names present in both counters are treated as unisex; their counts are added up
names_nb = defaultdict(int)
names_nb.update({
    name: names_man[name] + names_woman[name]
    for name in names_man
    if name in names_woman
})

# Finally, turn every counter into a probability table
total_names_man = sum(names_man.values())
total_names_woman = sum(names_woman.values())
total_names_nb = sum(names_nb.values())
pr_names_man = {name: count / total_names_man for name, count in names_man.items()}
pr_names_woman = {name: count / total_names_woman for name, count in names_woman.items()}
pr_names_nb = {name: count / total_names_nb for name, count in names_nb.items()}

In [None]:
# Round and re-normalize the probabilities
def names_renormalize(pr_names):
    """Round all probabilities to 9 decimals in place and restore a total of 1.

    The rounding residue (1 minus the rounded total) is added to the entry with
    the highest probability; on ties the first such entry is used. Returns None.
    """
    for name, probability in pr_names.items():
        pr_names[name] = round(probability, 9)
    residue = 1 - sum(pr_names.values())
    # Give the residue to the most probable name
    most_common_key = max(pr_names, key=pr_names.get)
    pr_names[most_common_key] += residue
# Renormalize the three probability tables in place
for _pr_table in (pr_names_man, pr_names_woman, pr_names_nb):
    names_renormalize(_pr_table)

In [None]:
# Save to files
# One "name,probability" file per table, most probable names first
for _names_filename, _pr_table in (
    ("names.m.txt", pr_names_man),
    ("names.f.txt", pr_names_woman),
    ("names.x.txt", pr_names_nb),
):
    with open(os.path.join(data_path, _names_filename), "w") as f:
        for name, pr in sorted(_pr_table.items(), key=lambda item: -item[1]):
            f.write(f"{name},{pr:.9f}\n")

# Alphabetical list of every name found in the man or woman table
all_names = set(pr_names_man.keys()).union(pr_names_woman.keys())
with open(os.path.join(data_path, "names.all.txt"), "w") as f:
    for name in sorted(all_names):
        f.write(name + "\n")

## Fanfic Scraper

In [None]:
# Collected story titles, appended in listing order
titles = []
url = f"https://www.fanfiction.net/game/Sonic-the-Hedgehog/?&p=1"
driver.get(url)

# Walk at most 1000 listing pages, following the "Next »" link each time
for i in range(1000):
    # Poll every 5 seconds while the page source still contains the blocker text
    while blocker_regex.search(driver.page_source):
        time.sleep(5)

    soup = BeautifulSoup(driver.page_source)
    # Only listing entries that contain a story title link
    for div in soup.select("div.z-list:has(a.stitle)"):
        # Check language to see if it's in English
        info_div = div.select_one("div.xgray")
        if "English" in info_div.get_text():
            stitle = div.select_one("a.stitle")
            titles.append(prepare_scraped_text(stitle.get_text()))
    
    # Randomized 2-4 second pause before moving to the next page
    time.sleep(2+2*random.random())
    
    try:
        next_link = driver.find_element(By.LINK_TEXT, "Next »")
        next_link.click()
    except NoSuchElementException:
        print("Reached end of pages, exiting")
        break # Leave loop when we can't go to next page anymore

In [None]:
# Persist the collected titles, one per line (no trailing newline)
_titles_text = "\n".join(titles)
with open(os.path.join(corpus_path, "fanfics.titles.txt"), "w") as f:
    f.write(_titles_text)

In [None]:
MAX_BYTE_COUNT = 128 * 1024 ** 2 # Imposing 128 MiB limit
# Collected story texts and the running total of their lengths
bodies = []
body_byte_count = 0
base_url = "https://www.fanfiction.net/game/Sonic-the-Hedgehog/"
driver.get(base_url + "?&p=1")
time.sleep(2+2*random.random())

# Handle cookie accept dialog if it pops up
try:
    driver.find_element(By.XPATH, f"//div[@onclick='_cookieAccept()']").click()
except NoSuchElementException:
    pass

# Walk at most 500 listing pages
for i in range(500):
    # Poll every 5 seconds while the page source still contains the blocker text
    while blocker_regex.search(driver.page_source):
        time.sleep(5)

    # Snapshot of the listing page; entries are taken from this snapshot while
    # the live browser navigates into each story and back again
    soup = BeautifulSoup(driver.page_source)
    for div in soup.select("div.z-list:has(a.stitle)"):
        time.sleep(2+2*random.random())
        
        while blocker_regex.search(driver.page_source):
            time.sleep(5)
            
        # Check language to see if it's in English
        info_div = div.select_one("div.xgray")
        if "English" not in info_div.get_text():
            continue
        # Now open the link
        stitle = div.select_one("a.stitle")
        stitle_href = stitle.get("href", None)
        if stitle_href is None:
            continue
        
        # Locate the same link in the live page by its href and click it
        stitle_element = driver.find_element(By.XPATH, f"//a[@href='{stitle_href}']")
        stitle_element.click()

        while blocker_regex.search(driver.page_source):
            time.sleep(5)
        
        # Extract the story container, convert it to plain text and clean it
        child_soup = BeautifulSoup(driver.page_source)
        story_div = child_soup.select_one("#storytextp")
        story_text = prepare_scraped_text(html2text_handler.handle(str(story_div)))
        bodies.append(story_text)
        # NOTE(review): len() counts characters of the cleaned text, not encoded bytes
        body_byte_count += len(story_text)

        if body_byte_count >= MAX_BYTE_COUNT:
            break

        time.sleep(1+2*random.random())

        # Keep going back until at a directory page, in case of blocker check
        while not driver.current_url.startswith(base_url):
            time.sleep(1)
            driver.back()
    
    # The inner break only leaves the entry loop; stop paging here as well
    if body_byte_count >= MAX_BYTE_COUNT:
        print("Reached max byte count, exiting")
        break

    time.sleep(2+2*random.random())
    
    try:
        next_link = driver.find_element(By.LINK_TEXT, "Next »")
        next_link.click()
    except NoSuchElementException:
        print("Reached end of pages, exiting")
        break # Leave loop when we can't go to next page anymore

In [None]:
# Persist the collected story texts, joined by single newlines
_bodies_text = "\n".join(bodies)
with open(os.path.join(corpus_path, "fanfics.bodies.txt"), "w") as f:
    f.write(_bodies_text)

## OC Descriptions Scraper

In [None]:
# Generalized descriptions collected per inferred gender
bodies_man, bodies_woman, bodies_nb = [], [], []

# Names that are also spelled like normal words (skipped when loading name lists)
exclude_names = [
    "An",
    "Bow", "Boy",
    "Demon", "Do", "Don",
    "Had", "Her",
    "In",
    "Job", "Joy",
    "La", "Like", "Lo",
    "Ma", "May", "Me", "Moo", "My",
    "No", "Nor", "Not",
    "Or",
    "Pa",
    "Red",
    "Saw", "Say", "Sea", "See", "Set", "Shy", "Sir", "Ski", "Sky", "Sly", "So", "Son",
    "Tea", "The", "Toy", "True",
    "Van",
    "Win", "Won", "Woo",
    "Yo", "You",
    "Zen",
]

def read_names(gender):
    """Load the names from names.<gender>.txt, skipping entries in exclude_names.

    Each line is expected to look like "name,probability"; only the part before
    the first comma is kept. The names scraper writes these files into data_path,
    so that folder is tried first; corpus_path (the previous lookup location) is
    used as a fallback when the file is not in data_path.
    """
    filename = f"names.{gender}.txt"
    filepath = os.path.join(data_path, filename)
    if not os.path.exists(filepath):
        filepath = os.path.join(corpus_path, filename)
    with open(filepath) as f:
        return [
            name for line in f
            # Exclude names that are also spelled like normal words
            if (name := line.strip().split(",")[0]) not in exclude_names
        ]
# Name lists per gender (these rebind the counters of the same name used earlier)
names_man = read_names("m")
names_woman = read_names("f")
names_nb = read_names("x")
# names.all.txt is written into data_path by the names scraper, so look there
# first and fall back to corpus_path (the previous lookup location) otherwise.
names_all_path = os.path.join(data_path, "names.all.txt")
if not os.path.exists(names_all_path):
    names_all_path = os.path.join(corpus_path, "names.all.txt")
with open(names_all_path) as f:
    names_all = [
        name for line in f.readlines()
        # Exclude names that are also spelled like normal words
        if (name := line.strip()) not in exclude_names
    ]

# Third-person pronoun patterns used to count pronouns per gender
pronoun_man_pattern = r"\b(?:he|he'(?:[sd]|ll)|his|him)\b"
pronoun_woman_pattern = r"\b(?:she|she'(?:[sd]|ll)|hers|her)\b"
pronoun_nb_pattern = r"\b(?:they|they'(?:ve|d|ll)|their|theirs|them)\b"

# Wiki whose character categories are scraped; "{}" is filled with the category name
domain = "https://sonicfanchara.fandom.com"
base_url = domain + "/wiki/Category:{}"

def re_count(pattern, string, flags=0):
    """Return the number of non-overlapping matches of *pattern* in *string*."""
    return len(re.findall(pattern, string, flags))

def filter_oc_paragraphs(p_text):
    """Return True when a paragraph is worth keeping for the description corpus.

    A paragraph is rejected (False) as soon as one of these flags is set:
      * too short: 35 characters or fewer;
      * no sentence: contains none of ".", "!" or "?";
      * only key/value: at most 150 characters and starting with a label of up
        to 25 characters followed by ":" (for example "Likes: ...").
    """
    flag_too_short = len(p_text) <= 35
    flag_no_sentence = re.search(r"[\.!?]", p_text) is None
    # This flag used to be computed but was left out of the final check
    flag_only_key_val = len(p_text) <= 150 and re.search(r"^[^\.!?;]{1,25}:", p_text) is not None
    return not (flag_too_short or flag_no_sentence or flag_only_key_val)

def get_oc_names(page_title):
    """Derive the list of names a character may be referred to by.

    The title is cut at the first " the " (case-insensitive) or, failing that,
    at the first comma; punctuation is removed from what remains. The result
    starts with the full name (when it has more than one word), then
    "first last" (when it has more than two words), then each individual word.
    Empty strings are never returned, because an empty name would later be
    turned into a pattern that matches at every word boundary.
    """
    lower_page_title = page_title.lower()
    if " the " in lower_page_title:
        cut_at = lower_page_title.find(" the ")
    elif "," in lower_page_title:
        cut_at = lower_page_title.find(",")
    else:
        cut_at = len(page_title)
    # split() without arguments collapses repeated whitespace and drops empty
    # pieces left behind by removed punctuation
    name_parts = punctuation_regex.sub("", page_title[:cut_at]).split()
    all_names = list(name_parts)
    # Also add first name + last name if more than 2 names
    if len(name_parts) > 2:
        all_names.insert(0, f"{name_parts[0]} {name_parts[-1]}")
    # Add full name too if the title split into several individual names
    if len(name_parts) > 1:
        all_names.insert(0, " ".join(name_parts))
    return all_names

def infer_oc_gender(oc_names, page_text):
    """Guess a character's gender code: "M", "F" or "X".

    Checks, in order:
      1. an explicit "gender: male" / "gender: female" / non-binary wording in
         the page text (case-insensitive);
      2. the first of *oc_names* (title-cased) that appears in names_all;
      3. whichever third-person pronoun family occurs most often in the text
         ("M" wins when all counts are equal, since it is listed first).
    """
    lower_page_text = page_text.lower()
    if "gender: male" in lower_page_text:
        return "M"
    if "gender: female" in lower_page_text:
        return "F"
    if "nonbinary" in lower_page_text or "non-binary" in lower_page_text or "enby" in lower_page_text:
        return "X"

    # Do name-based inference with the first known name itself. The previous
    # generator yielded a membership boolean instead, which is in none of the
    # name lists, so every known name ended up as "F".
    first_oc_name_known = next(
        (titled for name in oc_names if (titled := name.title()) in names_all),
        None,
    )
    if first_oc_name_known is not None:
        # Sometimes do non-binary if name is in nb list
        if first_oc_name_known in names_nb and random.random() < 0.1:
            return "X"
        if first_oc_name_known in names_man:
            return "M"
        return "F"

    # Resort to count of third person pronouns in the content
    pronoun_counts = {
        "M": re_count(pronoun_man_pattern, page_text, re.IGNORECASE),
        "F": re_count(pronoun_woman_pattern, page_text, re.IGNORECASE),
        "X": re_count(pronoun_nb_pattern, page_text, re.IGNORECASE)
    }
    return max(pronoun_counts.items(), key=lambda item: item[1])[0]

def generalize_oc_text(oc_names, oc_gender, page_text, base=True):
    """Replace occurrences of the given names in *page_text* with pronouns.

    oc_names:  names to replace, processed in the given order (put the full
               name first so it is replaced before its individual words).
    oc_gender: "M", "F" or "X"; selects the pronoun set.
    base:      when True, every name from names_man / names_woman that still
               occurs in the result is replaced as well (one extra pass per
               list, run with base=False).
    Returns the rewritten text. Blank names are ignored.
    """
    # Use regexes to replace with pronouns
    pronouns = {
        "M": ("he", "him", "his"),
        "F": ("she", "her", "her"),
        "X": ("they", "them", "their")
    }
    subject_pronoun, object_pronoun, possessive_pronoun = pronouns[oc_gender]

    # Do replacement in order of names, since full name is first
    return_text = page_text
    for oc_name in oc_names:
        # A blank name would compile to a pattern matching word boundaries (or
        # plain spaces) everywhere and scatter pronouns through the text
        if not oc_name.strip():
            continue
        escaped_name = re.escape(oc_name)
        # Name at a sentence start or after a capitalized connective -> "He"
        name_to_subject_upper_regex = re.compile(
            r"(^|[\.!?] |(?:And|But|Or|Yet|So|Because|However|Therefore|Thus|Nevertheless|Nonetheless|Who|What|When|Where|Why|How|Now) )\b" + escaped_name + r"\b", re.MULTILINE
        )
        # Name after a clause separator or lowercase connective -> "he"
        name_to_subject_lower_regex = re.compile(
            r"(^|[,:;] |[,;] (?:for|and|nor|but|or) |(?:yet|so|because|however|therefore|thus|nevertheless|nonetheless|who|what|when|where|why|how|day) )\b" + escaped_name + r"\b", re.MULTILINE
        )
        # Any remaining occurrence -> "him"
        name_to_object_regex = re.compile(
            r"\b" + escaped_name + r"\b", re.MULTILINE
        )
        # Name followed by an apostrophe (and optional "s") -> "His" / "his"
        name_to_possessive_upper_regex = re.compile(
            r"(^|[\.!?] )\b" + escaped_name + r"'s?\b", re.MULTILINE
        )
        name_to_possessive_lower_regex = re.compile(
            r"([\w,:;] )\b" + escaped_name + r"'s?\b", re.MULTILINE
        )
        # Order matters: possessives first, then subjects, and the catch-all
        # object replacement last
        return_text = name_to_possessive_upper_regex.sub(r"\1" + possessive_pronoun.title(), return_text)
        return_text = name_to_possessive_lower_regex.sub(r"\1" + possessive_pronoun, return_text)
        return_text = name_to_subject_upper_regex.sub(r"\1" + subject_pronoun.title(), return_text)
        return_text = name_to_subject_lower_regex.sub(r"\1" + subject_pronoun, return_text)
        return_text = name_to_object_regex.sub(object_pronoun, return_text)

    # Do the same for any other names that may appear in text recursively (if base is True)
    if base:
        other_names_man = [name for name in names_man if name in return_text]
        other_names_woman = [name for name in names_woman if name in return_text]
        if other_names_man:
            return_text = generalize_oc_text(other_names_man, "M", return_text, base=False)
        if other_names_woman:
            return_text = generalize_oc_text(other_names_woman, "F", return_text, base=False)

    return return_text

# Visit every character page of each alignment category and collect a
# generalized (names replaced by pronouns) description per inferred gender
for category in ("Good", "Neutral", "Evil"):
    driver.get(base_url.format(category))
    time.sleep(1+2*random.random())

    category_new_pages = True

    while category_new_pages:
        # Snapshot of the directory page; member links are taken from it while
        # the live browser navigates into each page and back
        soup = BeautifulSoup(driver.page_source)
        for link in soup.select("a.category-page__member-link"):
            directory_page_url = driver.current_url

            page_name = link.get_text().strip()

            # Open the link if applicable
            link_href = link.get("href", None)
            if page_name.startswith("File:") or link_href is None:
                continue
            link_element = driver.find_element(By.XPATH, f"//a[@href='{link_href}']")

            # Navigate directly if the link can't be clicked
            try:
                link_element.click()
            except (ElementClickInterceptedException, ElementNotInteractableException):
                driver.get(domain + link_href)
            
            child_soup = BeautifulSoup(driver.page_source)

            oc_names = get_oc_names(page_name)
            # Gender is inferred from all paragraphs, before any filtering
            all_page_text = html2text_handler.handle(
                "".join(str(p) for p in child_soup.select("div#mw-content-text p"))
            )
            oc_gender = infer_oc_gender(oc_names, all_page_text)
            
            # Keep only the cleaned paragraphs accepted by filter_oc_paragraphs
            desc_text = "\n".join(
                p_text for p in child_soup.select("div#mw-content-text p")
                if filter_oc_paragraphs((p_text := prepare_scraped_text(
                    html2text_handler.handle(str(p))
                )))
            )
            adjusted_desc_text = generalize_oc_text(oc_names, oc_gender, desc_text)

            if oc_gender == "M":
                bodies_man.append(adjusted_desc_text)
            elif oc_gender == "F":
                bodies_woman.append(adjusted_desc_text)
            else:
                bodies_nb.append(adjusted_desc_text)

            time.sleep(1+2*random.random())
            driver.get(directory_page_url)
            time.sleep(1+2*random.random())
    
        try:
            next_link = driver.find_element(By.XPATH, "//a[contains(@class, 'category-page__pagination-next')]")
            next_link.click()
        except (ElementClickInterceptedException, ElementNotInteractableException):
            # The link exists but can't be clicked: read its href from the parsed
            # page instead. The href must come from the pagination element itself,
            # not from the last member link of the loop above.
            next_link_element = soup.select_one("a.category-page__pagination-next")
            next_link_href = None
            if next_link_element is not None:
                next_link_href = next_link_element.get("href", None)
            if next_link_href is None:
                category_new_pages = False
            elif next_link_href.startswith(("http://", "https://")):
                # Already an absolute URL; prefixing the domain would break it
                driver.get(next_link_href)
            else:
                driver.get(domain + next_link_href)
        except NoSuchElementException:
            # Leave loop when we can't go to next page anymore
            category_new_pages = False
        finally:
            time.sleep(1+2*random.random())

In [None]:
# One corpus file per gender bucket, descriptions joined by single newlines
for _oc_filename, _oc_bodies in (
    ("ocdescriptions.m.txt", bodies_man),
    ("ocdescriptions.f.txt", bodies_woman),
    ("ocdescriptions.x.txt", bodies_nb),
):
    with open(os.path.join(corpus_path, _oc_filename), "w") as f:
        f.write("\n".join(_oc_bodies))

In [None]:
# Clean each description file in place (the originals are moved to "bak")
for _oc_filename in ("ocdescriptions.m.txt", "ocdescriptions.f.txt", "ocdescriptions.x.txt"):
    cleanup_scraped_text(os.path.join(corpus_path, _oc_filename))

## Colors Scraper

In [None]:
# Suffixes of the Wikipedia "List of colors" pages to visit
CATEGORIES = ["A-F", "G-M", "N-Z"]

# Maps a normalized color name to its (r, g, b) triplet
color_triplets = {}
for cat in CATEGORIES:
    url = "https://en.wikipedia.org/wiki/List_of_colors:_" + cat
    driver.get(url)

    soup = BeautifulSoup(driver.page_source)

    # Only the first "wikitable" on the page is read; each row's first <th> is
    # used as the color name and its first <td> as the hex value
    for tr in soup.find_all("table", class_="wikitable")[0].find_all("tr"):
        raw_color_name = tr.find_all("th")[0].get_text().lower()
        color_hex = ""
        if tr.find_all("td"):
            # Drop the first character of the cell text (presumably the leading "#")
            color_hex = tr.find_all("td")[0].get_text()[1:]
        # Rows without a <td> (or with an empty value) are skipped
        if color_hex != "":
            color_r = int(color_hex[0:2], 16)
            color_g = int(color_hex[2:4], 16)
            color_b = int(color_hex[4:6], 16)
            color_name = unidecode(raw_color_name).strip()

            # Fold a known parenthetical qualifier into the name itself;
            # unrecognized qualifiers leave the name untouched
            first_parenthetical_match = re.search(r"\((.*?)\)", color_name)
            if first_parenthetical_match:
                reduced_color_name = re.sub(r"\s+\(.*?\)", "", color_name, count=1)
                first_parenthetical = first_parenthetical_match.groups()[0]
                if first_parenthetical == "ncs":
                    color_name = "natural " + reduced_color_name
                elif first_parenthetical == "crayola":
                    color_name = reduced_color_name + " crayon"
                elif first_parenthetical == "pantone":
                    color_name = reduced_color_name + " paint"
                elif first_parenthetical in ("pigment", "process"):
                    color_name = "pigment " + reduced_color_name
                elif first_parenthetical == "dye":
                    color_name = "rich " + reduced_color_name
                elif first_parenthetical == "munsell":
                    color_name = "brilliant " + reduced_color_name
                elif first_parenthetical in ("dark", "light", "metallic"):
                    color_name = first_parenthetical + " " + reduced_color_name
                elif first_parenthetical in ("fogra29", "fogra39", "traditional"):
                    color_name = reduced_color_name
            
            # Resolve collisions in color list, will need to be resolved manually
            # (a numeric suffix starting at 0 keeps both entries in the meantime)
            if color_name in color_triplets:
                color_index = 0
                while f"{color_name} {color_index}" in color_triplets:
                    color_index += 1
                color_name = f"{color_name} {color_index}"

            color_triplets[color_name] = (color_r, color_g, color_b)
    
    time.sleep(1.5)

# Write "r,g,b:name" lines (channels right-aligned to width 3), sorted by name
with open(os.path.join(data_path, "colors.general.txt"), "w") as f:
    for color_name, color_triplet in sorted(color_triplets.items(), key=lambda item: item[0]):
        color_r, color_g, color_b = color_triplet
        f.write(f"{color_r:3d},{color_g:3d},{color_b:3d}:{color_name}\n")

In [None]:
# Re-sort the colors after removing some manually
_colors_file = os.path.join(data_path, "colors.general.txt")

# Pair every line with the text after its first ":" and order by that key
with open(_colors_file) as f:
    data = sorted(
        ((line.split(":")[1], line) for line in f),
        key=lambda row: row[0],
    )

# Move the current file to "bak", then write the sorted lines in its place
backup_file(_colors_file)
with open(_colors_file, "w") as f:
    for name, line in data:
        f.write(line)

## Cleanup

In [None]:
# End the browser session created in the webdriver setup cell
driver.quit()