# Explorer

Design the angelpy web scraper.

#### Alternative Concept
A NumPy array of chapters containing two integer columns, page and post,
could replace the list of dicts with lists.
All it would require is functions to parse the numbers from the strings and then recreate the strings from them.
It would be more complicated and mildly less flexible, as it would require that the base URL be saved
and would then assume that all stories from the same threadmark page are on the same thread,
which is a reasonable assumption.
There would not be many benefits to this approach.


In [1]:
# imports
import random
import re
import shutil
import time

from pathlib import Path
from urllib.parse import urlparse, urljoin, urldefrag, ParseResult

import html2text
import numpy as np
import requests

from bs4 import BeautifulSoup
from rich import print
from tqdm import tqdm

## Define Functions

In [2]:
# define a function to get soup from a link
def make_soup(link: str, headers: dict = None, pause_interval: float = None) -> BeautifulSoup:
    """Fetch a web page and parse it into a soup.

    Sleeps for ``pause_interval`` seconds before sending the request so that
    repeated calls do not overload the target website.

    Arguments:
        link -- the link to make the soup from (str).

    Keyword Arguments:
        headers -- the headers to use for the request
            (default: {"User-Agent": "Mozilla/5.0"}).
        pause_interval -- the number of seconds to sleep before the request
            (default: a random value between 0.5 and 1.5).

    Returns:
        soup -- the soup from the link (BeautifulSoup).

    Raises:
        Exception -- if the response status code is not 200.
    """

    # create default headers if none were passed
    if headers is None:
        headers = {"User-Agent": "Mozilla/5.0"}

    # if no pause interval was passed pause for a random interval to avoid
    # overloading websites
    if pause_interval is None:
        pause_interval = random.uniform(0.5, 1.5)

    time.sleep(pause_interval)

    # use the session as a context manager so that it is closed
    # even when the request or the status check raises
    with requests.Session() as session:
        response = session.get(link, headers=headers)

        # check the response code and raise an error if not good
        if response.status_code != 200:
            raise Exception(f"Bad response code: {response.status_code}")

        # parse with the parser bundled with the standard library
        return BeautifulSoup(response.content, "html.parser")

In [3]:
# define a function to parse a threadmark page
def parse_threadmark_page(threadmark_link: str) -> list:
    """Parse a threadmark page (table of contents) and return chapter links.

    Arguments:
        threadmark_link -- A link to the threadmark page, ends in /threadmarks

    Returns:
        A list with one dictionary per threadmark, in the order the
        threadmarks are found on the page. Each dictionary has the keys
        "url": the link to the chapter, resolved against threadmark_link and
            parsed with urlparse (so the fragment is available separately),
        "label": the stripped text accompanying the link

    Raises:
        Exception -- propagated from make_soup when the page cannot be fetched.
    """

    # get the soup of a threadmark page
    thread_soup = make_soup(threadmark_link)

    # get the threadmark containers
    # each container has a link to a threadmark
    threadmark_containers = thread_soup.find_all(attrs={"class": "structItem--threadmark"})

    parsed_thread_links = []
    for container in threadmark_containers:
        # the first anchor is the actual link and has the text, a later one
        # has the date, so take only the first
        anchor = container.find("a")

        # skip containers without a usable link instead of raising IndexError
        # or silently resolving a missing href to the threadmark page itself
        if anchor is None or anchor.get("href") is None:
            continue

        # compose the full link from the (possibly relative) href
        parsed_thread_links.append({
            "url": urlparse(urljoin(threadmark_link, anchor.get("href"))),
            "label": anchor.get_text().strip(),
        })

    # return the thread links
    return parsed_thread_links

## Extract Info on Each Chapter

In [4]:
# scrape the links from the main webpage

# the links will have associated text The Last Angel and come from this page
main_page = r"https://proximalflame.com/index-2/"

# fetch and parse the main page
soup = make_soup(main_page)

# extract all links from the main page
links = list(soup.find_all("a"))

# get only links that correspond to TLA stories
# these are the links to the main stories
tla_threadlinks = [l for l in links if "the last angel" in l.get_text().strip().lower()]

# appending threadmarks to the end of a story link gives a table of contents
# of sorts, so parse that page for every story
tla_links = []
for link in tqdm(tla_threadlinks, desc="Parsing TLA Links"):
    url = urlparse(link.get("href"), allow_fragments=False)

    # urljoin replaces the last path segment when the base has no trailing
    # slash, so normalise the base to end in exactly one slash first
    threadmark_url = urljoin(url.geturl().rstrip("/") + "/", "threadmarks")

    tla_links.append({
        "url": url,
        "label": link.get_text().strip(),
        "entrys": parse_threadmark_page(threadmark_url),
    })

Parsing TLA Links: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]


## Parse a 'Book'

### Work on the parse_entry Function

In [5]:
# parse a single entry (post) into its chapters

# bold text matching this pattern is treated as a chapter heading
_CHAPTER_RE = re.compile(
    r"(chapter|epilogue|prologue|interrupt|pt|the\slast\sangel)",
    re.IGNORECASE)


def parse_entry(entry_url: str, entry_post_str: str) -> "list | None":
    """Download one post and split its text at the chapter headings.

    The post is converted to text with html2text. Every bold (<b>) element of
    the post is a candidate heading; candidates matching the chapter pattern
    are kept. If none match but the post has exactly one bold element, that
    element is used as the heading.

    Arguments:
        entry_url -- the link to the page containing the post (str).
        entry_post_str -- the value of the post container's data-lb-id
            attribute (str).

    Returns:
        None if the post cannot be found on the page or has no bold text.
        Otherwise a list of (heading, lines) tuples ordered by where the
        heading occurs in the converted text. lines is the list of text lines
        from the line containing the heading to the end of the post. Headings
        that cannot be located in the converted text are left out, so the
        list may be empty.

    Raises:
        Exception -- propagated from make_soup when the page cannot be fetched.
    """
    # first get the entry soup
    entry_soup = make_soup(entry_url)

    # get the post element first by post id then by bbWrapper class
    # a missing element means there is nothing to parse
    post_container = entry_soup.find("div", {"data-lb-id": entry_post_str})
    if post_container is None:
        return None
    post = post_container.find("div", class_="bbWrapper")
    if post is None:
        return None

    # convert the post to a str
    html_parser = html2text.HTML2Text()
    post_str = html_parser.handle(str(post))

    # collect the candidate headings
    chapter_choices = {b.get_text().strip() for b in post.find_all("b")}

    # if there are no choices return none
    if not chapter_choices:
        return None

    # keep only the candidates that look like chapter headings
    chapter_choices_rem = {c for c in chapter_choices if _CHAPTER_RE.search(c)}

    # if no choices remain but there was only one option
    # that option becomes the set of remaining choices
    if not chapter_choices_rem and len(chapter_choices) == 1:
        chapter_choices_rem = set(chapter_choices)

    # locate each heading in the converted text
    # a heading that is not found (find returns -1) is skipped because -1
    # would otherwise be treated as a position near the end of the text
    located = []
    for choice in chapter_choices_rem:
        str_index = post_str.find(choice)
        if str_index != -1:
            located.append((str_index, choice))

    # sort by position so the result does not depend on set iteration order
    located.sort()

    # convert each str index to a line number and slice the lines from there
    post_lines = post_str.split("\n")
    chapters = [
        (choice, post_lines[post_str.count("\n", 0, str_index):])
        for str_index, choice in located
    ]
    return chapters

In [6]:
# configure book saving
# define an output directory
output_dir = Path(".output")
output_dir.mkdir(parents=True, exist_ok=True)

# loop over each book
for bi, book in enumerate(tla_links):
    # extract book information
    book_title = book["label"]

    # define the books output directory
    book_output_dir = output_dir / f"book_{bi}"
    book_output_dir.mkdir(parents=True, exist_ok=True)

    # loop over each entry with a progress bar
    pbar = tqdm(book["entrys"], desc=f"Reading '{book_title}'")
    for i, entry in enumerate(pbar):

        # extract the entry information
        # the url fragment identifies the post on the page
        entry_title = entry['label']
        entry_url = entry['url'].geturl()
        entry_post_str = entry['url'].fragment

        # extract the chapters from the entry
        entry_extract = parse_entry(entry_url, entry_post_str)

        # check that entry_extract is not None and has len > 0
        # print what book/entry but continue otherwise
        if entry_extract is None:
            print(f"Error (None): '{book_title}'/'{entry_title}'")
            continue
        if len(entry_extract) == 0:
            print(f"Error (len=0): '{book_title}'/'{entry_title}'")
            continue

        # loop over each chapter and write it to a file
        # chapter is a tuple of title and text
        # text is a list of lines
        for ci, chapter in enumerate(entry_extract):
            # name the file after the entry and chapter indices
            chapter_output_file = book_output_dir / f"chapt_{i}_{ci}.txt"

            # write as UTF-8 explicitly: scraped text may contain characters
            # that the platform default encoding cannot represent
            with open(chapter_output_file, "w", encoding="utf-8") as f:
                f.write("\n".join(chapter[1]))

Reading 'The Last Angel': 100%|██████████| 55/55 [01:19<00:00,  1.44s/it]
Reading 'The Last Angel: Ascension':  32%|███▏      | 29/92 [00:42<01:43,  1.65s/it]

Error (None): 'The Last Angel: Ascension'/'Re: Spoilers'


Reading 'The Last Angel: Ascension':  39%|███▉      | 36/92 [00:53<01:22,  1.48s/it]

Error (len=0): 'The Last Angel: Ascension'/'Askanj Government'


Reading 'The Last Angel: Ascension':  47%|████▋     | 43/92 [01:04<01:20,  1.64s/it]

Error (None): 'The Last Angel: Ascension'/'Map of Galhemna System'


Reading 'The Last Angel: Ascension':  57%|█████▋    | 52/92 [01:17<01:00,  1.50s/it]

Error (len=0): 'The Last Angel: Ascension'/'Stillness'


Reading 'The Last Angel: Ascension':  92%|█████████▏| 85/92 [02:03<00:09,  1.34s/it]

Error (None): 'The Last Angel: Ascension'/'Hungry Stars Prologue'


Reading 'The Last Angel: Ascension':  99%|█████████▉| 91/92 [02:12<00:01,  1.48s/it]

Error (None): 'The Last Angel: Ascension'/'The Hungry Stars teaser'


Reading 'The Last Angel: Ascension': 100%|██████████| 92/92 [02:13<00:00,  1.45s/it]


Error (None): 'The Last Angel: Ascension'/'The Hungry Stars'


Reading 'The Last Angel: The Hungry Stars':   2%|▏         | 1/61 [00:01<01:48,  1.80s/it]

Error (None): 'The Last Angel: The Hungry Stars'/'Table of Contents'


Reading 'The Last Angel: The Hungry Stars':  18%|█▊        | 11/61 [00:15<01:09,  1.39s/it]

Error (None): 'The Last Angel: The Hungry Stars'/'Species Naming Traditions'


Reading 'The Last Angel: The Hungry Stars':  74%|███████▍  | 45/61 [01:03<00:24,  1.54s/it]

Error (len=0): 'The Last Angel: The Hungry Stars'/'Pets'


Reading 'The Last Angel: The Hungry Stars': 100%|██████████| 61/61 [01:29<00:00,  1.46s/it]
