# Explorer

Design the angelpy web scraper.

#### Alternative Concept
A numpy array of chapters with two integer columns, page and post,
could replace the list of dicts with lists.
All it would require is functions to parse the numbers from the url strings and then recreate the urls from them.
It would be more complicated and mildly less flexible, as it would require that the base url be saved
and would assume that all stories from the same threadmark page are on the same thread,
which is a reasonable assumption.
There would not be many benefits to this approach.


In [30]:
# imports
import random
import re
import shutil
import time

from pathlib import Path
from urllib.parse import urlparse, urljoin, urldefrag, ParseResult

import html2text
import numpy as np
import requests

from bs4 import BeautifulSoup
from rich import print
from tqdm import tqdm

## Define Functions

In [31]:
# define a function to parse tla style urls
def parse_tla_url(tla_url: str) -> dict:
    """Extract the page number and post id from a thread post url.

    The page number is read from a ``page-<digits>`` segment at the end of
    the url path. A single trailing slash after the segment is tolerated, so
    a url of the form ``.../page-3/#post-45`` parses to page 3. When the
    path has no such segment the page number defaults to 1.

    The post id is read from a url fragment ending in ``post-<digits>``.

    Arguments:
        tla_url -- the url to parse (str).

    Returns:
        A dict with integer values under the keys "page" and "post".

    Raises:
        ValueError -- if the url fragment does not end in ``post-<digits>``.
    """

    # split the url into its components
    parsed_tla = urlparse(tla_url)

    # the page number comes at the end of the path in the form page-#,
    # optionally followed by a slash; if it is absent default to page one
    page_number_match = re.search(r'page-(\d+)/?$', parsed_tla.path)
    page_number = int(page_number_match.group(1)) if page_number_match else 1

    # the post id comes at the end of the fragment in the form post-#
    post_id_match = re.search(r'post-(\d+)$', parsed_tla.fragment)
    if post_id_match is None:
        # fail with a message naming the url instead of an AttributeError
        # raised from calling .group on None
        raise ValueError(f"No post id found in url fragment: {tla_url!r}")

    # return the page and post as a dict
    return {'page': page_number, 'post': int(post_id_match.group(1))}


In [32]:
# define a function to compose tla style urls
def compose_tla_url(base_url: str, page: int, post: int) -> ParseResult:
    """Build the parsed url of a post from a base url, a page and a post id.

    The relative reference ``page-<page>/#post-<post>`` is resolved against
    base_url with urljoin. Because of how urljoin resolves relative
    references, the page segment is appended only when base_url ends in a
    slash; otherwise it replaces the last path segment of base_url.

    Arguments:
        base_url -- the url the page and post are resolved against (str).
        page -- the page number (int).
        post -- the post id (int).

    Returns:
        The joined url as a urllib.parse.ParseResult.
    """

    # relative reference of the form page-<page>/#post-<post>
    relative_reference = "/".join((f"page-{page}", f"#post-{post}"))

    # resolve it against the base url and hand back the parsed result
    return urlparse(urljoin(base_url, relative_reference))

In [33]:
# define a function to get soup from a link
def make_soup(link: str, headers: dict = None, pause_interval: float = None) -> BeautifulSoup:
    """Fetch a web page and parse it into a soup.

    Arguments:
        link -- the link to make the soup from (str).

    Keyword Arguments:
        headers -- the headers to use for the request
            (default: User-Agent: Mozilla/5.0).
        pause_interval -- seconds to sleep before the request is sent
            (default: a random value between 0.25 and 1).

    Returns:
        soup -- the soup from the link (BeautifulSoup).

    Raises:
        Exception -- if the response status code is not 200.
    """

    # create default headers if none were passed
    if headers is None:
        headers = {"User-Agent": "Mozilla/5.0"}

    # if no pause interval was passed pause for a random interval to avoid
    # overloading websites
    if pause_interval is None:
        pause_interval = random.uniform(0.25, 1)

    time.sleep(pause_interval)

    # use the session as a context manager so that it is closed once the
    # response has been read, instead of being left open on every call
    with requests.Session() as session:
        response = session.get(link, headers=headers)
        status_code = response.status_code
        content = response.content

    # check the response code and raise an error if not good
    if status_code != 200:
        raise Exception(f"Bad response code: {status_code}")

    # parse and return the soup
    return BeautifulSoup(content, "html.parser")

In [34]:
# define a function to parse a threadmark page
def parse_threadmark_page(threadmark_link: str) -> np.ndarray:
    """Parse a threadmark page (table of contents) into page and post numbers.

    Arguments:
        threadmark_link -- A link to the threadmark page, ends in /threadmarks

    Returns:
        An int64 numpy array of shape (n, 2) with one row per threadmark.
        Column 0 is the page number and column 1 is the post id, both parsed
        from the href of the threadmark link by parse_tla_url. A page with
        no threadmarks gives an array of shape (0, 2).
    """

    # get the soup of a threadmark page
    thread_soup = make_soup(threadmark_link)

    # get the threadmark containers
    # each container has a link to a threadmark
    threadmark_containers = thread_soup.find_all(attrs={"class": "structItem--threadmark"})

    # extract the links from the threadmark containers
    # the first link is the actual link and has the text, the second has the date
    # so take only the first
    thread_links = [container.find_all("a")[0] for container in threadmark_containers]

    # parse each href into a (page, post) pair
    page_post_pairs = []
    for thread_link in thread_links:
        parsed_url = parse_tla_url(thread_link.get("href"))
        page_post_pairs.append((parsed_url["page"], parsed_url["post"]))

    # reshape so the result always has two columns, even when no
    # threadmarks were found and the list of pairs is empty
    return np.array(page_post_pairs, dtype=np.int64).reshape(-1, 2)

## Extract Info on Each Chapter

In [35]:
# collect every story thread linked from the main index page

# the story links have associated text The Last Angel and come from this page
main_page = r"https://proximalflame.com/index-2/"

# download the main page and parse its html
soup = make_soup(main_page)

# every anchor tag found on the main page
links = list(soup.find_all("a"))

# keep only the anchors whose text mentions The Last Angel
# these are the links to the main stories
# joining "threadmarks" onto a story link gives a table of contents of sorts,
# which is parsed into the "entrys" of that story
# this works for now but is a little messy
tla_links = []
for anchor in links:
    # skip anchors that are not story links
    if "the last angel" not in anchor.get_text().strip().lower():
        continue

    # drop any fragment from the link before using it as the thread url
    href = urldefrag(anchor.get("href"))[0]

    tla_links.append({
        "href": href,
        "title": anchor.get_text().strip(),
        "entrys": parse_threadmark_page(urljoin(href, "threadmarks")),
    })

## Parse an Entry for Chapters

This eventually becomes a function.

In [40]:
# select a random entry to experiment with
# first select one of the scraped threads at random
thread_choice = random.choice(tla_links)
# thread_choice = tla_links[0]

# then select a random item from the "entrys" of that thread
# ("entrys" is filled from parse_threadmark_page when tla_links is built)
entry_choice = random.choice(thread_choice["entrys"])
# entry_choice = thread_choice["entrys"][14]

# print the title of the chosen thread
# NOTE(review): entry_choice is selected above but is not printed here
print(f"Thread: {thread_choice['title']}")

In [43]:
# define a function to parse a chapter from above methods
def parse_entry(base_url: str, page: int, post: int) -> str:
    """Download one post, convert its body to text and report its titles.

    The post url is composed from base_url, page and post, the page is
    downloaded, and the div whose data-lb-id equals the url fragment
    (post-<post>) is located. The bbWrapper div inside it is converted to
    text with html2text. Candidate entry titles (lines starting with
    asterisks) are printed.

    Arguments:
        base_url -- the url of the thread the post belongs to (str).
        page -- the page of the thread the post is on (int).
        post -- the id of the post (int).

    Returns:
        The text of the post as produced by html2text, after adjacent bold
        runs have been merged.

    Raises:
        ValueError -- if the downloaded page has no div for the post.
    """

    # compose the entry url
    entry_url = compose_tla_url(base_url, page, post)

    # get the soup
    entry_soup = make_soup(entry_url.geturl())

    # get the first div element with data-lb-id=post-id
    post_element = entry_soup.find("div", {"data-lb-id": entry_url.fragment})
    if post_element is None:
        # fail with a message naming the post instead of an AttributeError
        # raised from calling .find on None
        raise ValueError(
            f"Post {entry_url.fragment!r} not found at {entry_url.geturl()}")

    # go in one layer and find the class=bbWrapper tag
    # NOTE(review): if no bbWrapper div exists this is None and the text
    # below becomes the string "None" -- confirm whether that can happen
    post_wrapper = post_element.find("div", class_="bbWrapper")

    # unfortunately due to mixed formatting I think it just has to be converted to raw text
    entry_raw_text = str(post_wrapper)

    # parse the html into text using html2text
    html_parser = html2text.HTML2Text()
    entry_text = html_parser.handle(entry_raw_text)

    # if one pair of asterisks is separated from the next pair by nothing but
    # blank space characters, replace both pairs and the blank space between
    # them with a single space
    # example: **text** **more text** becomes **text more text**
    # this is done to make the regex for finding the entry titles easier
    entry_text = re.sub(r"\*\*\s+\*\*", " ", entry_text)

    # identify the entry titles with a regular expression
    title_re = re.compile(r"^\*+(?P<entry_title>[^\*]*)\*\*+", re.MULTILINE)

    entry_title = title_re.findall(entry_text)
    print(f"Titles Found: {entry_title}.")

    # return the converted text so the declared str return type is honoured
    return entry_text

## Parse a 'Book'

In [44]:
# parse every entry of every scraped thread
# each selection in tla_links is a thread
for thread in tla_links:
    # iterate over each entry in the thread, with one progress bar per thread
    # labelled with the thread title
    for entry in tqdm(
            thread["entrys"],
            desc=f"Parsing Thread {thread['title']}",
            ascii=True,
            leave=True,
            position=0):

        # each entry holds the page number followed by the post id
        page, post = entry

        # run parse entry
        parse_entry(thread['href'], page, post)


Parsing Thread The Last Angel:   0%|          | 0/55 [00:00<?, ?it/s]

Parsing Thread The Last Angel:   2%|1         | 1/55 [00:01<01:34,  1.75s/it]

Parsing Thread The Last Angel:   4%|3         | 2/55 [00:03<01:20,  1.51s/it]

Parsing Thread The Last Angel:   5%|5         | 3/55 [00:04<01:11,  1.38s/it]

Parsing Thread The Last Angel:   7%|7         | 4/55 [00:06<01:16,  1.50s/it]

Parsing Thread The Last Angel:   9%|9         | 5/55 [00:07<01:14,  1.49s/it]

Parsing Thread The Last Angel:  11%|#         | 6/55 [00:08<01:05,  1.33s/it]

Parsing Thread The Last Angel:  13%|#2        | 7/55 [00:10<01:08,  1.43s/it]

Parsing Thread The Last Angel:  15%|#4        | 8/55 [00:11<01:05,  1.39s/it]

Parsing Thread The Last Angel:  16%|#6        | 9/55 [00:12<00:59,  1.29s/it]

Parsing Thread The Last Angel:  18%|#8        | 10/55 [00:13<00:54,  1.21s/it]

Parsing Thread The Last Angel:  20%|##        | 11/55 [00:15<00:57,  1.30s/it]

Parsing Thread The Last Angel:  22%|##1       | 12/55 [00:15<00:50,  1.17s/it]

Parsing Thread The Last Angel:  24%|##3       | 13/55 [00:16<00:46,  1.10s/it]

Parsing Thread The Last Angel:  25%|##5       | 14/55 [00:18<00:51,  1.26s/it]

Parsing Thread The Last Angel:  27%|##7       | 15/55 [00:19<00:47,  1.19s/it]

Parsing Thread The Last Angel:  29%|##9       | 16/55 [00:20<00:47,  1.21s/it]

Parsing Thread The Last Angel:  31%|###       | 17/55 [00:22<00:48,  1.29s/it]

Parsing Thread The Last Angel:  33%|###2      | 18/55 [00:23<00:48,  1.30s/it]

Parsing Thread The Last Angel:  35%|###4      | 19/55 [00:25<00:49,  1.37s/it]

Parsing Thread The Last Angel:  36%|###6      | 20/55 [00:26<00:44,  1.26s/it]

Parsing Thread The Last Angel:  38%|###8      | 21/55 [00:27<00:45,  1.34s/it]

Parsing Thread The Last Angel:  40%|####      | 22/55 [00:28<00:44,  1.34s/it]

Parsing Thread The Last Angel:  42%|####1     | 23/55 [00:29<00:39,  1.24s/it]

Parsing Thread The Last Angel:  44%|####3     | 24/55 [00:31<00:41,  1.34s/it]

Parsing Thread The Last Angel:  45%|####5     | 25/55 [00:32<00:36,  1.22s/it]

Parsing Thread The Last Angel:  47%|####7     | 26/55 [00:33<00:35,  1.21s/it]

Parsing Thread The Last Angel:  49%|####9     | 27/55 [00:34<00:34,  1.23s/it]

Parsing Thread The Last Angel:  51%|#####     | 28/55 [00:36<00:33,  1.26s/it]

Parsing Thread The Last Angel:  53%|#####2    | 29/55 [00:37<00:30,  1.17s/it]

Parsing Thread The Last Angel:  55%|#####4    | 30/55 [00:38<00:33,  1.33s/it]

Parsing Thread The Last Angel:  56%|#####6    | 31/55 [00:40<00:33,  1.38s/it]

Parsing Thread The Last Angel:  58%|#####8    | 32/55 [00:41<00:29,  1.30s/it]

Parsing Thread The Last Angel:  60%|######    | 33/55 [00:42<00:26,  1.22s/it]

Parsing Thread The Last Angel:  62%|######1   | 34/55 [00:44<00:28,  1.35s/it]

Parsing Thread The Last Angel:  64%|######3   | 35/55 [00:45<00:24,  1.24s/it]

Parsing Thread The Last Angel:  65%|######5   | 36/55 [00:46<00:23,  1.23s/it]

Parsing Thread The Last Angel:  67%|######7   | 37/55 [00:47<00:22,  1.24s/it]

Parsing Thread The Last Angel:  69%|######9   | 38/55 [00:49<00:22,  1.30s/it]

Parsing Thread The Last Angel:  71%|#######   | 39/55 [00:50<00:22,  1.38s/it]

Parsing Thread The Last Angel:  73%|#######2  | 40/55 [00:52<00:22,  1.48s/it]

Parsing Thread The Last Angel:  75%|#######4  | 41/55 [00:53<00:18,  1.33s/it]

Parsing Thread The Last Angel:  76%|#######6  | 42/55 [00:54<00:16,  1.24s/it]

Parsing Thread The Last Angel:  78%|#######8  | 43/55 [00:55<00:15,  1.26s/it]

Parsing Thread The Last Angel:  80%|########  | 44/55 [00:57<00:14,  1.30s/it]

Parsing Thread The Last Angel:  82%|########1 | 45/55 [00:58<00:12,  1.26s/it]

Parsing Thread The Last Angel:  84%|########3 | 46/55 [00:59<00:12,  1.36s/it]

Parsing Thread The Last Angel:  85%|########5 | 47/55 [01:01<00:11,  1.39s/it]

Parsing Thread The Last Angel:  87%|########7 | 48/55 [01:02<00:10,  1.45s/it]

Parsing Thread The Last Angel:  89%|########9 | 49/55 [01:04<00:09,  1.52s/it]

Parsing Thread The Last Angel:  91%|######### | 50/55 [01:05<00:07,  1.40s/it]

Parsing Thread The Last Angel:  93%|#########2| 51/55 [01:07<00:05,  1.43s/it]

Parsing Thread The Last Angel:  95%|#########4| 52/55 [01:08<00:04,  1.35s/it]

Parsing Thread The Last Angel:  96%|#########6| 53/55 [01:09<00:02,  1.23s/it]

Parsing Thread The Last Angel:  98%|#########8| 54/55 [01:10<00:01,  1.28s/it]

Parsing Thread The Last Angel: 100%|##########| 55/55 [01:11<00:00,  1.31s/it]
Parsing Thread The Last Angel: Ascension:   0%|          | 0/92 [00:00<?, ?it/s]

Parsing Thread The Last Angel: Ascension:   1%|1         | 1/92 [00:01<02:25,  1.60s/it]

Parsing Thread The Last Angel: Ascension:   2%|2         | 2/92 [00:03<02:21,  1.57s/it]

Parsing Thread The Last Angel: Ascension:   3%|3         | 3/92 [00:04<01:57,  1.32s/it]

Parsing Thread The Last Angel: Ascension:   4%|4         | 4/92 [00:05<01:51,  1.27s/it]

Parsing Thread The Last Angel: Ascension:   5%|5         | 5/92 [00:06<02:01,  1.39s/it]

Parsing Thread The Last Angel: Ascension:   7%|6         | 6/92 [00:08<02:06,  1.47s/it]

Parsing Thread The Last Angel: Ascension:   8%|7         | 7/92 [00:09<01:56,  1.37s/it]

Parsing Thread The Last Angel: Ascension:   9%|8         | 8/92 [00:10<01:47,  1.28s/it]

Parsing Thread The Last Angel: Ascension:  10%|9         | 9/92 [00:12<01:57,  1.42s/it]

Parsing Thread The Last Angel: Ascension:  11%|#         | 10/92 [00:13<01:46,  1.29s/it]

Parsing Thread The Last Angel: Ascension:  12%|#1        | 11/92 [00:14<01:36,  1.20s/it]

Parsing Thread The Last Angel: Ascension:  13%|#3        | 12/92 [00:15<01:38,  1.23s/it]

Parsing Thread The Last Angel: Ascension:  14%|#4        | 13/92 [00:17<01:41,  1.28s/it]

Parsing Thread The Last Angel: Ascension:  15%|#5        | 14/92 [00:18<01:37,  1.25s/it]

Parsing Thread The Last Angel: Ascension:  16%|#6        | 15/92 [00:19<01:38,  1.28s/it]

Parsing Thread The Last Angel: Ascension:  17%|#7        | 16/92 [00:20<01:28,  1.17s/it]

Parsing Thread The Last Angel: Ascension:  18%|#8        | 17/92 [00:22<01:36,  1.28s/it]

Parsing Thread The Last Angel: Ascension:  20%|#9        | 18/92 [00:23<01:27,  1.18s/it]

Parsing Thread The Last Angel: Ascension:  21%|##        | 19/92 [00:24<01:37,  1.33s/it]

Parsing Thread The Last Angel: Ascension:  22%|##1       | 20/92 [00:26<01:32,  1.29s/it]

Parsing Thread The Last Angel: Ascension:  23%|##2       | 21/92 [00:27<01:29,  1.26s/it]

Parsing Thread The Last Angel: Ascension:  24%|##3       | 22/92 [00:28<01:27,  1.25s/it]

Parsing Thread The Last Angel: Ascension:  25%|##5       | 23/92 [00:29<01:22,  1.19s/it]

Parsing Thread The Last Angel: Ascension:  26%|##6       | 24/92 [00:30<01:20,  1.18s/it]

Parsing Thread The Last Angel: Ascension:  27%|##7       | 25/92 [00:32<01:26,  1.29s/it]

Parsing Thread The Last Angel: Ascension:  28%|##8       | 26/92 [00:33<01:20,  1.23s/it]

Parsing Thread The Last Angel: Ascension:  29%|##9       | 27/92 [00:34<01:12,  1.11s/it]

Parsing Thread The Last Angel: Ascension:  30%|###       | 28/92 [00:35<01:22,  1.29s/it]

Parsing Thread The Last Angel: Ascension:  32%|###1      | 29/92 [00:36<01:11,  1.14s/it]

Parsing Thread The Last Angel: Ascension:  33%|###2      | 30/92 [00:38<01:16,  1.23s/it]

Parsing Thread The Last Angel: Ascension:  34%|###3      | 31/92 [00:39<01:15,  1.23s/it]

Parsing Thread The Last Angel: Ascension:  35%|###4      | 32/92 [00:40<01:11,  1.19s/it]

Parsing Thread The Last Angel: Ascension:  36%|###5      | 33/92 [00:41<01:10,  1.19s/it]

Parsing Thread The Last Angel: Ascension:  37%|###6      | 34/92 [00:42<01:11,  1.23s/it]

Parsing Thread The Last Angel: Ascension:  38%|###8      | 35/92 [00:44<01:11,  1.26s/it]