# Parse the information of ACC


Parse the information from the [ACC Site](https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index) index into a CSV file.


In [12]:
import requests
import json
from bs4 import BeautifulSoup, Tag
import csv

from typing import Any, cast
import re

With this function we clean our data: it replaces unusual space characters (non-breaking, zero-width and hair spaces), collapses repeated blank lines, and trims surrounding whitespace.


In [13]:
# clean function
def clean(text: Any) -> str:
    """Convert *text* to a string and normalise its whitespace.

    Args:
        text: ``None``, a BeautifulSoup ``Tag`` (its text content is used),
            a string, or any other object (converted with ``str()``).

    Returns:
        The cleaned string with surrounding whitespace stripped, or ``""``
        when *text* is ``None``.
    """
    if text is None:
        return ""
    if isinstance(text, Tag):
        text = text.get_text()
    if not isinstance(text, str):
        text = str(text)
    # The special characters are written as escapes so that the invisible
    # code points cannot be silently lost or altered by an editor or export.
    text = (
        text.replace("\u00a0", " ")  # non-breaking space -> normal space
        .replace("\u200b", "")  # zero-width space -> removed
        .replace("\u200a", " ")  # hair space -> normal space
    )
    # Squeeze any run of blank / whitespace-only lines into one blank line.
    text = re.sub(r"(\n\s*)+\n", "\n\n", text)
    # Drop spaces that trail at the end of a line.
    text = re.sub(r" +\n", "\n", text)
    # Any CRLF pair still present becomes a single space.
    text = re.sub(r"\r\n", " ", text)
    return cast(str, text.strip())


The `Selectors` class holds the CSS selectors for the columns that we want to extract from the site.

In [14]:
class Selectors:
    """Bundle of CSS selectors describing one site-index page layout.

    Each attribute is a selector string that ``get_data`` applies to every
    paragraph of the page:

    * ``header``     -- matches a paragraph that starts a new section.
    * ``sub_header`` -- matches a paragraph that starts a new subsection.
    * ``link``       -- matches the anchor whose ``href`` becomes the URL.
    * ``text``       -- matches the element whose text becomes the title.
    """

    def __init__(self, header: str, sub_header: str, link: str, text: str) -> None:
        self.header = header
        self.sub_header = sub_header
        self.link = link
        self.text = text

    def __repr__(self) -> str:
        """Return a constructor-style representation, handy when debugging."""
        return (
            f"{type(self).__name__}(header={self.header!r}, "
            f"sub_header={self.sub_header!r}, link={self.link!r}, "
            f"text={self.text!r})"
        )

The `get_data` function extracts the data from the parsed page and returns it as a list of rows.

In [15]:

def get_data(soup: BeautifulSoup, selectors: Selectors) -> list:
    """Extract the index rows from a parsed page.

    Every ``p.MsoNormal`` paragraph is visited in document order and
    classified with the given selectors, checked in this order:

    1. sub-header paragraph -> becomes the current subsection;
    2. header paragraph     -> becomes the current section and resets the
       current subsection;
    3. link paragraph       -> emits one row, provided the title element
       is also present.

    Args:
        soup: The parsed page.
        selectors: CSS selectors for the header, sub-header, link and
            link-text elements.

    Returns:
        A list of ``[header, sub_header, title, url]`` lists. ``header`` and
        ``sub_header`` are ``None`` until the first matching paragraph has
        been seen (and ``sub_header`` again after each new header).
    """
    cur_header = None
    cur_sub_header = None
    rows = []  # header, subheader, title, url

    for elem in soup.select("p.MsoNormal"):
        # Each selector is evaluated at most once per paragraph;
        # select_one() returns the first match or None.
        sub_header_tag = elem.select_one(selectors.sub_header)
        if sub_header_tag is not None:
            cur_sub_header = clean(sub_header_tag.text)
            continue

        header_tag = elem.select_one(selectors.header)
        if header_tag is not None:
            cur_header = clean(header_tag.text)
            cur_sub_header = None  # a new section starts without a subsection
            continue

        link_tag = elem.select_one(selectors.link)
        if link_tag is None:
            continue
        text_tag = elem.select_one(selectors.text)
        if text_tag is None:
            # A link without the expected title element is not an index entry.
            continue

        # save the row; a missing href is None, which clean() turns into "".
        rows.append(
            [
                cur_header,
                cur_sub_header,
                clean(text_tag.text),
                clean(link_tag.get("href")),
            ]
        )

    return rows


In [16]:
def crawl_index(url, selectors: Selectors, timeout: float = 30):
    """Download a site-index page and return its parsed rows.

    Args:
        url: Address of the index page to download.
        selectors: CSS selectors describing the page layout.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The list of rows produced by ``get_data``.

    Raises:
        requests.HTTPError: If the server answers with an error status,
            instead of silently parsing the error page.
        requests.RequestException: On connection problems or timeout.
    """
    # A plain string: the original trailing comma made this a 1-tuple.
    parser = "html.parser"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, parser)
    return get_data(soup, selectors)

## Data from ACM Site Index


In [17]:
# Layout of the ACM site index: section headers are 18pt spans,
# subsections are bold-italic, and entries are anchors wrapping a span.
acm_selectors = Selectors(
    header='span[style="font-size:18.0pt"]',
    sub_header="b > i",
    link="a",
    text="a > span",
)

In [18]:
# Download the ACM site index, parse it into rows, and print the rows
# as indented JSON for a quick visual check.
acm_url = "https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index"
acm_data = crawl_index(acm_url, acm_selectors)
print(json.dumps(acm_data, indent=2))

[
  [
    "Area Coordination (For ACMs Only)",
    "ACC Council",
    "ACM Council Agendas",
    "https://office365lds.sharepoint.com/sites/BYU-PWAreaCoordination/SitePages/ACM-Council-Agendas.aspx#:~:text=Upcoming%20Agendas"
  ],
  [
    "Block",
    null,
    "International Area Transitioning to Block a Guide",
    "https://missionaries.prod.byu-pathway.psdops.com/International-Area-Transitioning-to-Block-a-Guide"
  ],
  [
    "Block",
    null,
    "PathwayConnect",
    "https://missionaries.prod.byu-pathway.psdops.com/PathwayConnect-Curriculum-and-Block"
  ],
  [
    "Block",
    null,
    "PC 103 Block Pilot Spring 2022",
    "https://missionaries.prod.byu-pathway.psdops.com/PC-103-Block-Pilot-Spring-2022"
  ],
  [
    "Missionary Processes",
    "Vetting ACMs",
    "ACM Vetting Recommendation Instructions",
    "https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions"
  ],
  [
    "Missionary Processes",
    "Vetting New Missioanries",
    "Vettin

Now we will save our data into the `acm_site.csv` file.


In [19]:
# The encoding is pinned to UTF-8 so non-ASCII titles are written the same
# way on every platform instead of depending on the locale default.
# newline="" lets the csv module control line endings itself.
with open("../CSVs/acm_site.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # write headers
    writer.writerow(["Section", "Subsection", "Title", "URL"])
    writer.writerows(acm_data)

# Missionary Services Site Index


In [20]:
# Layout of the Missionary Services site index: section headers are bold
# spans, subsections are 16pt spans, and entries are anchors wrapping a span.
missionary_selectors = Selectors(
    header='b > span',
    sub_header='span[style="font-size:16.0pt;line-height:150%"]',
    link="a",
    text="a > span",
)

In [21]:
# Download the Missionary Services site index, parse it into rows, and
# print the rows as indented JSON for a quick visual check.
missionary_url = "https://missionaries.prod.byu-pathway.psdops.com/missionary-services-site-index"
missionary_data = crawl_index(missionary_url, missionary_selectors)
print(json.dumps(missionary_data, indent=2))

[
  [
    "Missionary Services Site Index",
    null,
    "on Android",
    "https://missionaries.prod.byu-pathway.psdops.com/android-search-index"
  ],
  [
    "BYU-Pathway Support Knowledge Articles",
    null,
    "Search the Support Knowledge base",
    "https://pathway-missionary.powerappsportals.com/"
  ],
  [
    "BYU-Pathway Support Knowledge Articles",
    null,
    "Using BYU-Pathway Support Knowledge Articles",
    "https://missionaries.prod.byu-pathway.psdops.com/Using-BYUPW-Support-Knowledge-Articles"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "5 things to know about BYU-Pathway Worldwide",
    "https://www.byupathway.edu/articles/feature/5-things-about-byupw"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "BYU-Idaho Learning Model",
    "https://www.byui.edu/learning-model/"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "BYU-Pathway Student Honor Code",
    "https://www

Now we will save our data into the `missionary.csv` file.


In [22]:
# The encoding is pinned to UTF-8 so non-ASCII titles are written the same
# way on every platform instead of depending on the locale default.
# newline="" lets the csv module control line endings itself.
with open("../CSVs/missionary.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # write headers
    writer.writerow(["Section", "Subsection", "Title", "URL"])
    # NOTE(review): the first scraped row is skipped. In the printed output
    # it is the row picked up under the page title ("Missionary Services
    # Site Index" / "on Android"), presumably not a real index entry;
    # confirm this still holds if the page layout changes.
    writer.writerows(missionary_data[1:])