# Parse the information of ACC


Parse the information from the [ACC Site Index](https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index) into a CSV file.


In [1]:
import requests
import json
from bs4 import BeautifulSoup, Tag
import csv

from typing import Any, cast
import re


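Here we set the general variables: the HTML parser name and the CSS selectors used to locate headers, sub-headers, links, and link text.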


In [2]:
# Parser backend name handed to BeautifulSoup when parsing a response.
parser = "html.parser"
# CSS selector for section headers: <span> elements whose style attribute
# is exactly "font-size:18.0pt".
header = 'span[style="font-size:18.0pt"]'
# CSS selector for sub-headers: an <i> that is a direct child of a <b>.
sub_header = "b > i"
# CSS selector for hyperlinks.
link = "a"
# CSS selector for a link's label: a <span> that is a direct child of an <a>.
text = "a > span"


With this function we clean our data: it replaces or removes invisible and unusual whitespace characters and trims the surrounding whitespace.


In [3]:
# clean function
def clean(text: Any) -> str:
    """Convert *text* to a string and normalise its whitespace.

    Args:
        text: ``None``, a ``str``, a BeautifulSoup ``Tag`` (its rendered
            text is used), or any other object (converted with ``str()``).

    Returns:
        The cleaned string; ``""`` when *text* is ``None``.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # Tags contribute their rendered text; anything else is stringified.
        text = text.get_text() if isinstance(text, Tag) else str(text)

    # The special characters are written as escapes so they stay visible in
    # the source and cannot be silently normalised away by an editor:
    #   U+00A0 non-breaking space -> regular space
    #   U+200B zero-width space   -> removed
    #   U+200A hair space         -> regular space
    text = (
        text.replace("\u00a0", " ")
        .replace("\u200b", "")
        .replace("\u200a", " ")
    )
    # Collapse a newline followed by whitespace-only lines into one blank line.
    text = re.sub(r"(\n\s*)+\n", "\n\n", text)
    # Drop trailing spaces at the end of a line.
    text = re.sub(r" +\n", "\n", text)
    # Any remaining Windows line break becomes a single space.
    text = re.sub(r"\r\n", " ", text)
    return cast(str, text.strip())


The `get_data` function extracts the data from the parsed page and returns it as a list of rows.


In [4]:
def get_data(soup: Any) -> list:
    """Extract link rows from a parsed site-index page.

    Walks every ``p.MsoNormal`` paragraph in document order. A paragraph
    matching the ``header`` selector starts a new section and resets the
    current sub-header; one matching ``sub_header`` starts a new
    sub-section; one containing a link produces a row using the first
    link in that paragraph.

    Args:
        soup: Parsed document exposing ``select`` (e.g. a ``BeautifulSoup``
            object).

    Returns:
        A list of ``[header, sub_header, title, url]`` rows. ``header`` and
        ``sub_header`` are ``None`` until the first matching paragraph has
        been seen.
    """
    cur_header = None
    cur_sub_header = None
    rows = []  # each row: [header, sub_header, title, url]

    for elem in soup.select("p.MsoNormal"):
        header_node = elem.select_one(header)
        if header_node is not None:
            cur_header = clean(header_node.text)
            # A new section invalidates the previous sub-section.
            cur_sub_header = None
            continue

        sub_header_node = elem.select_one(sub_header)
        if sub_header_node is not None:
            cur_sub_header = clean(sub_header_node.text)
            continue

        anchor = elem.select_one(link)
        if anchor is None:
            # Paragraph carries no header, sub-header, or link: nothing to do.
            continue

        # Prefer the <span> label inside the link; fall back to the link's
        # own text so a link without a span no longer raises IndexError.
        title_node = elem.select_one(text)
        if title_node is None:
            title_node = anchor

        href = anchor.get_attribute_list("href")[0]

        # save the row
        rows.append(
            [cur_header, cur_sub_header, clean(title_node.text), clean(href)]
        )

    return rows


## Data from ACC Site Index


In [5]:
url1 = "https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index"

# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as data.
response1 = requests.get(url1, timeout=30)
response1.raise_for_status()

soup1 = BeautifulSoup(response1.content, parser)


In [6]:
# Extract the [header, sub_header, title, url] rows from the parsed ACC page.
data1 = get_data(soup1)

# Pretty-print the rows as JSON for a quick visual check.
print(json.dumps(data1, indent=2))


[
  [
    "Area Coordination (For ACMs Only)",
    "ACC Council",
    "ACM Council Agendas",
    "https://office365lds.sharepoint.com/sites/BYU-PWAreaCoordination/SitePages/ACM-Council-Agendas.aspx#:~:text=Upcoming%20Agendas"
  ],
  [
    "Block",
    null,
    "International Area Transitioning to Block a Guide.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/International-Area-Transitioning-to-Block-a-Guide"
  ],
  [
    "Block",
    null,
    "PathwayConnect Curriculum and Block.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/PathwayConnect-Curriculum-and-Block"
  ],
  [
    "Block",
    null,
    "PC 103 Block Pilot Spring 2022.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/PC-103-Block-Pilot-Spring-2022"
  ],
  [
    "Missionary Processes",
    "Vetting ACMs",
    "ACM Vetting Recommendation Instructions.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/ACM-Vetting-Recommendation-Instructions"
  ],
  [
    "Missionary Processes",
    "V

Now we will save our data into the `acc_site.csv` file.


In [7]:
# An explicit UTF-8 encoding makes the output independent of the platform's
# default encoding, so non-ASCII titles are written the same way everywhere.
# newline="" lets the csv module control line endings itself.
with open("acc_site.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data1)


## Data from Missionary Services Site Index


In [8]:
url2 = "https://missionaries.prod.byu-pathway.psdops.com/missionary-services-site-index"

# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed as data.
response2 = requests.get(url2, timeout=30)
response2.raise_for_status()

soup2 = BeautifulSoup(response2.content, parser)


In [9]:
# Extract the [header, sub_header, title, url] rows from the parsed
# missionary-services page.
data2 = get_data(soup2)

# Pretty-print the rows as JSON for a quick visual check.
print(json.dumps(data2, indent=2))


[
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "2024_Service Missionaries Flyer.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/2024_Service-Missionaries-Flyer"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "Apply For and Check Status of HJG Scholarship.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/Apply-For-and-Check-Status-of-HJG-Scholarship"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "Area Manager Assignments.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/Area-Manager-Assignments"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "Area Structure and Communication.pdf",
    "https://missionaries.prod.byu-pathway.psdops.com/Area-Structure-and-Communication"
  ],
  [
    "BYU-Pathway Worldwide",
    "BYU-Pathway Worldwide Overview",
    "BYU-Pathway History-Full.pdf",
    "https://missionaries.prod.byu-pathway.psdops.c

Now we will save our data into the `missionary.csv` file.


In [10]:
# An explicit UTF-8 encoding makes the output independent of the platform's
# default encoding, so non-ASCII titles are written the same way everywhere.
# newline="" lets the csv module control line endings itself.
with open("missionary.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data2)
