# Parse the information of ACC


Parse the information from the [ACC Site Index](https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index) and save it to a CSV file.


In [None]:
import requests
import json
from bs4 import BeautifulSoup, Tag
import csv

from typing import Any, cast
import re

With this function we clean our data: it replaces special whitespace characters (such as non-breaking spaces), collapses repeated blank lines, and trims the surrounding whitespace.


In [None]:
# clean function
def clean(text: Any) -> str:
    """Convert *text* to a string and normalise its whitespace.

    Args:
        text: ``None``, a string, a BeautifulSoup ``Tag`` (its text content
            is used), or any other object (converted with ``str()``).

    Returns:
        The cleaned string, or ``""`` when *text* is ``None``.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # Tags contribute their text content; anything else is stringified.
        text = text.get_text() if isinstance(text, Tag) else str(text)

    # Escape sequences keep these invisible characters visible in the source:
    # non-breaking space -> space, zero-width space -> removed,
    # hair space -> space.
    text = text.replace("\u00a0", " ").replace("\u200b", "").replace("\u200a", " ")
    # Collapse runs of blank (whitespace-only) lines into one blank line.
    text = re.sub(r"(\n\s*)+\n", "\n\n", text)
    # Drop spaces that trail at the end of a line.
    text = re.sub(r" +\n", "\n", text)
    # Any remaining Windows line break becomes a single space.
    text = re.sub(r"\r\n", " ", text)
    return text.strip()


In [None]:
class Selectors:
    """Bundle of the four CSS selectors that ``get_data`` reads from."""

    def __init__(self, header, sub_header, link, text):
        """Store each selector unchanged under the attribute of the same name."""
        (self.header, self.sub_header, self.link, self.text) = (
            header,
            sub_header,
            link,
            text,
        )

The `get_data` function extracts the data from the parsed page and collects it into a list of rows.


In [None]:

def get_data(soup: BeautifulSoup, selectors: Selectors) -> list:
    """Extract link rows from the ``p.MsoNormal`` paragraphs of *soup*.

    Paragraphs are visited in document order. A paragraph matching the
    sub-header selector updates the current sub-header; one matching the
    header selector updates the current header and resets the sub-header;
    one containing both a link and a link-text match yields a row.

    Args:
        soup: Parsed page to read from.
        selectors: CSS selectors for header, sub-header, link and link text.

    Returns:
        A list of ``[header, sub_header, title, url]`` rows. ``header`` and
        ``sub_header`` are ``None`` until the first matching paragraph has
        been seen.
    """
    cur_header = None
    cur_sub_header = None
    rows = []  # header, subheader, title, url

    for elem in soup.select("p.MsoNormal"):
        # The sub-header is tested first, so it wins when a paragraph
        # matches both the sub-header and the header selector.
        sub_header_tag = elem.select_one(selectors.sub_header)
        if sub_header_tag is not None:
            cur_sub_header = clean(sub_header_tag.text)
            continue

        header_tag = elem.select_one(selectors.header)
        if header_tag is not None:
            cur_header = clean(header_tag.text)
            cur_sub_header = None  # a new header starts a fresh section
            continue

        link_tag = elem.select_one(selectors.link)
        if link_tag is None:
            continue
        text_tag = elem.select_one(selectors.text)
        if text_tag is None:
            # A link without a matching text element produces no row.
            continue

        href = link_tag.get_attribute_list("href")[0]
        rows.append([cur_header, cur_sub_header, clean(text_tag.text), clean(href)])

    return rows


## Data from ACC Site Index


In [None]:
# CSS selectors used when crawling the ACC site index page.
selectors = Selectors(
    header='span[style="font-size:18.0pt"]',
    sub_header="b > i",
    link="a",
    text="a > span",
)

def crawl_acm_index(selectors: Selectors):
    """Download the ACC site index page and extract its rows.

    Args:
        selectors: CSS selectors passed through to ``get_data``.

    Returns:
        The list of rows produced by ``get_data``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Plain string: a trailing comma here would silently create a tuple.
    parser = "html.parser"
    url = "https://missionaries.prod.byu-pathway.psdops.com/ACC-site-index"
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, parser)
    return get_data(soup, selectors)

In [None]:
# Crawl the ACC site index and pretty-print the rows for a quick visual check.
data = crawl_acm_index(selectors)
print(json.dumps(data, indent=2))

Now we will save our data into the `acm_site.csv` file.


In [None]:
# Write the ACC rows to CSV. newline="" leaves line endings to the csv
# module; the explicit UTF-8 encoding keeps non-ASCII text from failing or
# being mis-encoded on platforms whose default encoding is not UTF-8.
with open("../CSVs/acm_site.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

# Missionary Services Site Index


In [None]:
# CSS selectors used when crawling the missionary services site index page.
# NOTE: this rebinds the module-level name `selectors`.
selectors = Selectors(
    header="b > span",
    sub_header='span[style="font-size:16.0pt;line-height:150%"]',
    link="a",
    text="a > span",
)

def crawl_missionary_index(selectors: Selectors):
    """Download the missionary services site index page and extract its rows.

    Args:
        selectors: CSS selectors passed through to ``get_data``.

    Returns:
        The list of rows produced by ``get_data``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Plain string: a trailing comma here would silently create a tuple.
    parser = "html.parser"
    url = "https://missionaries.prod.byu-pathway.psdops.com/missionary-services-site-index"
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, parser)
    return get_data(soup, selectors)


In [None]:
# Crawl the missionary services index and pretty-print the rows for a quick
# visual check.
data2 = crawl_missionary_index(selectors)
print(json.dumps(data2, indent=2))

Now we will save our data into the `missionary.csv` file.


In [None]:
# Write the missionary rows to CSV. newline="" leaves line endings to the
# csv module; the explicit UTF-8 encoding keeps non-ASCII text from failing
# or being mis-encoded on platforms whose default encoding is not UTF-8.
with open("../CSVs/missionary.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # NOTE(review): the first scraped row is skipped on purpose here;
    # presumably it is not a real content entry. Confirm against the page.
    writer.writerows(data2[1:])