In [1]:
import json
import os
import re
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
import requests

In [2]:
# Configuration: every value the crawl below reads lives in this cell.

# Conference years to crawl; range() excludes the stop value, so 1980..2022.
years = range(1980, 2023)
# Month segments of the conference URLs.
months = ["04", "10"]
# Site root used to build the conference directory URLs.
host = "https://www.churchofjesuschrist.org"
# Directory the per-talk JSON files are written to.
base_dir = "../data"
# Parser name handed to BeautifulSoup.
bs_parser = "html.parser"
# Encoding forced onto every HTTP response before its text is read.
encoding = "utf-8"
# Seconds to sleep between talk downloads.
seconds_delay = 30

# Request headers sent with every page fetch; the values identify the client
# as Chrome 105 on Linux.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Sec-Ch-Ua": '"Google Chrome";v="105", "Not)A;Brand";v="8", "Chromium";v="105"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Linux"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/537.36"
    ),
}


In [3]:
class HrefConverter(MarkdownConverter):
    """
    MarkdownConverter that resolves every link's href against a base URL.

    Pass ``base_url`` as a keyword option. The href of each ``<a>`` element
    is joined with it before the link is rendered, so relative links come
    out absolute. When ``base_url`` is omitted it defaults to ``''`` and
    hrefs are left unchanged. All options, ``base_url`` included, are also
    forwarded to ``MarkdownConverter``.
    """
    def __init__(self, *args, **kwargs):
        super(HrefConverter, self).__init__(*args, **kwargs)
        self.base_url = kwargs.get('base_url', '')

    def convert_a(self, el, text, *args, **kwargs):
        """
        Rewrite the element's href to an absolute URL, then render the link.

        Any arguments after ``text`` are passed through untouched. The name
        of that trailing argument differs between markdownify releases, and
        some releases pass it by keyword, so it must not be named here.
        """
        if 'href' in el.attrs:
            # Mutates the parsed element so the parent class renders the
            # joined URL.
            el['href'] = urljoin(self.base_url, el['href'])
        return super().convert_a(el, text, *args, **kwargs)


# Convenience wrapper: one call converts HTML using HrefConverter
def md(html, **options):
    """Convert ``html`` to Markdown with a HrefConverter built from ``options``."""
    converter = HrefConverter(**options)
    return converter.convert(html)

In [4]:
def get_page(url, headers, encoding, timeout=60):
    """
    Fetch ``url`` with an HTTP GET and return ``(status_code, text)``.

    Parameters:
        url: address to request.
        headers: dict of request headers to send.
        encoding: if truthy, overrides the encoding used to decode the
            response body; otherwise the encoding chosen by requests is kept.
        timeout: seconds to wait for the server before giving up. Without a
            timeout requests waits indefinitely, which would stall a long
            crawl on a single dead connection.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            including ``Timeout`` when ``timeout`` is exceeded.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if encoding:
        response.encoding = encoding
    return response.status_code, response.text

In [5]:
def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 5 components and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')
    
def get_talk_urls(base_url, html):
    """
    Return the absolute talk URLs linked from a conference directory page.

    Every ``<a href>`` in ``html`` is resolved against ``base_url`` and kept
    only if ``_is_talk_url`` accepts it. Each URL appears once, in the order
    it is first seen on the page.
    """
    soup = BeautifulSoup(html, bs_parser)
    # Resolve each href once and reuse the result for both filter and output.
    absolute_urls = (urljoin(base_url, a['href']) for a in soup.find_all('a', href=True))
    # dict.fromkeys drops repeated links while preserving first-seen order.
    return list(dict.fromkeys(u for u in absolute_urls if _is_talk_url(u)))

In [6]:
def _clean(text):
    return text.replace(' ', ' ')


def get_talk_info(url, html):
    """
    Extract the metadata and body of a talk page.

    ``year`` and ``month`` are taken from the 4th and 5th items of the URL
    path split on '/'. Title, author and author role are read from the
    page's ``<article>``; the body block is converted to Markdown with links
    resolved against ``url``. Any element missing from the page yields an
    empty string. The raw ``html`` is stored alongside the parsed fields.
    """
    def _text_or_blank(node):
        # Elements that were not found map to an empty string.
        return _clean(node.text) if node else ''

    segments = urlparse(url).path.split('/')
    year, month = segments[3:5]

    page = BeautifulSoup(html, bs_parser)
    heading = page.select_one('article header h1')
    speaker = page.select_one('article p.author-name')
    speaker_role = page.select_one('article p.author-role')
    body_block = page.select_one('article div.body-block')

    if body_block:
        markdown = _clean(md(str(body_block), base_url=url))
    else:
        markdown = ''

    info = {}
    info['year'] = year
    info['month'] = month
    info['url'] = url
    info['title'] = _text_or_blank(heading)
    info['author'] = _text_or_blank(speaker)
    info['author_role'] = _text_or_blank(speaker_role)
    info['content'] = markdown
    info['html'] = html
    return info

In [7]:
def get_talk_path(url):
    """
    Map a talk URL to its JSON file path under ``base_dir``.

    The 4th, 5th and 6th items of the URL path split on '/' (year, month
    and title slug) form the file name ``<year>-<month>-<title>.json``.
    """
    segments = urlparse(url).path.split('/')
    year, month, title = segments[3:6]
    filename = "-".join((year, month, title)) + ".json"
    return os.path.join(base_dir, filename)


def write_talk_info(path, talk_info):
    """
    Write ``talk_info`` to ``path`` as indented UTF-8 JSON.

    The parent directory of ``path`` is created when it does not exist, so
    the first write does not fail on a missing output directory.
    Non-ASCII characters are written as-is rather than as \\u escapes.
    """
    parent = os.path.dirname(path)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(talk_info, f, ensure_ascii=False, indent=2)

In [None]:
# Crawl every configured conference and save each talk as a JSON file.
# Talks whose JSON file already exists are skipped, so this cell can be
# re-run to resume an interrupted crawl.
for year in years:
    for month in months:
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        print(dir_url)
        status_code, dir_html = get_page(dir_url, headers, encoding)
        if status_code != 200:
            print(f"Status code={status_code} url={dir_url}")
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            if os.path.exists(path):
                continue
            print("    ", path)
            status_code, talk_html = get_page(talk_url, headers, encoding)
            if status_code == 200:
                talk_info = get_talk_info(talk_url, talk_html)
                write_talk_info(path, talk_info)
            else:
                # Report the talk URL that failed, not the directory page.
                print(f"Status code={status_code} url={talk_url}")
            # Throttle after every talk request, failed ones included, so a
            # run of error responses is not followed by back-to-back requests.
            time.sleep(seconds_delay)

https://www.churchofjesuschrist.org/study/general-conference/1980/04?lang=eng
     ../data/1980-04-no-unhallowed-hand-can-stop-the-work.json
     ../data/1980-04-preparing-the-way.json
     ../data/1980-04-he-is-not-here-he-is-risen.json
     ../data/1980-04-communion-with-the-holy-spirit.json
     ../data/1980-04-celestial-marriages-and-eternal-families.json
     ../data/1980-04-church-finance-committee-report.json
     ../data/1980-04-introduction-to-the-proclamation.json
     ../data/1980-04-proclamation.json
     ../data/1980-04-remarks-and-dedication-of-the-fayette-new-york-buildings.json
     ../data/1980-04-what-hath-god-wrought-through-his-servant-joseph.json
     ../data/1980-04-a-tribute-to-the-rank-and-file-of-the-church.json
     ../data/1980-04-the-book-of-mormon.json
     ../data/1980-04-where-do-we-stand.json
     ../data/1980-04-the-coming-tests-and-trials-and-glory.json
     ../data/1980-04-nauvoo-a-demonstration-of-faith.json
     ../data/1980-04-self-accountability-a

     ../data/1981-10-living-welfare-principles.json
     ../data/1981-10-charity-never-faileth.json
     ../data/1981-10-relief-society-in-times-of-transition.json
     ../data/1981-10-an-opportunity-for-continual-learning.json
     ../data/1981-10-relief-society-in-welfare.json
     ../data/1981-10-the-honored-place-of-woman.json
https://www.churchofjesuschrist.org/study/general-conference/1982/04?lang=eng
     ../data/1982-04-remember-the-mission-of-the-church.json
     ../data/1982-04-the-resurrection-of-jesus.json
     ../data/1982-04-this-is-no-harm.json
     ../data/1982-04-beginning-again.json
     ../data/1982-04-we-believe-in-being-honest.json
     ../data/1982-04-church-audit-committee-report.json
     ../data/1982-04-statistical-report-1981.json
     ../data/1982-04-true-greatness.json
     ../data/1982-04-a-lasting-marriage.json
     ../data/1982-04-pondering-strengthens-the-spiritual-life.json
     ../data/1982-04-spiritual-guides-for-teachers-of-righteousness.json
     ..

     ../data/1983-10-a-season-for-strength.json
     ../data/1983-10-prepare-to-teach-his-children.json
     ../data/1983-10-agency-and-accountability.json
https://www.churchofjesuschrist.org/study/general-conference/1984/04?lang=eng
     ../data/1984-04-the-sustaining-of-church-officers.json
     ../data/1984-04-counsel-to-the-saints.json
     ../data/1984-04-choose-the-good-part.json
     ../data/1984-04-marriage-and-divorce.json
     ../data/1984-04-jesus-the-christ-the-words-and-their-meaning.json
     ../data/1984-04-building-your-eternal-home.json
     ../data/1984-04-the-church-audit-committee-report.json
     ../data/1984-04-statistical-report-1983.json
     ../data/1984-04-the-great-plan-of-the-eternal-god.json
     ../data/1984-04-covenants-ordinances-and-service.json
     ../data/1984-04-a-generation-prepared-to-make-wise-choices.json
     ../data/1984-04-coming-through-the-mists.json
     ../data/1984-04-warmed-by-the-fires-of-their-lives.json
     ../data/1984-04-home-and-