In [None]:
# !pip install requests_cache
import requests
import bs4
import contextlib
import requests_cache
import re
import csv
import sys
# Install requests_cache globally under the cache name '/tmp/cache' so that the
# requests.get calls made by later cells can be served from stored responses
# when the notebook is re-run.
# NOTE(review): backend and expiry are whatever requests_cache defaults to, and
# the path is hard-coded to /tmp -- confirm both are intended.
requests_cache.install_cache('/tmp/cache')

In [2]:
# General helpers
def load_url(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Defaults to 30 so a stalled connection cannot hang the scrape
            indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` built from the raw response bytes using the
        ``html.parser`` backend.

    Raises:
        ValueError: If the response status code is anything other than 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (including timeouts).
    """
    # closing() guarantees the response is released even if parsing raises.
    with contextlib.closing(requests.get(url, timeout=timeout)) as r:
        if r.status_code != 200:
            raise ValueError('Bad response {}'.format(r))
        return bs4.BeautifulSoup(r.content, 'html.parser')
def select_only_one(soup, selector):
    """Return the single element of ``soup`` matching the CSS ``selector``.

    Args:
        soup: Any object exposing ``select(selector)`` that returns a sequence
            of matches (a BeautifulSoup tree or tag in this notebook).
        selector: CSS selector string handed to ``soup.select``.

    Returns:
        The one matching element.

    Raises:
        ValueError: If the selector matches zero elements or more than one.
            The message reports how many were found, so a missing element is
            not mislabelled as an ambiguous one.
    """
    matches = soup.select(selector)
    if len(matches) != 1:
        raise ValueError(
            'Expected exactly one match for selector "{}", found {}'.format(selector, len(matches)))
    return matches[0]

In [46]:
# Parse titles
with open('data/bible_books.txt') as f:
    # One book name per line. Surrounding whitespace is stripped and blank
    # lines are skipped: an empty entry would become an empty alternative in
    # the PASSAGE regex built from this list and make it match any " <digits>".
    bible_books = [name for name in (line.strip() for line in f) if name]
# Extra spellings that should also be recognised as book names.
bible_books += ['1 Cor', 'Psalm', 'Proverb']
def correct_spelling(s):
    """Return ``s`` with every 'Eccelsiastes' typo rewritten as 'Ecclesiastes'."""
    misspelled = 'Eccelsiastes'
    corrected = 'Ecclesiastes'
    fixed = s.replace(misspelled, corrected)
    return fixed
# Case-insensitive pattern for a scripture reference: a book name from
# bible_books, a space, a chapter number, an optional ".verse"/":verse", and an
# optional "-<number>[.:<number>]" range end.
# Book names are passed through re.escape so they are always matched literally,
# even if an entry contains a regex metacharacter.
PASSAGE = re.compile(
    r'({}) (\d+)([.:]\d+)?(-(\d+)([.:]\d+)?)?'.format(
        '|'.join(re.escape(book) for book in bible_books)),
    re.I)
def get_name_and_passage(title):
    """Split a sermon title into ``(name, passage)``.

    The title is first run through ``correct_spelling``. The last PASSAGE
    match in it is taken as the passage (title-cased); the name is the text
    before that match with trailing ':', ' ', '(', en-dash and '-' characters
    removed. If nothing matches, the corrected title is returned with an empty
    passage.
    """
    corrected = correct_spelling(title)

    # Walk all matches, keeping only the final one.
    last_hit = None
    for last_hit in PASSAGE.finditer(corrected):
        pass

    if last_hit is None:
        return corrected, ''

    name = corrected[:last_hit.start()].rstrip(': (–-')
    passage = last_hit.group(0).title()
    return name, passage

In [6]:
# URL template for one page of the sermon archive; pages 1 through 21 are scraped.
ARCHIVE_PAGE = 'https://www.stnickschurch.org.uk/sermon-archive/page/{}'
ARCHIVE_PAGES = [ARCHIVE_PAGE.format(n) for n in range(1, 22)]
# CSS selector for the title link of each sermon entry on an archive page.
SERMON_LINK_SELECTOR = '.resurrect-entry-content .resurrect-sermon-short h1 > a'
# Load and parse every archive page exactly once, collecting (title, href)
# pairs. Deriving TITLES and LINKS from the same pass keeps them index-aligned
# and avoids fetching and parsing each page a second time.
_SERMON_ENTRIES = [(a.contents[0].strip(), a.attrs['href'])
                   for page in ARCHIVE_PAGES
                   for a in load_url(page).select(SERMON_LINK_SELECTOR)]
TITLES = [entry_title for entry_title, _entry_href in _SERMON_ENTRIES]
LINKS = [entry_href for _entry_title, entry_href in _SERMON_ENTRIES]

In [None]:
# Show every scraped title split into name and passage, sorted, one per line,
# with the name left-aligned in a 50-character column.
print(*sorted('{:<50}  {}'.format(*get_name_and_passage(title)) for title in TITLES),
      sep='\n')

In [47]:
# Value stored in the 'time' field for each "Tagged with" label found on a
# sermon page. Labels mapped to '' get an empty time.
_TAG_TIMES = {'Sunday': '11:00',
              'Midweek': '13:00',
              'Summer Small Groups': '19:00',
              'Small Groups': '19:00',
              'Guest Event': '19:00',
              'Events': '19:00',
              'Weekend Away': '',
              'Weekends Away': '',
              'Christmas 2016': '11:00'}


def get_sermon(url):
    """Scrape one sermon page into a dict of fields.

    Args:
        url: Address of the sermon page, fetched with ``load_url``.

    Returns:
        A dict with keys 'title', 'passage', 'date', 'speaker' and 'audio',
        plus 'series' and 'time' when the matching footer entries are present.
        A warning is written to stderr if no "Tagged with" footer sets 'time'.

    Raises:
        ValueError: If any expected element is missing or duplicated, the page
            has more than one "Tagged with" footer, or the tag is not listed in
            ``_TAG_TIMES``. The underlying error is chained as ``__cause__``.
    """
    soup = load_url(url)
    try:
        root = select_only_one(soup, '.resurrect-sermon-full')
        d = {}
        d['title'], d['passage'] = get_name_and_passage(
            select_only_one(root, '.resurrect-main-title').contents[0].strip())
        d['date'] = select_only_one(root, 'time').contents[0]
        d['speaker'] = select_only_one(root, '.resurrect-sermon-speaker > a').contents[0]
        # The download button is looked up by id on the whole page, not just the sermon root.
        d['audio'] = select_only_one(soup, '#resurrect-sermon-full-audio-download-button > a').attrs['href']
        footers = root.select('.resurrect-entry-footer-terms > .resurrect-content-icon')
        for footer in footers:
            if 'Series: ' in footer.contents:
                d['series'] = select_only_one(footer, 'a').contents[0]
            elif 'Tagged with ' in footer.contents:
                tag = select_only_one(footer, 'a').contents[0]
                if 'time' in d:
                    raise ValueError('Multiple "Tagged with" in footers: {}'.format(footers))
                if tag not in _TAG_TIMES:
                    # Name the offending label instead of surfacing a bare KeyError.
                    raise ValueError('Unrecognised "Tagged with" label: {!r}'.format(str(tag)))
                d['time'] = _TAG_TIMES[tag]
        if 'time' not in d:
            sys.stderr.write('\rWARNING! Time tag not found for {}\n'.format(url))
    except Exception as exc:
        # Chain explicitly so the root cause is reported alongside the URL.
        raise ValueError('Failed to parse: {}'.format(url)) from exc
    return d

In [48]:
# Sermon pages skipped when writing the TSV.
# NOTE(review): presumably these fail to parse or are handled by hand -- confirm.
EXCLUDED = {'https://www.stnickschurch.org.uk/sermons/heaven-not-get-2-luke-12-21-34/',
            'https://www.stnickschurch.org.uk/sermons/king-serves-6-mark-8-11-30/',
            'https://www.stnickschurch.org.uk/sermons/king-serves-5-mark-7-24-8-10/'}
# newline='' is what the csv module requires of its file object (otherwise
# Windows output gets "\r\r\n" line endings); the explicit UTF-8 encoding keeps
# non-ASCII titles from depending on the platform default.
with open('sermons.auto.tsv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, ['date', 'time', 'passage', 'series', 'title', 'speaker', 'audio'],
                            delimiter='\t')
    writer.writeheader()
    for n, url in enumerate(LINKS, 1):
        if url not in EXCLUDED:
            # '\r' rewrites the same stderr line to act as a progress counter.
            sys.stderr.write('\rSermon {}/{}'.format(n, len(LINKS)))
            # Keys absent from the sermon dict (e.g. 'series') are written as empty cells.
            writer.writerow(get_sermon(url))

Sermon 205/205