In [1]:
import multiprocessing.dummy as mp
import os
import re
from urllib.parse import urljoin

import html5lib
import requests
from bs4 import BeautifulSoup
# Site to scrape and the location the scraped bullets are written to.
URL = 'http://www.eprbulletsafsc.com/'
outdir = '../scrapes/'
outfile = outdir + 'eprbulletsafsc.txt'

# A timeout keeps the notebook from hanging forever if the server stalls, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
homepage = requests.get(URL, timeout=30)
homepage.raise_for_status()

# html5lib is used because the site's HTML sometimes contains errors and
# html5lib is very forgiving when parsing malformed markup.
homeSoup = BeautifulSoup(homepage.content, 'html5lib')


In [2]:
# Collect every link inside the id="page" container whose href points at an
# .htm/.html file. The pattern is a raw string so that \w and \. reach the regex
# engine untouched (in a normal string they are invalid escapes and trigger a
# warning on newer Pythons). 'html?' replaces 'htm*', which meant "ht followed by
# any number of m" rather than ".htm or .html".
pageContainer = homeSoup.find(id='page')
if pageContainer is None:
    raise RuntimeError(f"No element with id='page' found on {URL}")
pageList = pageContainer.find_all('a', href=re.compile(r'\w+\.html?'))

# Filter out the disclaimer link. Some hrefs carry a leading '/', so strip it
# before comparing; otherwise '/disclaimer.htm' would slip through.
pageList = [a for a in pageList if a['href'].lstrip('/') != 'disclaimer.htm']

# urljoin resolves relative, root-relative ('/x.htm') and absolute hrefs against
# the site URL without producing a double slash. dict.fromkeys drops duplicate
# links while keeping the order in which they appear on the home page.
pageURLs = list(dict.fromkeys(urljoin(URL, pageItem['href']) for pageItem in pageList))

In [3]:
def getBulletsInPage(pageURL, bulletDict, timeout=30):
    """Scrape one page and store its bullet statements in ``bulletDict``.

    Every text node inside the element with id="page" that starts with a dash
    followed by whitespace is treated as a bullet.

    Args:
        pageURL: URL of the page to scrape.
        bulletDict: Mutable mapping that collects results. ``bulletDict[pageURL]``
            is set to the list of stripped bullet strings found on the page, or
            to an empty list when the page has no id="page" element.
        timeout: Seconds to wait on the server before ``requests`` gives up,
            so a stalled connection cannot block the scrape forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    page = requests.get(pageURL, timeout=timeout)
    pageSoup = BeautifulSoup(page.content, 'html5lib')

    # A page without the container (e.g. an error page) yields no bullets
    # instead of raising AttributeError on None.
    pageContainer = pageSoup.find(id="page")
    if pageContainer is None:
        bulletDict[pageURL] = []
        return

    # Raw string so \s is passed to the regex engine rather than being treated
    # as an (invalid) string escape.
    bulletScrape = pageContainer.find_all(string=re.compile(r'^-\s+.*'))
    bulletDict[pageURL] = [bullet.strip() for bullet in bulletScrape]


In [4]:
# multiprocessing.dummy offers the multiprocessing API on top of threads, which
# suits this I/O-bound scrape: pages are downloaded concurrently.

# A shared dict collects the bullets scraped by every worker.
manager = mp.Manager()
resultsDict = manager.dict()

# Pool() with no arguments sizes the pool from the number of CPUs.
# The with-block guarantees the pool is cleaned up even if a worker raises;
# starmap blocks until every page has been processed.
with mp.Pool() as p:
    p.starmap(getBulletsInPage, [(pageURL, resultsDict) for pageURL in pageURLs])

# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs(outdir, exist_ok=True)

# Mode 'w' truncates any existing file. Sections are written in pageURLs order
# (not worker completion order) so repeated runs produce the same file layout;
# dict.fromkeys skips repeated URLs so each section is written once.
with open(outfile, 'w', encoding='utf-8') as fileID:
    for key in dict.fromkeys(pageURLs):
        if key not in resultsDict:
            continue
        fileID.write(f'SECTION {key} \n')
        for line in resultsDict[key]:
            fileID.write(line + '\n')

In [5]:
# Debugging aid: scrape a single known page and display its bullets inline.
pageURL = 'http://www.eprbulletsafsc.com/CTK.htm'
# Timeout so a stalled server cannot hang the notebook.
page = requests.get(pageURL, timeout=30)
pageSoup = BeautifulSoup(page.content, 'html5lib')
# Same pattern as getBulletsInPage, written as a raw string so \s is passed to
# the regex engine rather than being treated as an (invalid) string escape.
bulletScrape = pageSoup.find(id="page").find_all(string=re.compile(r'^-\s+.*'))
bullets = [bullet.strip() for bullet in bulletScrape]
bullets

["- Oversaw MICT utilization effort; trained shop personnel on new inspection sys--ensured shop's 100% info accuracy",
 '- Performed XX tool box inspections; verified XK tools serviceable/fixed XX--saved sqdn XXK in replacement parts',
 '- Managed tool crib ops 228 hrs; ordered $9.3K parts/issued 872 CTK tools--100% accounted/0 FOD incidents',
 '- Oversaw relocation of CTK; $400K+ worth of equipment moved--guaranteed seamless maintenance support',
 '- Monitored/tracked $3M+ in CTKs/testers; 80 inspections completed--facilitated 98% AME in-commission rate',
 "- Mng'd dply'd CTK assets; led 100% equip insp tm/ID'd/rect 10 discreps--key to AOR 433 sorties/2.7K flt hrs FY '16",
 '- Section CTK Prgm Mgr; revamped TAS tracking identification/tool location...guaranteed 100% accountability',
 &quot;- Prepped section for 2011 LCAT visit; insp&#39;d tool kit w/zero defects noted--key to sq&#39;s 83% compliance rate&q