In [1]:
import multiprocessing.dummy as mp
import os
import re
from urllib.parse import urljoin

import html5lib
import requests
from bs4 import BeautifulSoup

# searching under all categories, positive bullets. 'hard-coded' to 250 results per page
URL = 'http://www.afeprbullets.com/results.php?Submit5=Search&strength=Positive&rec=250&pg=1'
ROOT = 'http://www.afeprbullets.com/'
outdir = '../scrapes/'
outfile = outdir + 'afeprbullets.txt'
homepage = requests.get(URL)

# need HTML5LIB because sometimes website has errors in HTML. html5lib is very forgiving.
homeSoup = BeautifulSoup(homepage.content,'html5lib')


In [2]:
pageList = homeSoup.find_all('a', href=re.compile('&rec=250&pg=\d+'))
# This site lists hrefs relative to root.
pageURLs = [ROOT + pageItem['href'] for pageItem in pageList]

In [3]:
def getBulletsInPage(pageURL, bulletDict):
    
    page = requests.get(pageURL)
    pageSoup = BeautifulSoup(page.content, 'html5lib')

    # Searches for all strings that match regex.
    bulletScrape = pageSoup.find(id="form1").find_all('a',href=re.compile('rateit\.php\?id=\d+'))
    bulletDict[pageURL] = [bullet.get_text().strip() for bullet in bulletScrape]


In [4]:
# multiprocessing is used to speed up the scrape. 

# a dict/manager is used to collect all scraped bullets during multiprocessing.  
manager = mp.Manager()
resultsDict = manager.dict()

#running Pool() with no arguments will default to amount of cores you have... I think
p = mp.Pool()

p.starmap(getBulletsInPage, [(pageURL,resultsDict) for pageURL in pageURLs])
p.close()
p.join()

# Writing to file. Should default to clobbering.
with open(outfile,'w',encoding='utf-8') as fileID:
    for key in resultsDict:
        fileID.write(f'SECTION {key} \n')
        for line in resultsDict[key]:
            fileID.write(line + '\n')

In [110]:
# this block is for debugging
pageURL = 'http://www.afeprbullets.com/results.php?Submit5=Search&strength=Positive&rec=250&pg=1'
page = requests.get(pageURL)
pageSoup = BeautifulSoup(page.content, 'html5lib')
bulletScrape = pageSoup.find(id="form1").find_all('a',href=re.compile('rateit\.php\?id=\d+'))
bullets = [bullet.get_text() for bullet in bulletScrape]
bullets[0:5]

['- Exceptional SNCO! BASE\'s nominee for the 2006 "Chief Master Sergeant Larry R. Daniels Award"',
 '- Deployed as Fire Chief to Baghdad International Airport (BIAP); molded 45 firefighters into a cohesive team\n  -- 24-hour combat fire protection--6K airlift missions carrying 57K troops/19K cargo short-tons--zero fire loss',
 '- Community leader--VFW Post #### nominee for 2006 "Gold Medal Award"--ready for SNCO of the Year',
 '- Exceptional Amn; consistantly performs insps 20% above peers--contributed to flt 93% in-commission rate\n- Primed units for TACEVAL; replaced one starter/two cylinders--aided first NATO response force certification',
 "- Flawless superintendent control of flight's Operations/Readiness--84 firefighters, $10M fleet, $1.9M budget"]