# Screen Scraping Basics with Python

Frank Donnelly, Head of GIS & Data Services, Brown University Library\
Oct 14, 2025 / Revised Oct 15, 2025

IRS SOI Exempt Organizations Business Master File Extract\
https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf

## Brown Univ Colab Users
SKIP these blocks if you are NOT using Colab. Otherwise, run them if you want to work in Colab.

1. Open this notebook with this URL:\
https://colab.research.google.com/github/Brown-University-Library/geodata_screenscrape/blob/main/python_scrape.ipynb
2. Then run the following box to import this repo into a temporary folder:

In [None]:
# GOOGLE COLAB USERS - RUN THIS
!git clone https://github.com/Brown-University-Library/geodata_screenscrape/ && mv temp_repo/* temp_repo/.[!.]* . && rm -rf temp_repo

## Preliminaries

In [None]:
import os
from datetime import date
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as soup
from IPython.display import clear_output

In [None]:
# Settings for this scrape: edit these to point at a different page or to
# record who made the archive.

# Page that lists the data files to download
url = 'https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf'

# Human-readable dataset name, written to the metadata file
dataset = 'IRS SOI Exempt Organizations Business Master File Extract'

# Who created this archive, written to the metadata file
person = 'YOUR NAME, YOUR TITLE, YOUR ORG'

# Today's date as YYYY-MM-DD, used to stamp the output folder and file names
today = date.today().isoformat()

In [None]:
# Date-stamped folder that receives the downloads and the archive files.
outfolder = 'downloaded-' + today
# exist_ok=True makes re-running the cell safe without a separate
# check-then-create step (which can race with another process).
os.makedirs(outfolder, exist_ok=True)

## Get Links

In [None]:
# Fetch the landing page. Fail fast on an HTTP error so that an error page is
# never parsed for links; the timeout keeps the cell from hanging forever.
response = requests.get(url, timeout=60)
response.raise_for_status()
webpage = response.content  # raw bytes of the page
soup_page = soup(webpage, 'html.parser')
# The title is only used in messages and metadata, so fall back to the URL
# when the page has no <title> element.
page_title = soup_page.title.text if soup_page.title else url
container = soup_page.find('div', {'class': 'pup-header-content-rt col-sm-12 col-md-9'})  # all links to data files are in this div
#container=soup_page.find('table',{'class': 'table complex-table table-striped table-bordered table-responsive'}) # just links in the table
if container is None:
    # find() returns None when nothing matches; say so clearly instead of
    # failing below with "'NoneType' object has no attribute 'find_all'".
    raise RuntimeError(
        'Could not find the expected content container on {}; '
        'the page layout may have changed'.format(url)
    )
links = container.find_all('a')  # all the links, mix of data and non-data

In [None]:
# Map each data file's name to its absolute download URL.
datalinks = {}

for lnk in links:
    href = lnk.attrs.get('href')
    # Skip anchors without a target and links that are not data files
    if not href or not href.endswith(('.pdf', '.csv', '.zip')):
        continue
    filename = href.split('/')[-1]
    # Resolve site-relative hrefs (e.g. "/pub/file.csv") against the page URL
    # so requests can fetch them; absolute hrefs pass through unchanged.
    datalinks[filename] = urljoin(url, href)

## Download Data

In [None]:
# Download every data file into outfolder, recording failures instead of
# stopping at the first one.
i = 0  # number of files downloaded successfully
errors = {}  # filename -> exception for each failed download
for k, v in datalinks.items():
    try:
        # Timeouts are RequestException subclasses, so a stalled server is
        # recorded as an error like any other failed request.
        response = requests.get(v, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print('Could not retrieve', k, 'because of', e)
        errors[k] = e
    else:
        # "with" closes the file even if the write fails part-way
        with open(os.path.join(outfolder, k), 'wb') as datafile:
            datafile.write(response.content)
        i += 1
        print('Downloaded', k)
        #sleep(1) #add a pause between downloads
        clear_output(wait=True)  # replace print messages with new msg

print('Finished downloading', i, 'files from', page_title)

## Save Webpage, Metadata, Errors

In [None]:
# Archive the raw HTML of the scraped page next to the downloaded files.
webfile = '_WEBPAGE-{}.html'.format(today)
# "with" guarantees the file is closed even if the write raises
with open(os.path.join(outfolder, webfile), 'wb') as writefile:
    writefile.write(webpage)

In [None]:
# Write a short provenance record: what was archived, when, from where, by whom.
metafile = "_METADATA-{}.txt".format(today)
# Explicit UTF-8 so a page title with non-ASCII characters is written the same
# way on every platform; "with" closes the file even if a write fails.
with open(os.path.join(outfolder, metafile), 'w', encoding='utf-8') as writefile:
    writefile.write(dataset + '\n')
    writefile.write('{} files archived on {}\n'.format(i, today))
    writefile.write('From webpage {}\n'.format(page_title))
    writefile.write('At {}\n'.format(url))
    writefile.write('By {}'.format(person))

In [None]:
# Write the download errors (if any) to a log file inside outfolder.
efile = "_ERRORS-{}.txt".format(today)
epath = os.path.join(outfolder, efile)
# Remove a stale log from an earlier run today. Check the full path inside
# outfolder: the bare file name would be looked up in the working directory.
if os.path.exists(epath):
    os.remove(epath)

if errors:
    # Explicit UTF-8 because the title and error text may contain non-ASCII characters
    with open(epath, 'w', encoding='utf-8') as writefile:
        writefile.write('Download Errors for {}\n'.format(page_title))
        for ek, ev in errors.items():
            writefile.write('{}: {}\n'.format(ek, ev))