This code generates the URLs for ma-appellatecourts.org which we will want to download. There is probably no need to run this again unless we need to capture more current cases.

In [None]:
def generate_url(base, case_type, year, number):
    """
    Build the docket URL for one case on the MA Appellate Court website.

    The docket number appended to ``base`` depends on the case type:

        J, P          -> <year>-<case_type>-<number>   (e.g. 1999-P-1)
        SJ            -> SJ-<year>-<number>            (e.g. SJ-2011-0500)
        anything else -> <case_type>-<number>          (e.g. SJC-10108)

    NOTE(review): the SJ example above shows a zero-padded number, but this
    function does not pad ``number`` -- confirm the site accepts both forms.

    Input:
        base: base of URL (everything up to and including "dno=")
        case_type: Type of case in J, P, SJ, and SJC
        year: Year of case (ignored for types other than J, P and SJ)
        number: Case number
    Output:
        URL to case
    """
    year_part = str(year)
    number_part = str(number)

    if case_type in ("J", "P"):
        pieces = [year_part, case_type, number_part]
    elif case_type == "SJ":
        pieces = [case_type, year_part, number_part]
    else:
        # SJC-style docket numbers carry no year component
        pieces = [case_type, number_part]

    return base + "-".join(pieces)

# Common prefix of every docket URL; the docket number is appended to it
base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="

# Number of J cases by year
j_limits = {
    2008: 547,
    2009: 565,
    2010: 589,
    2011: 550,
    2012: 482,
    2013: 568,
    2014: 514,
    2015: 527,
    2016: 539,
    2017: 581,
    2018: 130,
}

# Number of P cases by year
p_limits = {
    2008: 2156,
    2009: 2354,
    2010: 2281,
    2011: 2182,
    2012: 2023,
    2013: 2031,
    2014: 1995,
    2015: 1755,
    2016: 1758,
    2017: 1634,
    2018: 365,
}

# Number of SJ cases by year
sj_limits = {
    2008: 575,
    2009: 668,
    2010: 586,
    2011: 555,
    2012: 521,
    2013: 503,
    2014: 529,
    2015: 561,
    2016: 536,
    2017: 511,
    2018: 125,
}

# Lower and upper limit of SJC case numbers within current window (2008-2018)
# Both limits are inclusive.
sjc_lower = 10108
sjc_upper = 12510

# Links to all cases
links = []

# Generate all the links based on the above controls.
# J, P and SJ cases are numbered 1..n within each year.
for case_type, limits in (("J", j_limits), ("P", p_limits), ("SJ", sj_limits)):
    for year, n in limits.items():
        for number in range(1, n + 1):
            links.append(generate_url(base, case_type, year, number))

# SJC cases are numbered continuously across years, so the year is unused (0).
# The range includes sjc_lower itself; the previous loop started at
# sjc_lower + 1 and therefore never generated the first case in the window.
for number in range(sjc_lower, sjc_upper + 1):
    links.append(generate_url(base, "SJC", 0, number))

# One URL per line
with open("urls_todo.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

This code takes the URLs (from a different file than the one written to above; this way, we can limit the scope if we so desire) and pulls the text down for us to keep. Be advised that the operation succeeds even if the page we pull down is a "you've been blocked" response, so be sure to remove any downloaded files that are too small to be court cases. (In my case, the minimum size is 7 KB, which is a "this number wasn't found" page; most cases are much larger. The 'blocked' responses are 3 KB, though this may vary with your ISP; still, they are probably always smaller than actual court cases.) The main loop that controls the page reads also checks whether we already have a case before pulling it, so there's no need to worry about pulling duplicates (but if we pull a 'blocked' response, we do need to add its URL back to the list of URLs to fetch).

In [None]:
import requests
import os
import sys
import time
import random

# List of user agents to choose from.
# One user-agent string per line of the literal below. Each entry is stripped
# so stray whitespace at the end of a line (one entry previously carried
# trailing tabs) never ends up inside the header value; blank lines are dropped.
useragents = [
    agent.strip()
    for agent in '''Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527  (KHTML, like Gecko, Safari/419.3) Arora/0.8.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1
Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0
Mozilla/5.0 (Windows NT 6.2; rv:19.0) Gecko/20121129 Firefox/19.0
Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0
iTunes/9.0.2 (Windows; N)
Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Maxthon 2.0)
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko) Maxthon/3.0.8.2 Safari/533.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Maxthon/4.0.0.2000 Chrome/22.0.1229.79 Safari/537.1
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)
Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C)
Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01
Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.12 Safari/537.36 OPR/14.0.1116.4
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.29 Safari/537.36 OPR/15.0.1147.24 (Edition Next)
Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 OPR/18.0.1284.49
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36 OPR/19.0.1326.56
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
Mozilla/5.0 (Windows; U; Windows NT 6.2; es-US ) AppleWebKit/540.0 (KHTML like Gecko) Version/6.0 Safari/8900.00
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12
Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20120422 Firefox/12.0 SeaMonkey/2.9'''.splitlines()
    if agent.strip()
]

def get_page_text(url, useragent, sleep_timings=(2, 3, 5, 8), exception_timings=(5, 10, 15), timeout=30):
    """
    Given a URL, return the text content

    Sleeps for a randomly chosen delay before every request, then fetches the
    page. If the request fails, waits a randomly chosen back-off delay and
    tries again; this repeats until a response is received, so a URL that can
    never succeed will retry forever.

    Input:
        url: a string representing a URL
        useragent: value sent as the User-Agent header
        sleep_timings: sequence of possible numbers of seconds to wait between requests
        exception_timings: sequence of possible numbers of seconds to wait between exceptions before retrying
        timeout: seconds to wait for the server before treating the request as failed
    Output:
        the content of said URL
    """

    # Construct the header. Surrounding whitespace is stripped because a
    # header value with leading whitespace is rejected by requests.
    headers = {"Connection": "close", "user-agent": useragent.strip()}

    # Request until we have a result
    while True:
        # Pause before every attempt so we do not hammer the server
        time.sleep(random.choice(sleep_timings))
        try:
            # Pass the headers (previously built but never sent) and a timeout
            # so a stalled connection cannot hang the loop indefinitely.
            page = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # Only request failures are retried; KeyboardInterrupt and
            # SystemExit propagate so the loop can be stopped by the user.
            print("Unexpected error:", type(err))
            time.sleep(random.choice(exception_timings))
            continue

        # Keep the page text
        return page.text

def write_page_text(url, text, folder=None):
    """
    Write a page's text content to a file

    The file is named after the docket number, i.e. whatever follows the last
    "dno=" in the URL, with an ".html" extension. The content is written as
    UTF-8 followed by a trailing newline.

    Input:
        url: a string representing the source of the text
        text: the text content
        folder: directory to write into; when omitted, the original
            hard-coded download directory is used
    Output:
        the filename under which the content was written
    """
    docket = url.split('dno=')[-1]
    if folder is None:
        # Default location kept exactly as before for existing callers
        filename = r'C:\Users\tiamm\Documents\BENCHMARKS\%s.html' % docket
    else:
        filename = os.path.join(folder, docket + '.html')

    # Explicit encoding: the platform default (e.g. cp1252 on Windows) cannot
    # represent every character a web page may contain and would raise
    # UnicodeEncodeError part-way through the write.
    with open(filename, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)
    return filename

# Directory scanned for pages that were downloaded on earlier runs.
# NOTE(review): write_page_text saves into its own hard-coded directory, which
# is not this folder -- confirm the two are meant to differ.
folder = r'MA Appellate Court'

# URLs still to fetch, one per line of the to-do file
with open("urls_todo.txt", "r") as text_file:
    links = {raw_line.strip() for raw_line in text_file}

# Get files that have already been done
done = set(os.listdir(folder))

# Starting from where we left off, pull down pages and write them
# countdown caps the number of downloads in one run (if desired)
countdown = 1000
processed = []
for link in links:
    # Every URL we look at counts as handled, whether fetched now or earlier
    processed.append(link)

    page_file = link.split('dno=')[-1] + '.html'
    if page_file not in done:
        # Only download a file we do not already have
        agent = random.choice(useragents)
        html = get_page_text(link, agent, [3, 5, 8, 13], [10, 20, 30])
        write_page_text(link, html)
        countdown -= 1
        if countdown <= 0:
            break

# Write down what we've done: drop the handled URLs and save what remains
for handled in processed:
    links.remove(handled)
with open("urls_todo.txt", "w") as text_file:
    for remaining in links:
        text_file.write(remaining + "\n")

Within the web pages themselves are links to PDFs of briefs (in many cases), but we have not acquired those for this semester's project. This could give more insight into reasons why cases were reversed for a future team; one would need to scrape the HTML pages for the links to said briefs, take the time to download them all (without getting blocked), parse them for text, and endeavor to extract meaning from said text.