This reads in the HTML files from the hard drive (not the web). This will need some work to read the docket entries, but that work can all be done within the scrape_page method; further web queries are probably unnecessary.

In [10]:
import re
import os
from bs4 import BeautifulSoup

def scrape_mac_page(filename):
    """
    Open the MA Appellate Court html file and Soup it as beautifully as possible
    
    Input:
        filename: The filename to parse
    Output:
        A dictionary of the items found in the case page
        A list of dated docket entries
    """
    soup = BeautifulSoup(open(filename), 'html.parser')
    info = {}
    
    # Get case tags
    header = soup.find('td', class_="largefont")
    if len(list(soup.find_all("td", align="center"))) < 2:
        return {}, []
    center_cells = list(soup.find_all("td", align="center")[1].stripped_strings)
    info["Court Type"] = header.b.contents[0]
    info["Panel"] = header.b.contents[1].text
    info["Case Name"] = center_cells[0]
    info["Case Id"] = center_cells[-1]
    #info["Case Id"] = center_cells[2]
    
    # Get court tags
    tables = soup.find_all("table", class_="lightborder")
    attr_table = tables[0]
    for row in attr_table.find_all("tr", valign="top"): 
        items = row.find_all("b")
        for item in items:
            k = item.text
            v = item.next.next.text.strip()
            info[k] = v
    
    # Get parties
    parties_table = soup.find("table", class_="lightborder", cellpadding="5")
    if parties_table is None:
        return {}, []
    p_k = set([])
    for row in parties_table.find_all("tr")[1:]:
        k = row.b.nextSibling.next.strip().split('/')[0]
        v = row.b.text.strip()

        if k in info:
            info[k].append(v)
        else: 
            info[k] = [v]
            p_k.add(k)
        
        ext = re.search(r'(\d+) Extensions, (\d+) Days', row.text)
        if ext:
            info['%s Extensions' %(k)] = ext.group(1)
            info['%s Extension Days' %(k)] = ext.group(2)
    for k in p_k:
        info[k] = ", ".join(info[k])
    
    # Get docket entries
    docket = []
    for table in tables:
        if table.find('tr').text.find('DOCKET ENTRIES') >= 0:
            for row in table.find_all("tr")[2:]:
                items = row.find_all("td")
                date = items[0].text.strip()
                entry = ""
                if len(items) >= 3:
                    entry = str(items[2].text.strip())
                    entry = re.sub(r"\s+", " ", entry, flags=re.UNICODE)
                docket.append([info['Case Id'], date, entry])
    for entry in docket:
        if re.search('affirm', entry[2], re.IGNORECASE):
            info['Has Affirm'] = 'Yes'
        if re.search('revers', entry[2], re.IGNORECASE):
            info['Has Reverse'] = 'Yes'
    
    return info, docket

base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
folder = r'MA Appellate Court'
cases = []
dockets = []
keys = set([])

# Read in all the downloaded pages and print / process them
for file in os.listdir(folder):
    if file.endswith(".html"):
        fullname = os.path.join(folder, file)
        case, docket = scrape_mac_page(fullname)
        case['URL'] = base + file
        keys.update(case.keys())
        if 'Case Status' in case:
            if re.search('rescript', case['Case Status'], re.IGNORECASE):
                if 'Case Type' in case and case['Case Type'] == 'Criminal':
                    cases.append(case)
                    dockets += docket

In [11]:
#Run interactive session to see population of data scraped from the websites


import unicodecsv as csv

keys = list(keys)
move_to_front = ['Case Id', 'Case Name', 'Case Type', 'Case Status', 'Entry Date', 'Argued Date', 'Status Date', 'Decision Date', 'Has Affirm', 'Has Reverse']
for i in range(len(move_to_front)):
    j = keys.index(move_to_front[i])
    #j = dockets.index(move_to_front[i])
    keys[i], keys[j] = keys[j], keys[i]
    #dockets[i],dockets[j] = dockets[j],dockets[i]

# Write out the case csv
with open('cases.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(cases)

# Write out the case csv
with open('dockets.csv', 'wb') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['Case Id', 'Date', 'Entry'])
    for entry in dockets:
        writer.writerow(entry)

ValueError: 'Case Name' is not in list

Reversals by lower court judge

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

# Plot bar of lower court judge reversals
num_cases = dict()
num_reversals = dict()
for case in cases:
    if 'Lower Ct Judge' in case and len(case['Lower Ct Judge']) > 0:
        jname = case['Lower Ct Judge']
        if jname not in num_cases:
            num_cases[jname] = 0
        if jname not in num_reversals:
            num_reversals[jname] = 0
        num_cases[jname] += 1
        if 'Has Reverse' in case:
            num_reversals[jname] += 1

# Calculate the reversal percentage
perc_rev = dict()
perc_rev_filter = dict()
for jname in num_cases.keys():
    if num_cases[jname] >= 0:
        perc_rev[jname] = num_reversals[jname] / num_cases[jname]
    if num_cases[jname] >= 5:
        perc_rev_filter[jname] = num_reversals[jname] / num_cases[jname]

# Get basic stats of each category
a_t = list(num_cases.values())
a_r = list(num_reversals.values())
a_p = list(perc_rev.values())
print("Total cases:")
print("Minimum: %s" %(np.min(a_t)))
print("Maximum: %s" %(np.max(a_t)))
print("Mean: %s" %(np.mean(a_t)))
print("Median: %s" %(np.median(a_t)))
print("Reversed cases:")
print("Minimum: %s" %(np.min(a_r)))
print("Maximum: %s" %(np.max(a_r)))
print("Mean: %s" %(np.mean(a_r)))
print("Median: %s" %(np.median(a_r)))
print("Reversal percentages:")
print("Minimum: %s" %(np.min(a_p)))
print("Maximum: %s" %(np.max(a_p)))
print("Mean: %s" %(np.mean(a_p)))
print("Median: %s" %(np.median(a_p)))

# Get cumulative distribution of each category
n = len(a_t)
a_t = sorted(a_t)
c_t = [np.sum(a_t[:i]) / np.sum(a_t) for i in range(n)]
a_r = sorted(a_r)
c_r = [np.sum(a_r[:i]) / np.sum(a_t) for i in range(n)]
#a_p = sorted(a_p)
#c_p = [np.sum(a_p[:i]) / np.sum(a_p) for i in range(n)]

# Graph the distribution
plt.figure(dpi=500)
p1 = plt.plot(range(n), a_t)
p2 = plt.plot(range(n), a_r)
#p3 = plt.plot(range(n), c_p)
plt.legend((p1[0], p2[0]), ('Total Cases', 'Reversed Cases'))
plt.title("Case Distribution by Judge")
plt.show()


In [None]:
# Take the top 10 judges by percentage of reversals
top_judge = sorted(perc_rev_filter.items(), key=lambda x: -x[1])
top_judge = [li[0] for li in top_judge[:10]]

# Graph them (stacked bar) over total cases
plt.figure(figsize=(21, 4), dpi=500)
p1 = plt.bar(top_judge, [num_cases[j] for j in top_judge], 0.5)
p2 = plt.bar(top_judge, [num_reversals[j] for j in top_judge], 0.5)
#plt.xticks(range(1, k+1), ['Cluster %s (%s)' %(i + 1, str(int(100 * num_rev[i] / len(groupings_dockets[i]))) + '%') for i in range(k)])
plt.legend((p1[0], p2[0]), ('Total Cases', 'Reversed Cases'))
plt.title("Reversals by Judge")
plt.show()

These are some labeling and graphing utilities I developed

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "nobody", "none", "noone",
    "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])

def extract_groupings(X, corpus, k, labels):
    """
    Return data points and metadata grouped by cluster
    
    Plus some other logging that I found useful
    
    I'm also appending the sample representative to the list of clusters for ease of display
    
    Input:
        X = data points
        corpus = the reviews for each restaurant
        k = number of clusters
        labels = cluster number of each point in X
    Output:
        Two list of lists, where the ith sublist is all data points (or title) in the ith cluster
    """
    # Grouping of points by cluster and metadata by cluster
    groupings = [[] for i in range(k)]
    corpora = [[] for i in range(k)]
    winners = []
    
    # Intra-cluster similarity scores
    iscore = dict()
    
    # Sort each point (and associated metadata) into bins for each cluster label
    for i in range(len(X)):
        label = labels[i]
        
        # Add this point to the cluster based on its label
        groupings[label].append(X[i])
        corpora[label].append(corpus[i])
    
    # Print (to file) the sample representative of each cluster
    with open('reps.txt', 'w') as repfile:
        for j in range(k):
            # Score each case in the cluster by intra-cluster similarity
            scores = np.sum(cosine_similarity(groupings[j], groupings[j]), axis=0)
            
            # The case with the maximum such score shall be considered the sample representative
            index = np.argmax(scores)
            repfile.write('Cluster %s Representative:\n' %(j + 1))
            repfile.write(corpora[j][index])
            repfile.write('\n')
            repfile.write('\n')
            winners.append(groupings[j][index])
            
            # Save all the scores
            for i in range(len(scores)):
                iscore[corpora[j][i].split(' \n')[0]] = scores[i]
    
    # Score the value of each term within the groupings and get the most meaningful terms for each cluster
    titles = ['' for i in range(k)]
    for i in range(k):
        # Highest count words
        cv = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b').fit(corpora[i])
        matrix = cv.transform(corpora[i])
        smat = matrix.sum(axis=0)
        wf = [(word, smat[0, idx]) for word, idx in cv.vocabulary_.items()]
        l = sorted(wf, key=lambda x: -x[1])
        titles[i] = [li[0] for li in l[:8]]
    
    # Print (to file) the cluster and score of each case
    with open('scores.csv', 'wb') as scorefile:
        writer = csv.writer(scorefile)
        writer.writerow(['Case ID', 'Cluster', 'ICS Score', 'Has Reverse', 'Has Affirm'])
        for i in range(len(X)):
            case_id = corpus[i].split(' \n')[0]
            ha = re.search('revers', entry[2], re.IGNORECASE)
            writer.writerow([case_id, labels[i] + 1, iscore[case_id], 'Yes' if re.search('revers', corpus[i], re.IGNORECASE) else 'No', 'Yes' if re.search('affirm', corpus[i], re.IGNORECASE) else 'No'])
    
    return groupings + [winners], titles + ['Representatives']

def plot_clustering(k, groupings, labels, title, axes=[2,1]):
    """
    Plot the given k clusters on a 16x16 plot.
    
    Input:
        k = the number of clusters
        groupings = list of lists corresponding to the points in each cluster
        labels = the title of each cluster
        title = the title of the plot
    Output:
        None
    """
    # This size seems quite reasonable/readable
    plt.figure(figsize=(16, 16), dpi=500)
    plt.axes().set_aspect('equal')
    plt.xlabel('Component %s' %(axes[0] + 1))
    plt.ylabel('Component %s' %(axes[1] + 1))
    
    # Store the plot results so we can label them later
    legend = []
    
    # Plot each cluster
    for i in range(k):
        plot = None
        if 'Representatives' == labels[i]:
            plot = plt.scatter([entry[axes[0]] for entry in groupings[i]], [entry[axes[1]] for entry in groupings[i]], alpha=1, c='w', edgecolors='b')#, color=colors[i], marker=markers[i], s=8) 
        else:
            plot = plt.scatter([entry[axes[0]] for entry in groupings[i]], [entry[axes[1]] for entry in groupings[i]], alpha=0.5)#, color=colors[i], marker=markers[i], s=8)
        legend.append(plot)
    
    # Label each cluster
    plt.legend(legend, labels, fancybox=True, framealpha=0.5)
    plt.title(title)
    
    # Show the plot
    plt.show()

Cluster our ma-appellatecourts.org data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture
from sklearn import metrics
import scipy.cluster.hierarchy as hierarchy

corpus_dockets = []
last = ''
for docket in dockets:
    if last != docket[0]:
        last = docket[0]
        corpus_dockets.append('%s' %(docket[0]))
    corpus_dockets[-1] += ' \n' + docket[2]

# Compute the tf-idf scores of the opinions
# Using up to trigrams to account for adverbs and for legal terms
fe_tfv = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, min_df=0.01, max_df = 0.5, ngram_range=(1, 3), token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b')
tfidf_dockets = fe_tfv.fit_transform(corpus_dockets)

# Compute the LSA of the scored opinions
# After 4 components, we don't get much more ROI (plot leading to this conclusion is commented out below)
# min/max document frequency play a huge role here
n_c = 4
dc_tsvd = TruncatedSVD(n_components=n_c)
lsa_dockets = dc_tsvd.fit_transform(tfidf_dockets)

# This is the code we would use to graph the singular values
#fe_tfv = TfidfVectorizer(stop_words='english', min_df = 0.01, max_df = 0.5, token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b')
#tfidf_dockets = fe_tfv.fit_transform(corpus_dockets)
#dc_tsvd = TruncatedSVD(n_components=50)
#lsa_dockets = dc_tsvd.fit_transform(tfidf_dockets)
#plt.plot(range(1,51), dc_tsvd.singular_values_)
#print(dc_tsvd.singular_values_)

# Cluster via cosine/hiearchical
# Best silhouette score is found with k=3, with 5 also being a local maximum
# Plot leading to this conclusion commented out below
k = 4
linkage_dockets = hierarchy.linkage(lsa_dockets, "average", metric="cosine")
hier_dockets = hierarchy.fcluster(linkage_dockets, k, criterion='maxclust') - 1
groupings_dockets, titles_dockets = extract_groupings(lsa_dockets, corpus_dockets, k, hier_dockets)
#plot_clustering(k, groupings_dockets, titles_dockets, 'Cosine Similarity of Docket Entries')

# This is the code we would use to display silhouette scores per k for cosine
#x = list(range(2,15))
#y = []
#linkage_dockets = hierarchy.linkage(lsa_dockets, "average", metric="cosine")
#for i in x:
#    hier = hierarchy.fcluster(linkage_dockets, i, criterion='maxclust') - 1
#    y.append(metrics.silhouette_score(lsa_dockets, hier))
#_ = plt.plot(x, y)

In [None]:
axes = [0,2]

plot_clustering(k + 1, groupings_dockets, titles_dockets, 'Cosine Similarity of Docket Entries', axes)

In [None]:
# Identify and transform reversed cases
corpus_reversed = [docket for docket in corpus_dockets if re.search('revers', docket, re.IGNORECASE)]
tfidf_reversed = fe_tfv.transform(corpus_reversed)
lsa_reversed = dc_tsvd.transform(tfidf_reversed)

plot_clustering(2, [lsa_dockets, lsa_reversed], ['All Cases', 'Reversed Cases'], 'Locality of Reversals', axes)

In [None]:
num_rev = [0] * k

for i in range(len(corpus_dockets)):
    if re.search('revers', corpus_dockets[i], re.IGNORECASE):
        num_rev[hier_dockets[i]] += 1

p1 = plt.bar(range(1, k+1), [len(groupings_dockets[i]) for i in range(k)], 0.25)
p2 = plt.bar(range(1, k+1), num_rev, 0.25)
plt.xticks(range(1, k+1), ['Cluster %s (%s)' %(i + 1, str(int(100 * num_rev[i] / len(groupings_dockets[i]))) + '%') for i in range(k)])
plt.legend((p1[0], p2[0]), ('Total Cases', 'Reversed Cases'))
plt.title("Reversals by Cluster")
plt.show()

Look for other features correlating with reversal

In [None]:
from sklearn import tree
from sklearn.metrics import roc_auc_score
from datetime import datetime

# Convert case features into something measurable
X_case = []
for case in cases:
    if 'Entry Date' not in case or 'Decision Date' not in case or len(case['Entry Date']) <= 0 or len(case['Decision Date']) <= 0:
        continue
    
    # Features
    a = []
    # Duration
    dt0 = datetime.strptime(case['Entry Date'], '%m/%d/%Y')
    dt1 = datetime.strptime(case['Decision Date'], '%m/%d/%Y')
    a.append(int((dt1-dt0).days))
    # Prosecution extensions
    a.append(int(case['Plaintiff Extensions']) if 'Plaintiff Extensions' in case else 0)
    a.append(int(case['Plaintiff Extension Days']) if 'Plaintiff Extension Days' in case else 0)
    # Defense extensions
    a.append(int(case['Defendant Extensions']) if 'Defendant Extensions' in case else 0)
    a.append(int(case['Defendant Extension Days']) if 'Defendant Extension Days' in case else 0)
    # Classification
    a.append(1 if 'Has Reverse' in case else 0)
    
    X_case.append(np.array(a))

# Shuffle and split into feature/class
X_case = np.array(X_case)
np.random.shuffle(X_case)
X_case, Y_case = np.split(X_case, [-1], axis=1)

# Index of the beginning of the test holdout data
test = int(len(X_case) * 0.8)

# Train a decision tree on the data
c_dt = tree.DecisionTreeClassifier()
c_dt = c_dt.fit(X_case[:test], Y_case[:test])

# ROC on test data seems to suggest this isn't really so useful just yet
print(c_dt.score(X_case[:test], Y_case[:test]))
print(c_dt.score(X_case[test:], Y_case[test:]))
print(roc_auc_score(c_dt.predict(X_case[test:]), Y_case[test:], average='weighted'))

This code is for parsing SJC opinion pages

In [None]:
import re
import os
from bs4 import BeautifulSoup

def scrape_lexis_page(filename):
    """
    Open the Lexis html file and Soup it as beautifully as possible
    
    Input:
        filename: The filename to parse
    Output:
        A dictionary of the items found in the case page
    """
    soup = BeautifulSoup(open(filename, 'rb'), 'html.parser')
    info = {}
    
    # Get document text
    doctext = soup.find("div", {"class": "document-text"})
    # TODO: Figure out why some documents return None from the previous step
    if not doctext:
        return {}
    
    # Parse metadata
    title = doctext.find("h1", {"id": "SS_DocumentTitle"}).text.strip()
    docinfo = doctext.find_all("p", {"class": "SS_DocumentInfo"})
    court = docinfo[0].text.strip()
    dates = docinfo[1].text.strip().split(';')
    case = docinfo[2].text.strip()
    info['Case Title'] = title
    info['Court'] = court
    # TODO: Fix date parsing
    #info['Date Argued'] = dates[0]
    #info['Date Decided'] = dates[1]
    info['Case Number'] = case
    reporter = []
    for sp in doctext.find_all("span", {"class": "SS_NonPaginatedRptr"}):
        reporter.append(sp.text.strip())
    # TODO: Get more from this section
    info['Reporter'] = ' | '.join(reporter)
    # TODO: Find out how to parse Prior History and similar (e.g. subsequent history)
    #prior = doctext.find_all("p", {"class": "SS_InlineText"})[-1].text
    #prior = re.sub(r"\s+", " ", prior, flags=re.UNICODE)
    #info['Prior History'] = prior
    
    
    #start = doctext.find("span", id="JUMPTO_Counsel")
    #for i in range(25):
    #    print(str(start.next_sibling).strip())
    #    start = start.next_sibling
    #here get stuff until br (end of category) and span (end of section)
    
    # TODO: Parse headnotes
    
    # Parse opinions
    info['Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Opinionby")
    info['Opinion'] = get_text_after_id(doctext, "JUMPTO_Opinion")
    info['Concuring Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Concurby")
    info['Concurring Opinion'] = get_text_after_id(doctext, "JUMPTO_Concur")
    info['Dissenting Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Dissentby")
    info['Dissenting Opinion'] = get_text_after_id(doctext, "JUMPTO_Dissent")
    
    return info

def get_text_after_span(document, s_id):
    """
    Get the text immediately following some span with given id
    
    Input:
        document: The section of text
        s_id: The id of the span
    Output:
        The text immediately following said element (or the empty string if the id does not exist)
    """
    start = document.find("span", id=s_id)
    if not start:
        return ""
    return str(start.next_sibling).strip()

def get_text_after_id(document, e_id):
    """
    Get the text in the paragraphs immediately following some element with given id
    
    Input:
        document: The section of text
        e_id: The id of the span
    Output:
        The text immediately following said element (or the empty string if the id does not exist)
    """
    start = document.find(id=e_id)
    if not start:
        return ""
    element = start.next_sibling
    ps = []
    while element and element.name == 'p':
        text = element.text.strip()
        text = re.sub(r"\s+", " ", text, flags=re.UNICODE)
        ps.append(text)
        element = element.next_sibling
    return " %%% ".join(ps)

#base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
folder = 'Reversal Opinions HTML'
cases = []
keys = set([])

# Read in all the downloaded pages and print / process them
for file in os.listdir(folder):
    if file.endswith(".html"):
        fullname = os.path.join(folder, file)
        case = scrape_lexis_page(fullname)
        keys.update(case.keys())
        cases.append(case)

In [None]:
import unicodecsv as csv

print(keys)

# Write out the csv
with open('sjc-opinions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(cases)