In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token


[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load spaCy's small English pipeline once at module level; BIACase.__init__
# calls this shared `nlp` object to tokenize each case's text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Case text files to process ('./hrfCases' is wherever the files are located).
# - sorted: os.listdir returns entries in arbitrary order, so sort for
#   reproducible runs.
# - isfile filter: skip sub-directories (e.g. .ipynb_checkpoints) that would
#   otherwise fail when opened as a case file later on.
filenames = sorted(
    name for name in os.listdir('./hrfCases')
    if os.path.isfile(os.path.join('./hrfCases', name))
)

#### `similar()` and `similar_in_list()` are the current search methods for finding panel members within a case.

In [4]:
def similar(a: str, return_b: str, min_score: float) -> Union[str, None]:
    """
    • Scores `a` against `return_b` with difflib's SequenceMatcher ratio.
    • Returns `return_b` when that score is at least `min_score`;
    otherwise returns None.
    """
    score = SequenceMatcher(None, a, return_b).ratio()
    return return_b if score >= min_score else None


def similar_in_list(lst: Union[List[str], Iterator[str]]) -> Callable:
    """
    • Uses a closure on the supplied terms to return a function that searches
    them, in order, for the first term similar to a given item.
    • The returned function takes `(item, min_score)` and returns the first
    term for which `similar(item, term, min_score)` is truthy, else None.
    • A one-shot iterator is copied into a list up front so the returned
    function can be called repeatedly; a list (or other re-iterable) is kept
    by reference, so later changes to it remain visible.
    """
    # An iterator returns itself from iter(); without this snapshot it would
    # be exhausted by the first call and every later call would find nothing.
    candidates = list(lst) if iter(lst) is lst else lst

    def impl(item: str, min_score: float) -> Union[str, None]:
        for candidate in candidates:
            match = similar(item, candidate, min_score)
            if match:
                return match
        return None

    return impl

In [5]:
def similar_outcome(str1, str2):
    """
    Returns True if the strings are identical or differ by a single character
    (one substitution, insertion or deletion), unless that differing character
    is a 'd' at the very end. A trailing 'd' is highly indicative of whether a
    word is actually an outcome, so such pairs return False.

    This is used in the get_outcome() method.
    """
    len1, len2 = len(str1), len(str2)
    if abs(len1 - len2) > 1:
        return False

    # Length of the common prefix shared by both strings.
    prefix = 0
    for ch1, ch2 in zip(str1, str2):
        if ch1 != ch2:
            break
        prefix += 1

    # One string is a prefix of the other (or they are equal).
    if prefix == min(len1, len2):
        if len1 == len2:
            return True
        longer = str1 if len1 > len2 else str2
        # The only difference is one extra trailing character; reject a 'd'.
        return longer[-1] != 'd'

    # Same length, differing only in the final character, and one of those
    # final characters is 'd'.
    if len1 == len2 and prefix == len1 - 1 and 'd' in (str1[-1], str2[-1]):
        return False

    rest1, rest2 = str1[prefix + 1:], str2[prefix + 1:]
    return (
        rest1 == rest2              # a single substitution at `prefix`
        or rest1 == str2[prefix:]   # str1 has one extra character
        or str1[prefix:] == rest2   # str2 has one extra character
    )

class BIACase:
    """
    Holds the spaCy-parsed text of one BIA case and extracts outcome terms
    from it. Three extraction variants are kept side by side so they can be
    compared: get_outcome_original(), get_outcome() and get_outcome_new().
    """

    # Tokens that mark where the orders section starts.
    _ORDER_MARKERS = {'ORDER', 'ORDERED'}
    # Outcome vocabulary for get_outcome()/get_outcome_new(); results are
    # reported in this order, not in document order.
    _OUTCOMES = ['denied', 'dismissed', 'granted', 'remanded',
                 'returned', 'sustained', 'terminated',
                 'vacated', 'affirmed']
    # A candidate is rejected when the token two positions / one position
    # before it is one of these words.
    _TWO_BEFORE_EXCLUSION = {'may', 'any', 'has'}
    _ONE_BEFORE_EXCLUSION = {'it', 'has'}
    # get_outcome_original() inspects this many tokens from the order marker.
    _ORIGINAL_WINDOW = 175

    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """
        self.doc: Doc = nlp(text)
        self.ents: Tuple[Span, ...] = self.doc.ents
        self.state = None
        self.city = None

    def get_outcome_original(self) -> List[str]:
        """
        • Baseline extractor: returns the outcome terms found by exact token
          match within the first 175 tokens starting at the first
          'ORDER'/'ORDERED' token.
        • Returns [] when no such marker exists.
        • Terms are returned in the order of the vocabulary below.
        """
        # 'terninated' covers an edge case in 349320269, where 'terminated'
        # is misspelled in the pdf itself. It is reported as spelled here.
        outcomes_list = ['denied', 'dismissed', 'granted', 'remanded', 'returned',
                         'reversal', 'sustained', 'terminated', 'terninated', 'vacated']

        ordered_i = self._first_order_index()
        # If we can't find where the orders start, assume there aren't any
        if ordered_i == -1:
            return []

        window_end = min(ordered_i + self._ORIGINAL_WINDOW, len(self.doc))
        window_words = {self.doc[i].text for i in range(ordered_i, window_end)}
        return [outcome for outcome in outcomes_list if outcome in window_words]

    def get_outcome(self) -> List[str]:
        """
        • Fuzzy extractor that scans from the very first token of the document
          (it does not look for an 'ORDER' marker) up to the end of the orders
          section ("FOR THE BOARD" / "WARNING").
        • Returns the matched outcome terms in vocabulary order.
        """
        return self._scan_outcomes(0)

    def get_outcome_new(self) -> List[str]:
        """
        • Fuzzy extractor that scans from the first 'ORDER'/'ORDERED' token,
          or from the beginning of the document when there is none, up to the
          end of the orders section ("FOR THE BOARD" / "WARNING").
        • Returns the matched outcome terms in vocabulary order.
        """
        # If we can't find where the orders start, assume they start at the beginning
        return self._scan_outcomes(max(self._first_order_index(), 0))

    def _first_order_index(self) -> int:
        """Index of the first 'ORDER'/'ORDERED' token, or -1 if absent."""
        for token in self.doc:
            if token.text in self._ORDER_MARKERS:
                return token.i
        return -1

    def _orders_end_index(self, start: int) -> int:
        """
        Index at which the orders section ends: the first "FOR THE BOARD" or
        "WARNING" after `start`, else the document length. Stopping there
        avoids finding keywords in footnotes or warnings.
        """
        doc_len = len(self.doc)
        # Range kept as in the original scan: the token at `start` itself and
        # the final two tokens are not examined.
        for i in range(start + 1, doc_len - 2):
            if (self.doc[i:i + 3].text == "FOR THE BOARD"
                    or self.doc[i].text == "WARNING"):
                return i
        return doc_len

    def _is_outcome_at(self, i: int, outcome: str) -> bool:
        """
        True if token `i` is (nearly) `outcome` and is not preceded by an
        excluded word. The `i >= ...` guards matter: a negative index would
        wrap around and test tokens from the END of the document.
        """
        if not similar_outcome(self.doc[i].text, outcome):
            return False
        if i >= 2 and self.doc[i - 2].text in self._TWO_BEFORE_EXCLUSION:
            return False
        if i >= 1 and self.doc[i - 1].text in self._ONE_BEFORE_EXCLUSION:
            return False
        return True

    def _scan_outcomes(self, start: int) -> List[str]:
        """Outcome terms found between `start` and the end of the orders."""
        end = self._orders_end_index(start)
        return [
            outcome for outcome in self._OUTCOMES
            if any(self._is_outcome_at(i, outcome) for i in range(start, end))
        ]


In [6]:
# Run both extractors on every case file and report the cases where they
# disagree. Results are keyed by file name.
new_outcome_dict = {}
old_outcome_dict = {}

for file in filenames:
    # `with` guarantees the handle is closed even if parsing or extraction
    # raises part-way through the loop.
    with open(f"./hrfCases/{file}", "r", encoding='utf-8') as f:
        case = BIACase(f.read())

    new_outcome = case.get_outcome_new()
    new_outcome_dict[file] = new_outcome
    old_outcome = case.get_outcome_original()
    old_outcome_dict[file] = old_outcome
    if old_outcome != new_outcome:
        print('new outcome: ', new_outcome)
        print('old outcome: ', old_outcome)

new_outcome_df = pd.DataFrame(new_outcome_dict.items(), columns=['UUID', 'new_outcome'])
old_outcome_df = pd.DataFrame(old_outcome_dict.items(), columns=['UUID', 'old_outcome'])

new outcome:  ['dismissed']
old outcome:  ['dismissed', 'granted']
new outcome:  ['denied', 'granted', 'remanded']
old outcome:  ['denied', 'granted']
new outcome:  ['sustained', 'terminated']
old outcome:  ['sustained', 'terninated']
new outcome:  ['granted', 'terminated', 'vacated']
old outcome:  ['granted']
new outcome:  ['sustained', 'terminated', 'vacated']
old outcome:  ['remanded', 'sustained', 'terminated', 'vacated']
new outcome:  ['remanded', 'sustained']
old outcome:  ['denied', 'remanded', 'sustained']
new outcome:  ['dismissed', 'remanded', 'affirmed']
old outcome:  ['dismissed', 'remanded']
new outcome:  ['remanded', 'sustained']
old outcome:  ['denied', 'remanded', 'sustained']
new outcome:  ['dismissed', 'remanded', 'sustained']
old outcome:  ['dismissed', 'sustained']
new outcome:  ['remanded', 'sustained', 'vacated']
old outcome:  ['denied', 'remanded', 'sustained', 'vacated']
new outcome:  ['remanded']
old outcome:  []
new outcome:  ['remanded']
old outcome:  ['dismi

# Retrieve correct manually extracted data for comparison.

In [7]:
def _strip_output_suffix(name: str) -> str:
    """
    Drop the 'output-1-to-…' tail of a .txt file name, together with the
    single separator character in front of it. Names without that marker are
    returned unchanged, which makes re-running this cell safe: the previous
    slice used find()'s -1 "not found" result and cut two characters off an
    already-cleaned UUID.
    """
    marker_at = name.find('output-1-to-')
    if marker_at < 1:
        return name
    return name[:marker_at - 1]


# .copy() so the column assignment below works on an independent frame
# rather than on a slice of the frame returned by read_csv.
df_csv = pd.read_csv('manually_scrapped.csv')[['UUID', 'outcome']].copy()

#remove .pdf (only when the value actually ends with it)
df_csv['UUID'] = df_csv['UUID'].str.replace(r'(?i)\.pdf$', '', regex=True)

#remove different ending of .txt file names
new_outcome_df['UUID'] = new_outcome_df['UUID'].apply(_strip_output_suffix)
old_outcome_df['UUID'] = old_outcome_df['UUID'].apply(_strip_output_suffix)

combined_df = df_csv.merge(new_outcome_df, on='UUID', how='outer').merge(old_outcome_df, on='UUID', how='outer')
combined_df.head()

Unnamed: 0,UUID,outcome,new_outcome,old_outcome
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,Dismissed,[dismissed],"[dismissed, granted]"
1,165227167-K-O-A-BIA-Aug-27-2013,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"
3,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,Remanded,[remanded],[remanded]
4,202216334-Francisco-Hernandez-Pina-A073-976-63...,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"


In [8]:
def compare_outcomes(row, version):
    """
    Return True when the extracted outcomes for `version` match the manually
    recorded ones for this row, ignoring order and letter case.

    row:     mapping / DataFrame row with an 'outcome' entry (a
             ';'-separated string) and a '<version>_outcome' entry (a list of
             outcome terms).
    version: prefix of the extracted column to check, e.g. 'old' or 'new'.

    Returns False when either side is missing (e.g. NaN after an outer merge).
    The row's list is never modified; the earlier in-place sort reordered the
    values stored in the DataFrame.
    """
    expected = row['outcome']
    predicted = row[version + '_outcome']
    if not isinstance(expected, str) or not isinstance(predicted, (list, tuple)):
        return False

    # Split on ';' and strip, so "a; b" and "a;b" are treated alike.
    expected_terms = sorted(term.strip() for term in expected.lower().split(';'))
    return expected_terms == sorted(predicted)

In [9]:
# Flag, per case, whether each extractor reproduced the manual outcome.
combined_df['old_accurate'] = combined_df.apply(lambda row: compare_outcomes(row, 'old'), axis=1)
combined_df['new_accurate'] = combined_df.apply(lambda row: compare_outcomes(row, 'new'), axis=1)

# Accuracy = share of rows flagged accurate, as a percentage of all rows.
n_cases = len(combined_df)
old_accuracy = combined_df['old_accurate'].sum() / n_cases * 100
new_accuracy = combined_df['new_accurate'].sum() / n_cases * 100

print('old accuracy: ', old_accuracy, "%")
print('new accuracy: ', new_accuracy, "%")
print("improvement:   ", new_accuracy - old_accuracy, "%")

old accuracy:  91.76470588235294 %
new accuracy:  100.0 %
improvement:    8.235294117647058 %


In [10]:
# Cases the new extractor still gets wrong.
new_is_wrong = combined_df['new_accurate'].eq(False)
diff_df = combined_df.loc[new_is_wrong]
print(len(diff_df))
diff_df.head(20)

0


Unnamed: 0,UUID,outcome,new_outcome,old_outcome,old_accurate,new_accurate


In [11]:
# Cases where the old and new extractors differ in correctness.
verdict_changed = combined_df['new_accurate'].ne(combined_df['old_accurate'])
changes_df = combined_df.loc[verdict_changed]
print(len(changes_df))
changes_df

14


Unnamed: 0,UUID,outcome,new_outcome,old_outcome,old_accurate,new_accurate
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,Dismissed,[dismissed],"[dismissed, granted]",False,True
6,208167027-J-M-S-B-W-AXX-XXX-109-BIA-Apr-4-2003,denied; granted; remanded,"[denied, granted, remanded]","[denied, granted]",False,True
18,349320269-S-D-AXXX-XXX-230-BIA-April-26-2017,sustained; terminated,"[sustained, terminated]","[sustained, terninated]",False,True
22,362583855-Eric-Omari-Thorpe-A047-924-686-BIA-S...,granted; vacated; terminated,"[granted, terminated, vacated]",[granted],False,True
43,393704112-M-Z-AXXX-XXX-502-BIA-Nov-1-2018,sustained; vacated; terminated,"[sustained, terminated, vacated]","[remanded, sustained, terminated, vacated]",False,True
49,398004870-R-A-M-R-AXXX-XXX-851-BIA-Dec-17-2018,remanded; sustained,"[remanded, sustained]","[denied, remanded, sustained]",False,True
62,402310836-M-F-H-AXXX-XXX-149-BIA-Feb-13-2019,dismissed; remanded; affirmed,"[affirmed, dismissed, remanded]","[dismissed, remanded]",False,True
65,402316607-R-R-P-AXXX-XXX-272-BIA-Feb-22-2019,Sustained; remanded,"[remanded, sustained]","[denied, remanded, sustained]",False,True
109,440769960-I-R-M-AXXX-XXX-308-BIA-Nov-22-2019,dismissed; sustained; remanded,"[dismissed, remanded, sustained]","[dismissed, sustained]",False,True
113,443776154-C-T-AXXX-XXX-676-BIA-Dec-18-2019,sustained; remanded; vacated,"[remanded, sustained, vacated]","[denied, remanded, sustained, vacated]",False,True
