In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token


[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
filenames = os.listdir('./hrfCases') # Wherever files are located

In [13]:
text = 'The respondent had a pending asylum application, alleging a fear of harm on account of his race and membership in a particular social group (Exh. 2). The respondent showed due diligence in promptly seeking to redress the situation by filing his motion less than two months after the issuance of the in absentia order, and DHS did not file an opposition to either the motion or the appeal'
doc = nlp(text)
i = 0
for token in doc:
    print(i, '  ', token.text)
    i += 1

0    The
1    respondent
2    had
3    a
4    pending
5    asylum
6    application
7    ,
8    alleging
9    a
10    fear
11    of
12    harm
13    on
14    account
15    of
16    his
17    race
18    and
19    membership
20    in
21    a
22    particular
23    social
24    group
25    (
26    Exh
27    .
28    2
29    )
30    .
31    The
32    respondent
33    showed
34    due
35    diligence
36    in
37    promptly
38    seeking
39    to
40    redress
41    the
42    situation
43    by
44    filing
45    his
46    motion
47    less
48    than
49    two
50    months
51    after
52    the
53    issuance
54    of
55    the
56    in
57    absentia
58    order
59    ,
60    and
61    DHS
62    did
63    not
64    file
65    an
66    opposition
67    to
68    either
69    the
70    motion
71    or
72    the
73    appeal


In [15]:
 pattern = [
         [{"LOWER": "social"}, {"LOWER": "group"}],
         ]
matches = similar(target_phrases=pattern, file=doc)
for match in matches:
    print(match.start, ' ', in_parenthetical(match, doc))

23   False


#### `similar()` and `similar_in_list()` are the current search methods for finding panel members within a case.

In [6]:
def in_parenthetical(match, doc):
    open_parens = 0
    for i in range(match.end, len(doc)):
        if doc[i].text == '(':
            open_parens += 1
        elif doc[i].text == ')':
            if open_parens > 0:
                open_parens -= 1
            else:
                return True
        elif doc[i] in {'.', '?', '!'}:
            return False
    return False

def similar_outcome(str1, str2):
    """
    Returns True if the strings are off by a single character, and that 
    character is not a 'd' at the end. That 'd' at the end of a word is highly 
    indicative of whether something is actually an outcome.
    
    This is used in the get_outcome() method.
    """
    if abs(len(str1) - len(str2)) > 1:
        return False
    min_len = min(len(str1), len(str2))
    i = 0
    while i < min_len and str1[i] == str2[i]:
        i += 1

    # We've reached the end of one string, the other is one character longer
    if i == min_len:
        # If that character is a 'd', return False, otherwise True
        if ((len(str1) > len(str2) and str1[-1] == 'd') 
            or (len(str2) > len(str1) and str2[-1] == 'd')):
            return False
        else:
            return True

    # We're looking at a substitution that is 'd' at the end
    if (i == len(str1) -1 and len(str1) == len(str2) 
        and (str1[-1] == 'd' or str2[-1] == 'd')):
        return False

    # We're looking at a substitution other than 'd' at the end
    i2 = i + 1
    while i2 < min_len and str1[i2] == str2[i2]:
        i2 += 1
    if i2 == len(str1) and i2 == len(str2):
        return True

    # We're in the middle, str1 has an extra character
    if len(str1) == len(str2) + 1:
        i2 = i
        while i2 < min_len and str1[i2+1] == str2[i2]:
            i2 += 1
        if i2 + 1 == len(str1) and i2 == len(str2):
            return True
    
    # We're in the middle, str2 has an extra character
    if len(str1) + 1 == len(str2):
        i2 = i
        while i2 < min_len and str1[i2] == str2[i2+1]:
            i2 += 1
        if i2 == len(str1) and i2 + 1 == len(str2):
            return True
    
    return False

def similar(target_phrases, file):
    ''' helper function to create spacy matcher
    that searches for specified target_phrases,
    simplifies get field names function, and improves
    ability to change 
    '''
    """GET RID OF PUNCT"""
    # from string lib, we create an exclusion table
    #table = str.maketrans(dict.fromkeys(string.punctuation))
    # then use that table to make a string without punct
    #no_punct_string = target_phrases.translate(table)
    # create matcher object and add the pattern we are looking for
    matcher = Matcher(nlp.vocab)
    matcher.add('target_phrases', target_phrases)
    matches = matcher(file, as_spans=True)
    # in the functions where similiar is used,
    # must present target_phrases in a list of dictionary using Spacy pattern syntax
    # example
    # pattern = [[{"LOWER": "race"}]]
    # similar_pg = similar(target_phrases=pattern, file=self.doc)
    
    return matches

class BIACase:
    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """
        self.doc = nlp(text)
        self.outcome = self.get_outcome()

    def get_outcome_original(self) -> List[str]:
        """
        • Returns list of outcome terms from the case in a list.
          These will appear after 'ORDER' at the end of the document.
        """

        outcomes_return = []
        ordered_outcome = {'ORDER', 'ORDERED'}
        outcomes_list = ['denied', 'dismissed', 'granted', 'remanded', 'returned',
                         'reversal', 'sustained', 'terminated', 'terninated', 'vacated']

        # Interesting edge case in 349320269- typo on 'terminated' present in the pdf: fuzzywuzzy matches terminated
        # to [(terninated, 90)]
        ordered_i = -1
        for token in self.doc:
            if str(token) in ordered_outcome:
                ordered_i = token.i
                break
                
        # If we can't find where the orders start, assume there aren't any
        if ordered_i == -1:
            return []
                
        # If we can find where the orders start, check it for each type of outcome
        for outcome in outcomes_list:
            for i in range(ordered_i, min(ordered_i + 175, len(self.doc))):
                if str(self.doc[i]) == outcome:
                    outcomes_return.append(outcome)
                    break
                
        return outcomes_return
    
    def get_outcome(self):
        outcomes_return = []
        ordered_outcome = {'ORDER', 'ORDERED'}
        outcomes_list = ['denied', 'dismissed', 'granted', 'remanded',
                         'returned', 'sustained', 'terminated',
                         'vacated', 'affirmed']
        two_before_exclusion = {'may', 'any', 'has'}
        one_before_exclusion = {'it', 'has'}

        # locate where in the document the orders start
        order_start_i = -1
        for token in self.doc:
            if token.text in ordered_outcome:
                order_start_i = token.i
                break

        # If we can't find where the orders start, assume they start at the beginning
        if order_start_i == -1:
            order_start_i = 0

        # Locate where in the document the orders end
        order_end_i = len(self.doc)
        # Orders end when we see "FOR THE BOARD" or "WARNING"
        # - this avoids finding keywords in footnotes or warnings
        for i in range(order_start_i+1, min(order_end_i, len(self.doc) - 2)):
            if (self.doc[i:i+3].text == "FOR THE BOARD" or
                self.doc[i].text == "WARNING"):
                order_end_i = i
                break

        # If we can find where the orders start, check the range for each type
        # of outcome
        for outcome in outcomes_list:
            for i in range(order_start_i, order_end_i):
                if (similar_outcome(self.doc[i].text, outcome) and
                    self.doc[i-2].text not in two_before_exclusion and
                    self.doc[i-1].text not in one_before_exclusion):
                    outcomes_return.append(outcome)
                    break

        return outcomes_return
    
    def get_protected_grounds_old(self):
        religions = ['christianity','christian','islam','atheist','hinduism','buddihism','jewish','judaism']
        
        # list of protected grounds
        # can expand this list and add different phrases to cover more ground
        pattern = [
        [{"LOWER": "race"}], 
        [{"LOWER": "religion"}], # expand to check for list of religions
        [{"LOWER": "nationality"}], # currently, phrase is pulled but out of context
        [{"LOWER": "social"}, {"LOWER": "group"}], 
        [{"LOWER": "political"}, {"LOWER": "opinion"}],
        [{"LOWER": "political"}, {"LOWER": "offense"}],
        #[{"LOWER": "protected"}, {"LOWER": "grounds"}],
        [{"LOWER": "political"}],
        ]
        # included major religions to expand search for religion
        for religion in religions:
            pattern.append([{'LOWER': religion}])

        politicals = ['political opinion', 'political offense']
        
        exclusions = ['real id', 'grounds specified', 'no claim']
        # potential grounds is a list of Span objects that have added functionality
        # in order to weed out bad tokens
        potential_grounds = similar(target_phrases=pattern, file=self.doc)
        # explore idea to implement a deque for more efficiency 
        confirmed_matches = []
        # check for exclusion phrases, remove match if found
        for match in potential_grounds:
        # remove 'nationality act' from potential_grounds
            if match.text.lower() == 'nationality':
                if 'act' in match.sent.text.lower():
                    potential_grounds.remove(match)
                else:
                    if 'nationality' not in confirmed_matches:
                        confirmed_matches.append('nationality')
        # check for specified religion, replace with 'religion'
            elif match.text.lower() in religions:
                #print(match)
                potential_grounds.remove(match)
                if 'religion' not in confirmed_matches:
                    confirmed_matches.append('religion')
            elif match.text.lower() in politicals:
                potential_grounds.remove(match)
                if 'political' not in confirmed_matches:
                    confirmed_matches.append('political')
            else:
                confirmed_matches.append(match.text.lower())
        
        #result = [str(i).lower() for i in potential_grounds] + confirmed_matches
        if confirmed_matches:
            return list(set(confirmed_matches))
        #if result:
        #    return set(result), confirmed_matches
        else:
            return []
        
    def get_protected_grounds(self):
        religions = ['christianity','christian','islam','atheist','hinduism','buddihism','jewish','judaism']
        
        # list of protected grounds
        # can expand this list and add different phrases to cover more ground
        pattern = [
        [{"LOWER": "race"}], 
        [{"LOWER": "religion"}], # expand to check for list of religions
        [{"LOWER": "nationality"}], # currently, phrase is pulled but out of context
        [{"LOWER": "social"}, {"LOWER": "group"}], 
        [{"LOWER": "political"}, {"LOWER": "opinion"}],
        [{"LOWER": "political"}, {"LOWER": "offense"}],
        #[{"LOWER": "protected"}, {"LOWER": "grounds"}],
        [{"LOWER": "political"}],
        ]
        # included major religions to expand search for religion
        for religion in religions:
            pattern.append([{'LOWER': religion}])

        politicals = ['political opinion', 'political offense']
        
        exclusions = ['real id', 'grounds specified', 'no claim']
        # potential grounds is a list of Span objects that have added functionality
        # in order to weed out bad tokens
        potential_grounds = similar(target_phrases=pattern, file=self.doc)
        # explore idea to implement a deque for more efficiency 
        confirmed_matches = []
        # check for exclusion phrases, remove match if found
        for match in potential_grounds:
        # remove 'nationality act' from potential_grounds
            if match.text.lower() == 'nationality':
                if 'act' in match.sent.text.lower():
                    potential_grounds.remove(match)
                else:
                    if 'nationality' not in confirmed_matches:
                        confirmed_matches.append('nationality')
        # check for specified religion, replace with 'religion'
            elif match.text.lower() in religions:
                #print(match)
                potential_grounds.remove(match)
                if 'religion' not in confirmed_matches:
                    confirmed_matches.append('religion')
            elif match.text.lower() in politicals:
                potential_grounds.remove(match)
                if 'political' not in confirmed_matches:
                    confirmed_matches.append('political')
            elif in_parenthetical(match, self.doc):
                potential_grounds.remove(match)
            else:
                confirmed_matches.append(match.text.lower())
        
        #result = [str(i).lower() for i in potential_grounds] + confirmed_matches
        if confirmed_matches:
            return list(set(confirmed_matches))
        #if result:
        #    return set(result), confirmed_matches
        else:
            return []


In [7]:
new_pg_dict = {}
old_pg_dict = {}

for file in filenames:
    f = open(f"./hrfCases/{file}", "r", encoding='utf-8')
    case = BIACase(f.read())
    old_pg = case.get_protected_grounds_old()
    old_pg_dict[file] = old_pg
    new_pg = case.get_protected_grounds()
    new_pg_dict[file] = new_pg
    if old_pg != new_pg:
        print('file: ', file)
        print('new outcome: ', new_pg)
        print('old outcome: ', old_pg)
    f.close()

new_pg_df = pd.DataFrame(new_pg_dict.items(), columns=['UUID', 'new_pg'])
old_pg_df = pd.DataFrame(old_pg_dict.items(), columns=['UUID', 'old_pg'])

file:  165227167-K-O-A-BIA-Aug-27-2013-output-1-to-5.txt
new outcome:  []
old outcome:  ['political']
file:  205871959-M-G-O-AXXX-XXX-611-BIA-Feb-4-2014-output-1-to-5.txt
new outcome:  []
old outcome:  ['social group']
file:  271354416-D-M-R-BIA-June-9-2015-output-1-to-6.txt
new outcome:  []
old outcome:  ['social group']
file:  311736830-H-R-M-AXXX-XXX-381-BIA-March-14-2016-output-1-to-5.txt
new outcome:  []
old outcome:  ['social group']
file:  334139459-S-V-C-AXXX-XXX-431-BIA-Nov-1-2016-output-1-to-5.txt
new outcome:  []
old outcome:  ['religion', 'political', 'social group']
file:  337540716-Clebson-Sousa-Carneiro-A078-254-701-BIA-Jan-6-2017-output-1-to-10.txt
new outcome:  ['social group']
old outcome:  ['nationality', 'political', 'race', 'social group']
file:  371997958-Rocio-Alida-Valencia-Barragan-A209-138-515-BIA-Feb-5-2018-output-1-to-7.txt
new outcome:  []
old outcome:  ['social group']
file:  377420389-A-L-D-G-AXXX-XXX-287-BIA-March-14-2018-output-1-to-4.txt
new outcome:  

# Retrieve correct manually extracted data for comparison.

In [8]:
df_csv = pd.read_csv('manually_scrapped.csv')
df_csv = df_csv[['UUID', 'protected grounds']]

#remove .pdf
df_csv['UUID'] = df_csv['UUID'].str[0:-4] 

#remove different ending of .txt file names
new_pg_df['UUID'] = new_pg_df['UUID'].apply(lambda x : x[0:x.find('output-1-to-') - 1])
old_pg_df['UUID'] = old_pg_df['UUID'].apply(lambda x : x[0:x.find('output-1-to-') - 1])

combined_df = df_csv.merge(new_pg_df, on='UUID', how='outer').merge(old_pg_df, on='UUID', how='outer')
combined_df.head()

Unnamed: 0,UUID,protected grounds,new_pg,old_pg
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,,[nationality],[nationality]
1,165227167-K-O-A-BIA-Aug-27-2013,"Social, Political",[],[political]
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,,[race],[race]
3,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,,[],[]
4,202216334-Francisco-Hernandez-Pina-A073-976-63...,,[],[]


In [9]:
import numpy as np

def compare_pgs(row, version, ground=None):
    if ground == None:
        if row['protected grounds'] != row['protected grounds']:
            return row[version+'_pg'] == []

        list1 = row['protected grounds'].lower().split(', ')
        list2 = row[version+'_pg']
        list1.sort()
        list2.sort()
        if len(list1) != len(list2):
            return False
        for i in range(len(list1)):
            if list1[i] != list2[i]:
                return False
        return True
    else:
        list2 = row[version+'_pg']
        if row['protected grounds'] != row['protected grounds']:
            return ground not in list2
        list1 = row['protected grounds'].lower().split(', ')
        return (ground in list1) == (ground in list2)

In [10]:
ground = 'race'

combined_df['old_accurate'] = combined_df.apply(compare_pgs, args=['old', ground], axis=1)
combined_df['new_accurate'] = combined_df.apply(compare_pgs, args=['new', ground], axis=1)

old_accuracy = combined_df['old_accurate'].sum()/len(combined_df)*100
new_accuracy = combined_df['new_accurate'].sum()/len(combined_df)*100
print('old accuracy: ', old_accuracy, "%")
print('new accuracy: ', new_accuracy, "%")
print("improvement:   ", new_accuracy - old_accuracy, "%")

old accuracy:  97.6470588235294 %
new accuracy:  98.23529411764706 %
improvement:    0.5882352941176521 %


In [11]:
diff_df = combined_df[combined_df['new_accurate'] == False]
print(len(diff_df))
diff_df.head(20)

3


Unnamed: 0,UUID,protected grounds,new_pg,old_pg,old_accurate,new_accurate
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,,[race],[race],False,False
16,337540716-Clebson-Sousa-Carneiro-A078-254-701-...,"race, nationality, political",[social group],"[nationality, political, race, social group]",True,False
114,443776197-L-F-A-AXXX-XXX-919-BIA-Dec-18-2019,"race, social",[],"[race, social group]",True,False


In [12]:
changes_df = combined_df[combined_df['new_accurate'] != combined_df['old_accurate']]
print(len(changes_df))
changes_df

5


Unnamed: 0,UUID,protected grounds,new_pg,old_pg,old_accurate,new_accurate
16,337540716-Clebson-Sousa-Carneiro-A078-254-701-...,"race, nationality, political",[social group],"[nationality, political, race, social group]",True,False
30,377420389-A-L-D-G-AXXX-XXX-287-BIA-March-14-2018,Social Group,[political],"[religion, political, race, social group]",False,True
73,411263766-J-W-A-L-AXXX-XXX-848-BIA-April-26-2019,,[nationality],"[religion, race, political]",False,True
97,431711050-C-N-A-AXXX-XXX-484-BIA-Sept-12-2019,political; social,[political],"[religion, political, race, social group]",False,True
114,443776197-L-F-A-AXXX-XXX-919-BIA-Dec-18-2019,"race, social",[],"[race, social group]",True,False
