In [2]:
import os
import pandas as pd
import requests
import spacy
import string
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Token, Span
from spacy.matcher import Matcher

In [2]:
# Download the en_core_web_md spaCy model that the next cell loads with spacy.load().
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.0.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [3]:
# Shared spaCy pipeline: the `similar` helper reads nlp.vocab and Scraper calls nlp(text).
nlp = spacy.load("en_core_web_md")

In [4]:
# Names of all entries in the local case-text folder.
# NOTE(review): absolute, machine-specific Windows path; the same folder is hard-coded
# again in every file-reading loop below, so each occurrence must change to run elsewhere.
filenames = os.listdir('D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases') # Wherever files are located

In [5]:
# Trial run of spaCy's token Matcher on the protected-grounds vocabulary.
# Every phrase becomes one pattern with a case-insensitive ("LOWER") spec per word.
# TODO: explore the court documents for identifiers that mark where these
# patterns/phrases/tokens should be searched for.
pattern = [
    [{"LOWER": word} for word in phrase.split()]
    for phrase in ("race", "religion", "nationality", "social group", "political opinion")
]
matcher = Matcher(nlp.vocab)
matcher.add('protected_grounds', pattern)

In [6]:
# Parse every case file once with spaCy so later cells can test `similar` and the
# protected-grounds scraping against ready-made documents.
# dict_name maps a 1-based running number (listing order of `filenames`) to the parsed doc.
counter = 0  # stays 0 when `filenames` is empty
dict_name = {}
for counter, file in enumerate(filenames, start=1):
    # `with` closes the handle even when reading or nlp() raises.
    with open(f"D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases\\{file}", "r", encoding='utf-8') as f:
        dict_name[counter] = nlp(f.read())

In [111]:
# Pick two of the documents parsed above for quick experiments.
doc_1, doc_2 = dict_name[1], dict_name[2]
# Apply the protected-grounds matcher to the first one and display what it found.
matches = matcher(doc_1, as_spans=True)
matches

[Nationality,
 Nationality,
 nationality,
 nationality,
 nationality,
 nationality,
 nationality,
 nationality,
 nationality]

In [75]:
def similar(target_phrases, file):
    """Search a parsed document for the given spaCy token patterns.

    Building the Matcher here keeps the field-extraction functions short and
    makes the searched phrases easy to change.

    Parameters
    ----------
    target_phrases :
        Patterns in spaCy Matcher syntax, i.e. a list of patterns where each
        pattern is a list of token-attribute dicts.
    file :
        The document handed to the matcher (callers in this notebook pass a
        parsed spaCy document such as ``self.doc``).

    Returns
    -------
    Whatever the matcher returns when called with ``as_spans=True``.

    Example
    -------
        pattern = [[{"LOWER": "race"}]]
        similar_pg = similar(target_phrases=pattern, file=self.doc)
    """
    # Idea that was sketched but never enabled: strip punctuation from the
    # phrases first (str.maketrans over string.punctuation + str.translate).
    token_matcher = Matcher(nlp.vocab)
    token_matcher.add('target_phrases', target_phrases)
    return token_matcher(file, as_spans=True)

In [142]:
# Debugging why only one "political" ground was returned instead of every hit.
# (Multi-word entries "political activities" / "political leaders" were apparently tried before.)
test = ["nationality", "political"]

for word in test:
    # One case-insensitive token spec per whitespace-separated piece of the phrase.
    target_phrase = [{"LOWER": piece.lower()} for piece in word.split()]
    print(target_phrase)
    similar_pg = similar(target_phrases=[target_phrase], file=dict_name[2])
    print(similar_pg)

[{'LOWER': 'nationality'}]
[Nationality]
[{'LOWER': 'political'}]
[political, political]


In [251]:
def get_protected_grounds(self):
    '''
    Scrape the protected-grounds phrases mentioned in a scanned document.

    Parameters
    ----------
    self : any object exposing a ``doc`` attribute that the ``similar``
        helper can search (this notebook passes parsed spaCy documents).

    Returns
    -------
    list of str
        The distinct matched phrases, lower-cased, in no particular order.
        A "nationality" hit is dropped when its sentence contains the
        substring "act".
    '''
    # list of protected grounds
    # can expand this list and add different phrases to cover more ground
    pattern = [
        [{"LOWER": "race"}],
        [{"LOWER": "religion"}],
        [{"LOWER": "political"}],
        [{"LOWER": "nationality"}],  # currently, phrase is pulled but out of context
        [{"LOWER": "social"}, {"LOWER": "group"}],
        [{"LOWER": "political"}, {"LOWER": "opinion"}],
        [{"LOWER": "political"}, {"LOWER": "offense"}],
        [{"LOWER": "protected"}, {"LOWER": "grounds"}],
    ]

    potential_grounds = similar(target_phrases=pattern, file=self.doc)

    # Build a filtered copy instead of calling remove() on the list being
    # iterated: removing in place shifts the remaining items, so the element
    # right after each removed match was never inspected.
    # NOTE(review): 'act' is a substring test, so sentences containing e.g.
    # "fact" or "action" also drop the match - confirm that is intended.
    kept = [
        match for match in potential_grounds
        if not (match.text.lower() == 'nationality'
                and 'act' in match.sent.text.lower())
    ]

    results = [str(match).lower() for match in kept]

    return list(set(results))

In [421]:
# def get_outcome_new(self):
        
#         outcomes_return = []
#         ordered_outcome = {'ORDER', 'ORDERED'}
#         outcomes_list = ['denied', 'dismissed', 'granted', 'remanded',
#                          'returned', 'sustained', 'terminated',
#                          'vacated', 'affirmed']
#         two_before_exclusion = {'may', 'any', 'has'}
#         one_before_exclusion = {'it', 'has'}
#         # locate where in the document the orders start
#         order_start_i = -1
#         for token in self.doc:
#             if token.text in ordered_outcome:
#                 order_start_i = token.i
#                 break
#         # If we can't find where the orders start, assume they start at the beginning
#         if order_start_i == -1:
#             order_start_i = 0
#         # Locate where in the document the orders end
#         order_end_i = len(self.doc)
#         # Orders end when we see "FOR THE BOARD" or "WARNING"
#         # - this avoids finding keywords in footnotes or warnings
#         for i in range(order_start_i+1, min(order_end_i, len(self.doc) - 2)):
#             if (self.doc[i:i+3].text == "FOR THE BOARD" or
#                 self.doc[i].text == "WARNING"):
#                 order_end_i = i
#                 break
#         # If we can find where the orders start, check the range for each type
#         # of outcome
#         for outcome in outcomes_list:
#             for i in range(order_start_i, order_end_i):
#                 if (similar_outcome(self.doc[i].text, outcome) and
#                     self.doc[i-2].text not in two_before_exclusion and
#                     self.doc[i-1].text not in one_before_exclusion):
#                     outcomes_return.append(outcome)
#                     break
#         return outcomes_return

In [422]:
# def similar_outcome(str1, str2):
#     """
#     Returns True if the strings are off by a single character, and that
#     character is not a 'd' at the end. That 'd' at the end of a word is highly
#     indicative of whether something is actually an outcome.
#     """
#     if abs(len(str1) - len(str2)) > 1:
#         return False
#     min_len = min(len(str1), len(str2))
#     i = 0
#     while i < min_len and str1[i] == str2[i]:
#         i += 1
#     # We've reached the end of one string, the other is one character longer
#     if i == min_len:
#         # If that character is a 'd', return False, otherwise True
#         if ((len(str1) > len(str2) and str1[-1] == 'd')
#             or (len(str2) > len(str1) and str2[-1] == 'd')):
#             return False
#         else:
#             return True
#     # We're looking at a substitution that is 'd' at the end
#     if (i == len(str1) -1 and len(str1) == len(str2)
#         and (str1[-1] == 'd' or str2[-1] == 'd')):
#         return False
#     # We're looking at a substitution other than 'd' at the end
#     i2 = i + 1
#     while i2 < min_len and str1[i2] == str2[i2]:
#         i2 += 1
#     if i2 == len(str1) and i2 == len(str2):
#         return True
#     # We're in the middle, str1 has an extra character
#     if len(str1) == len(str2) + 1:
#         i2 = i
#         while i2 < min_len and str1[i2+1] == str2[i2]:
#             i2 += 1
#         if i2 + 1 == len(str1) and i2 == len(str2):
#             return True
#     # We're in the middle, str2 has an extra character
#     if len(str1) + 1 == len(str2):
#         i2 = i
#         while i2 < min_len and str1[i2] == str2[i2+1]:
#             i2 += 1
#         if i2 == len(str1) and i2 + 1 == len(str2):
#             return True
#     return False

In [264]:
# Try the outcome scraper on document 2, passing the parsed document in place of `self`.
# NOTE(review): the module-level get_outcome_new is commented out in the cell above,
# so on a fresh kernel this call presumably raises NameError - confirm.
get_outcome_new(dict_name[2])

['remanded', 'sustained']

In [302]:
# Try the module-level get_protected_grounds on document 20; the parsed document
# is passed in place of `self`, and the function reads `self.doc` from it.
get_protected_grounds(dict_name[20])

['nationality']

In [469]:
### TEST USING class scraper ###
class Scraper:
    """Keyword-based field extractor for the text of one BIA case.

    Depends on two module-level names defined earlier in this notebook:
    ``nlp`` (the loaded spaCy pipeline) and ``similar`` (the Matcher helper).

    Attributes
    ----------
    doc : result of ``nlp(text)`` for the case text.
    outcome : list returned by ``get_outcome_new`` at construction time.
    appellate : bool, only present after ``is_appellate`` has been called.
    """

    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """

        # creating helpful attributes to improve accuracy of other functions
        self.doc = nlp(text)
        # Call the method on the instance: the bare name get_outcome_new only
        # resolved while a same-named module-level function happened to exist.
        self.outcome = self.get_outcome_new()
        #self.appellate = is_appellate(self)

    def get_protected_grounds(self):
        """Return the protected-grounds phrases found in the document.

        Returns
        -------
        set of str
            Distinct matched phrases, lower-cased, when at least one match
            survives filtering.
        list
            ``['None found']`` when nothing was matched.
        """
        # list of protected grounds
        # can expand this list and add different phrases to cover more ground
        pattern = [
            [{"LOWER": "race"}],
            [{"LOWER": "religion"}],
            [{"LOWER": "nationality"}],  # currently, phrase is pulled but out of context
            [{"LOWER": "social"}, {"LOWER": "group"}],
            [{"LOWER": "political"}, {"LOWER": "opinion"}],
            [{"LOWER": "political"}, {"LOWER": "offense"}],
            #[{"LOWER": "protected"}, {"LOWER": "grounds"}],
            [{"LOWER": "political"}]
            # expand to include political activities case # 165227167
        ]

        # TODO (not implemented): if 'granted' is not in the list of outcomes,
        # scrape the whole document.
        potential_grounds = similar(target_phrases=pattern, file=self.doc)

        # Filter into a new list rather than remove() while iterating: in-place
        # removal shifts the remaining items, so the element following each
        # removed match was skipped and could slip through.
        # NOTE(review): 'act' is a substring test, so "fact"/"action" in the
        # sentence also drop the match - confirm that is intended.
        kept = [
            match for match in potential_grounds
            if not (match.text.lower() == 'nationality'
                    and 'act' in match.sent.text.lower())
        ]

        result = [str(match).lower() for match in kept]

        # An empty result is reported with a sentinel list instead of an empty set.
        if result:
            return set(result)
        else:
            return ['None found']

    def get_outcome_new(self):
        """Return the outcome keywords found in the order section of the document.

        The order section starts at the first token that is exactly 'ORDER' or
        'ORDERED' (or at the start of the document when neither occurs) and ends
        before "FOR THE BOARD" or "WARNING" (or at the end of the document).

        Returns
        -------
        list of str
            Each entry of the outcome vocabulary that has a near-match token in
            that range, in vocabulary order, at most once each.
        """
        outcomes_return = []
        ordered_outcome = {'ORDER', 'ORDERED'}
        outcomes_list = ['denied', 'dismissed', 'granted', 'remanded',
                         'returned', 'sustained', 'terminated',
                         'vacated', 'affirmed']
        # A candidate is ignored when one of these words sits two / one
        # token(s) before it.
        two_before_exclusion = {'may', 'any', 'has'}
        one_before_exclusion = {'it', 'has'}
        # locate where in the document the orders start
        order_start_i = -1
        for token in self.doc:
            if token.text in ordered_outcome:
                order_start_i = token.i
                break
        # If we can't find where the orders start, assume they start at the beginning
        if order_start_i == -1:
            order_start_i = 0
        # Locate where in the document the orders end
        order_end_i = len(self.doc)
        # Orders end when we see "FOR THE BOARD" or "WARNING"
        # - this avoids finding keywords in footnotes or warnings
        for i in range(order_start_i+1, min(order_end_i, len(self.doc) - 2)):
            if (self.doc[i:i+3].text == "FOR THE BOARD" or
                    self.doc[i].text == "WARNING"):
                order_end_i = i
                break
        # Check the range for each type of outcome
        for outcome in outcomes_list:
            for i in range(order_start_i, order_end_i):
                if not self.similar_outcome(self.doc[i].text, outcome):
                    continue
                # Only look back when those tokens exist: a negative index
                # would wrap around and test tokens from the END of the document.
                if i >= 2 and self.doc[i-2].text in two_before_exclusion:
                    continue
                if i >= 1 and self.doc[i-1].text in one_before_exclusion:
                    continue
                outcomes_return.append(outcome)
                break
        return outcomes_return

    @staticmethod
    def similar_outcome(str1, str2):
        """
        Returns True if the strings are identical or off by a single character,
        and that character is not a 'd' at the end. That 'd' at the end of a word
        is highly indicative of whether something is actually an outcome.

        Declared as a staticmethod because it takes no ``self``; it can be called
        as ``Scraper.similar_outcome(a, b)`` or through an instance.
        """
        if abs(len(str1) - len(str2)) > 1:
            return False
        min_len = min(len(str1), len(str2))
        # i = length of the common prefix
        i = 0
        while i < min_len and str1[i] == str2[i]:
            i += 1
        # We've reached the end of one string, the other is at most one character longer
        if i == min_len:
            # If that character is a 'd', return False, otherwise True
            if ((len(str1) > len(str2) and str1[-1] == 'd')
                    or (len(str2) > len(str1) and str2[-1] == 'd')):
                return False
            else:
                return True
        # We're looking at a substitution that is 'd' at the end
        if (i == len(str1) - 1 and len(str1) == len(str2)
                and (str1[-1] == 'd' or str2[-1] == 'd')):
            return False
        # We're looking at a substitution other than 'd' at the end
        i2 = i + 1
        while i2 < min_len and str1[i2] == str2[i2]:
            i2 += 1
        if i2 == len(str1) and i2 == len(str2):
            return True
        # We're in the middle, str1 has an extra character
        if len(str1) == len(str2) + 1:
            i2 = i
            while i2 < min_len and str1[i2+1] == str2[i2]:
                i2 += 1
            if i2 + 1 == len(str1) and i2 == len(str2):
                return True
        # We're in the middle, str2 has an extra character
        if len(str1) + 1 == len(str2):
            i2 = i
            while i2 < min_len and str1[i2] == str2[i2+1]:
                i2 += 1
            if i2 == len(str1) and i2 + 1 == len(str2):
                return True
        return False

    def is_appellate(self):
        """Report whether the exact phrase "Board of Immigration Appeals" occurs.

        Matching is on exact token text (case-sensitive). The result is also
        stored on the instance as ``self.appellate``.

        Returns
        -------
        bool
        """
        pattern = [
            [{"TEXT": "Board"}, {"TEXT": "of"}, {"TEXT": "Immigration"}, {"TEXT": "Appeals"}]
        ]
        find_appellate = similar(target_phrases=pattern, file=self.doc)

        self.appellate = bool(find_appellate)

        return self.appellate
    

In [382]:
# Run the Scraper over every case file and store its protected-grounds result
# under a 1-based running number (listing order of `filenames`).
test_func_dict = {}
count = 0  # stays 0 when `filenames` is empty
for count, file in enumerate(filenames, start=1):
    # `with` closes the handle even when reading or Scraper() raises.
    with open(f"D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases\\{file}", "r", encoding='utf-8') as f:
        case = Scraper(f.read())
    test_func_dict[count] = case.get_protected_grounds()

In [None]:
# Count how many cases produced an empty protected-grounds result.
# The previous version never counted anything: `v is []` compares identity with
# a brand-new list (always False), and the counter was reset and printed on
# every iteration. Test emptiness, accumulate once, print once.
# NOTE(review): the Scraper class in this notebook returns ['None found'] rather
# than an empty container, so that sentinel may need counting too - confirm intent.
count = 0
for v in test_func_dict.values():
    if len(v) == 0:
        count += 1
print(count)

In [455]:
# Collect the 0-based positions (in insertion order of test_func_dict) of every
# result that is not the placeholder string 'Whole doc to be scrapped'.
approved_asylum_rulings = []
for i, c in enumerate(test_func_dict.values()):
    if c != 'Whole doc to be scrapped':
        approved_asylum_rulings.append(i)

In [465]:
# Display the positions collected in the previous cell.
approved_asylum_rulings

[6, 22, 26, 32, 50, 51, 58, 66, 79, 103, 120, 128, 129, 130, 146, 148, 168]

In [470]:
# Re-scrape only the files whose 0-based position in `filenames` is listed in
# approved_asylum_rulings, keyed by file name.
approved_rulings_scrapping = {}
count = 0
for file in filenames:
    filename = count  # 0-based position of this file in the listing
    count += 1
    if filename not in approved_asylum_rulings:
        continue
    # The old `f.close` (no parentheses) never closed anything; `with` closes
    # the handle reliably, including when Scraper() raises.
    with open(f"D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases\\{file}", "r", encoding='utf-8') as f:
        case = Scraper(f.read())
    approved_rulings_scrapping[file] = case.get_protected_grounds()

In [471]:
# Display the per-file protected-grounds results gathered in the previous cell.
approved_rulings_scrapping

{'208167027-J-M-S-B-W-AXX-XXX-109-BIA-Apr-4-2003-output-1-to-3.txt': ['None found'],
 '362583855-Eric-Omari-Thorpe-A047-924-686-BIA-Sept-29-2017-output-1-to-8.txt': ['None found'],
 '371997582-Raul-Gonzalez-A092-143-856-BIA-Feb-2-2018-output-1-to-2.txt': ['None found'],
 '380078935-A-T-AXXX-XXX-662-BIA-May-8-2018-output-1-to-2.txt': ['None found'],
 '398005224-V-M-Z-B-AXXX-XXX-488-BIA-Dec-19-2018-output-1-to-3.txt': ['None found'],
 '398005328-J-C-T-AXXX-XXX-956-BIA-Dec-20-2018-output-1-to-3.txt': {'nationality',
  'political'},
 '400015130-W-M-Y-AXXX-XXX-044-BIA-Jan-28-2019-output-1-to-3.txt': {'nationality',
  'religion'},
 '402318267-J-M-B-AXXX-XXX-197-BIA-Feb-28-2019-output-1-to-4.txt': {'religion',
  'social group'},
 '414382478-S-S-S-AXXX-XXX-836-BIA-May-23-2019-output-1-to-3.txt': ['None found'],
 '431716946-J-H-G-AXXX-XXX-552-BIA-Oct-7-2019-output-1-to-3.txt': {'nationality'},
 '447696403-F-J-AXXX-XXX-616-BIA-Jan-13-2020-output-1-to-3.txt': {'social group'},
 '452083423-A-M-A-H

In [232]:
# Run the Scraper over every case file and keep its protected-grounds result,
# keyed by file name (an earlier variant keyed parsed docs by a running number).
case_data = {}
for file in filenames:
    # `with` closes the handle even when reading or Scraper() raises.
    with open(f"D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases\\{file}", "r", encoding='utf-8') as f:
        case = Scraper(f.read())
    case_data[file] = case.get_protected_grounds()

In [313]:
# Display the per-file protected-grounds results built in the previous cell.
case_data

{'140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013-output-1-to-11.txt': ['nationality',
  'nationality',
  'nationality',
  'nationality',
  'nationality',
  'nationality',
  'nationality'],
 '165227167-K-O-A-BIA-Aug-27-2013-output-1-to-5.txt': ['political',
  'political'],
 '171952033-Luis-Narciso-Sedeno-Trujillo-A088-190-240-BIA-Sept-22-2010-output-1-to-9.txt': ['race'],
 '175361890-Jose-Zacaria-Quinteros-A088-239-850-BIA-Mar-31-2011-output-1-to-9.txt': ['race'],
 '202216334-Francisco-Hernandez-Pina-A073-976-639-BIA-Jan-19-2012-output-1-to-4.txt': ['nationality'],
 '205871959-M-G-O-AXXX-XXX-611-BIA-Feb-4-2014-output-1-to-5.txt': ['social group',
  'social group',
  'social group',
  'social group',
  'social group'],
 '208167027-J-M-S-B-W-AXX-XXX-109-BIA-Apr-4-2003-output-1-to-3.txt': [],
 '225423441-Roberson-Joseph-A078-360-606-BIA-Nov-18-2013-output-1-to-11.txt': ['nationality',
  'nationality'],
 '271354416-D-M-R-BIA-June-9-2015-output-1-to-6.txt': ['social group',
  'social group

In [431]:
# implementing the make_fields func 
def make_fields():
    """Scrape every file listed in ``filenames`` and collect the extracted fields.

    Reads each case file from the hard-coded case folder, builds a ``Scraper``
    for its text and records three fields per case.

    Returns
    -------
    list of dict
        One dict per file, in ``filenames`` order, with the keys
        'case outcome', 'protected grounds' and 'initial case or appellate'.
    """
    all_cases = []
    for file in filenames:
        # `with` closes the handle even when reading or Scraper() raises.
        with open(f"D:\\Lambda\\Labs\\human-rights-first-asylum-ds-a\\texts\\text cases\\{file}", "r", encoding='utf-8') as f:
            case = Scraper(f.read())
        case_data = {
            'case outcome': case.get_outcome_new(),
            'protected grounds': case.get_protected_grounds(),
            # is_appellate is a method: without the call the dict stored the
            # bound method object instead of the True/False result.
            'initial case or appellate': case.is_appellate()
        }
        all_cases.append(case_data)
    return all_cases

In [473]:
# Run the zero-argument make_fields over all files and display the result.
# NOTE(review): another cell in this notebook redefines make_fields with a required
# `doc` argument, so which definition is live depends on execution order.
make_fields()

[{'case outcome': ['dismissed'],
  'protected grounds': {'nationality'},
  'initial case or appellate': True},
 {'case outcome': ['remanded', 'sustained'],
  'protected grounds': {'political'},
  'initial case or appellate': True},
 {'case outcome': ['remanded', 'sustained'],
  'protected grounds': {'race'},
  'initial case or appellate': True},
 {'case outcome': ['remanded'],
  'protected grounds': {'race'},
  'initial case or appellate': True},
 {'case outcome': ['remanded', 'sustained'],
  'protected grounds': {'nationality'},
  'initial case or appellate': True},
 {'case outcome': ['remanded', 'sustained'],
  'protected grounds': {'social group'},
  'initial case or appellate': True},
 {'case outcome': ['denied', 'granted', 'remanded'],
  'protected grounds': ['None found'],
  'initial case or appellate': True},
 {'case outcome': ['remanded', 'sustained'],
  'protected grounds': {'nationality'},
  'initial case or appellate': True},
 {'case outcome': ['dismissed', 'remanded', 'sust

In [290]:
def make_fields(doc):
    """Extract the outcome and protected-grounds fields for a single case.

    NOTE(review): this definition shadows the zero-argument ``make_fields``
    defined in another cell of this notebook.

    Parameters
    ----------
    doc :
        Handed unchanged to ``Scraper(...)``, whose constructor documents its
        input as the text of a converted BIA case pdf.

    Returns
    -------
    dict
        Keys 'case outcome' and 'protected grounds'.
    """
    # Build an instance: the previous `case = Scrapper` misspelled the class
    # name (NameError) and never created an object to call the methods on.
    case = Scraper(doc)

    case_data = {
        'case outcome': case.get_outcome_new(),
        'protected grounds': case.get_protected_grounds()
    }
    return case_data

In [300]:
# Inspect the parsed text stored under key 16 of dict_name.
dict_name[16]

U.S. Department of Justice
JUSTIC?
Executive Office for Immigration Review
>>FIL
Board of Immigration Appeals
Office of the Clerk
5107 Leesburg Pike, Suite 2000
Falls Church, Virginia 22041
Staples, Kathryn Elizabeth
The Law Offices of Matthew H Green
130 W Cushing Street
Tucson, AZ 85701
DHS/ICE Office of Chief Counsel - EAZ
Eloy Detention Ctr,1705 E. Hanna Rd
Eloy, AZ 85131
Name: LOPEZ-TOVAR, BERTHA A
A 071-904-519
Date of this notice: 12/12/2016
Immigrant & Refugee Appellate Center, LLC
Enclosed is a copy of the Board's decision and order in the above-referenced case.
Sincerely,
Donna Carr
Donna Carr
Chief Clerk
Enclosure
Panel Members:
Pauley, Roger
Guendelsberger, John
Geller, Joan B
www.irac.net
Userteam: Docket
For more unpublished BIA decisions, visit
www.irac.net/unpublished/index/
Cite as: Bertha A. Lopez-Tovar, A071 904 519 (BIA Dec. 12, 2016)
WS
U.S. Department of Justice
Executive Office for Immigration Review
14
Funcil
Board of Immigration Appeals
Office of the Clerk
5107

In [327]:
# Quick check of an emptiness test: the list holds one element, so the
# message is printed.
empty_list = [1]
if len(empty_list) > 0:
    print('not empty')
    

not empty


In [340]:
# Check which container type was stored for this file's protected-grounds result.
type(case_data['140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013-output-1-to-11.txt'])

list

In [472]:
# Check how many entries were stored for this file's protected-grounds result.
len(case_data['140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013-output-1-to-11.txt'])

7

In [None]:
### TEST USING class scraper ###
### back to testing get_protected_grounds ###
class Scraper:
    """Keyword-based protected-grounds extractor for the text of one BIA case.

    Depends on module-level names defined elsewhere in this notebook: ``nlp``
    (the loaded spaCy pipeline), ``similar`` (the Matcher helper) and
    ``get_outcome_new``.
    """

    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """

        # creating helpful attributes to improve accuracy of other functions
        self.doc = nlp(text)
        # NOTE(review): this calls a module-level get_outcome_new; the only
        # module-level definition visible in this notebook is commented out, so
        # construction presumably raises NameError on a fresh kernel - confirm.
        self.outcome = get_outcome_new(self)
        #self.appellate = is_appellate(self)

    def get_protected_grounds(self):
        """Return the protected-grounds phrases found in the document.

        Returns
        -------
        set of str
            Distinct matched phrases, lower-cased, when at least one match
            survives filtering.
        list
            ``['None found']`` when nothing was matched.
        """
        # list of protected grounds
        # can expand this list and add different phrases to cover more ground
        pattern = [
            [{"LOWER": "race"}],
            [{"LOWER": "religion"}],
            [{"LOWER": "nationality"}],  # currently, phrase is pulled but out of context
            [{"LOWER": "social"}, {"LOWER": "group"}],
            [{"LOWER": "political"}, {"LOWER": "opinion"}],
            [{"LOWER": "political"}, {"LOWER": "offense"}],
            #[{"LOWER": "protected"}, {"LOWER": "grounds"}],
            [{"LOWER": "political"}]
            # expand to include political activities case # 165227167
        ]

        # TODO (not implemented): if 'granted' is not in the list of outcomes,
        # scrape the whole document.
        potential_grounds = similar(target_phrases=pattern, file=self.doc)

        # Filter into a new list rather than remove() while iterating: in-place
        # removal shifts the remaining items, so the element following each
        # removed match was skipped and could slip through.
        # NOTE(review): 'act' is a substring test, so "fact"/"action" in the
        # sentence also drop the match - confirm that is intended.
        kept = [
            match for match in potential_grounds
            if not (match.text.lower() == 'nationality'
                    and 'act' in match.sent.text.lower())
        ]

        result = [str(match).lower() for match in kept]

        # An empty result is reported with a sentinel list instead of an empty set.
        if result:
            return set(result)
        else:
            return ['None found']