In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
filenames = os.listdir('./hrfCases') # Wherever files are located

#### `similar()` and `similar_in_list()` are the current search methods for finding panel members within a case.

In [4]:
def similar(a: str, return_b: str, min_score: float) -> Union[str, None]:
    """
    • Returns 2nd string if similarity score is above supplied
    minimum score. Else, returns None.
    """
    if SequenceMatcher(None, a, return_b).ratio() >= min_score:
        return return_b


def similar_in_list(lst: Union[List[str], Iterator[str]]) -> Callable:
    """
    • Uses a closure on supplied list to return a function that iterates over
    the list in order to search for the first similar term. It's used widely
    in the scraper.
    """

    def impl(item: str, min_score: float) -> Union[str, None]:
        for s in lst:
            s = similar(item, s, min_score)
            if s:
                return s

    return impl

#### `members` is a hard-coded list of judges emulating a stored list, which is preferred over making requests to what could be unreliable links. I made this list from our previous list, and our correct manually extracted panel members.

In [5]:
members = [
    "Adkins-Blanch, Charles K.",
    "Michael P. Baird",
    "Cassidy, William A.",
    "Cole, Patricia A.",
    "Couch, V. Stuart",
    "Creppy, Michael J.",
    "Crossett, John P.",
    "Donovan, Teresa L.",
    "Foote, Megan E.",
    "Geller, Joan B.",
    "Gemoets, Marcos",
    "Gonzalez, Gabriel",
    "Goodwin, Deborah K.",
    "Gorman, Stephanie E.",
    "Grant, Edward R.",
    "Greer, Anne J.",
    "Guendelsberger, John",
    "Hunsucker, Keith E.",
    "Kelly, Edward F.",
    "Kendall Clark, Molly",
    "Liebmann, Beth S.",
    "Liebowitz, Ellen C.",
    "Mahtabfar, Sunita B.",
    "Malphrus, Garry D.",
    "Mann, Ana",
    "Miller, Neil P.",
    "Monsky, Megan Foote",
    "Montante Jr., Phillip J.",
    "Morris, Daniel",
    "Mullane, Hugh G.",
    "Neal, David L.",
    "Noferi, Mark",
    "O'Connor, Blair",
    "O'Herron, Margaret M.",
    "O'Leary, Brian M.",
    "Owen, Sirce E.",
    "Pauley, Roger",
    "Petty, Aaron R.",
    "Pepper, S. Kathleen",
    "RILEY, KEVIN W.",
    "Rosen, Scott",
    "Snow, Thomas G.",
    "Swanwick, Daniel L.",
    "Wendtland, Linda S.",
    "Wetmore, David H.",
    "Wilson, Earle B."
]

In [6]:
len(members)

46

#### Here we'll grab our currently implemented list for testing of potential panel members from Wikipedia, which has only 29 judges, and even though they are repeated in the list above, they are ordered differently, namely from first name to last name, which contributes to more patterns for matching.

In [7]:
judges_url = 'https://en.wikipedia.org/wiki/Board_of_Immigration_Appeals'
html = requests.get(judges_url).text
soup = BeautifulSoup(html, 'html.parser')
table = soup.find("table", class_="wikitable")
web_judges = [itm.get_text().strip() for itm in table.select("td")[1::4]]

web_judges

['David H. Wetmore',
 'Charles Adkins-Blanch',
 'Garry D. Malphrus',
 'Michael P. Baird',
 'William A. Cassidy',
 'V. Stuart Couch',
 'Michael J. Creppy',
 'Deborah K. Goodwin',
 'Stephanie E. Gorman',
 'Edward R. Grant',
 'Anne J. Greer',
 'Keith E. Hunsucker',
 'Edward F. Kelly',
 'Ellen Liebowitz',
 'Sunita B. Mahtabfar',
 'Ana Landazabal Mann',
 'Philip J. Montante, Jr.',
 'Hugh Mullane',
 "Blair O'Connor",
 'Sirce E. Owen',
 'Aaron R. Petty',
 'Kevin W. Riley',
 'Earle B. Wilson',
 'Megan Foote Monsky',
 'Joan B. Geller',
 'Gabriel Gonzalez',
 'Beth Liebmann',
 'Mark Noferi',
 'S. Kathleen Pepper']

In [8]:
len(web_judges)

29

#### Combining the lists gives us one big list of patterns to match

In [9]:
combined_members = members + web_judges
len(combined_members)

75

#### Below is identical to the current code implemented, except for the alternating of `names` in the constructor for `GetJudge`, for which we will test the different lists.

In [19]:
class GetJudge:
    """ Returns the judge's name if a match is found. """
    accuracy = 0.7

    def __init__(self):
        # Currently grabs potential judges names from a URL here.
        # For testing we'll instead alternate `names`

#         names = web_judges
#         names = members
        names = combined_members
        self.is_judge: Callable = similar_in_list(names)

    def __call__(self, name):
        result = self.is_judge(name, self.accuracy)
        if not result:
            flip_name = ' '.join(reversed(name.split(', ')))
            result = self.is_judge(flip_name, self.accuracy)
        return result


class BIACase:
    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """
        self.doc: Doc = nlp(text)
        self.ents: Tuple[Span] = self.doc.ents
        self.if_judge = GetJudge()

    def get_ents(self, labels: List[str]) -> Iterator[Span]:
        """
        • Retrieves entitiess of a specified label(s) in the document,
        if no label is specified, returns all entities
        """
        return (ent for ent in self.ents if ent.label_ in labels)

    def get_panel(self) -> str:
        """
        • Returns the panel members of case in document.
        """
        panel_members: List[str]
        panel_members = []
        possible_members: Iterator[Span]
        possible_members = map(
            lambda ent: ent.text, self.get_ents(['PERSON'])
        )
        for member in possible_members:
            judge: Union[str, None]
            judge = self.if_judge(member)
            if judge:
                panel_members.append(judge)

        return '; '.join(set(panel_members))

#### Helper function for pandas to count the number of members in a cell.

In [11]:
def num_members(x):
    if x != "":
        return len(x.split(';'))
    return 0

# Test list of judges from Wikipedia
#### For each case, initialize a BIACase instance `case`, invoke the get_panel() method, and save the results to the filename in a dictionary.

In [12]:
# ** Change `names` to `web_judges` list in GetJudge() **
web_dict = {}

for file in filenames:
    f = open(f"./hrfCases/{file}", "r")
    case = BIACase(f.read())
    web_dict[file] = case.get_panel()
    f.close()
    

#### Create a dataframe from results dictionary `web_dict`

In [13]:
df_web = pd.DataFrame(web_dict.items(), columns=['case', 'members'])
df_web['num_members'] = df_web['members'].apply(num_members)
df_web.head()

Unnamed: 0,case,members,num_members
0,414384746-W-S-Y-AXXX-XXX-857-BIA-June-6-2019-o...,Hugh Mullane; Michael P. Baird,2
1,437078509-Y-V-P-AXXX-XXX-977-BIA-Nov-6-2019-ou...,Michael P. Baird,1
2,470811646-D-A-C-AXXX-XXX-366-BIA-June-12-2020-...,,0
3,447698812-G-L-P-AXXX-XXX-275-BIA-Jan-22-2020-o...,Beth Liebmann; Hugh Mullane,2
4,386773053-A-A-H-AXXX-XXX-492-BIA-July-18-2018-...,Charles Adkins-Blanch,1


#### The list of judges pulled from Wikipedia, then searched for in our documents via the current implementation, results in finding 115 panel members for 170 cases.

In [14]:
df_web.num_members.sum()

115

# Test hard-coded list of judges
#### Same process as above, just using a different list of judges

In [16]:
# ** Change `names` to `members` list in GetJudge() **
local_dict = {}

for file in filenames:
    f = open(f"./hrfCases/{file}", "r")
    case = BIACase(f.read())
    local_dict[file] = case.get_panel()
    f.close()



In [17]:
df_local = pd.DataFrame(local_dict.items(), columns=['case', 'members'])
df_local['num_members'] = df_local['members'].apply(num_members)
df_local.head()

Unnamed: 0,case,members,num_members
0,414384746-W-S-Y-AXXX-XXX-857-BIA-June-6-2019-o...,Michael P. Baird,1
1,437078509-Y-V-P-AXXX-XXX-977-BIA-Nov-6-2019-ou...,Michael P. Baird,1
2,470811646-D-A-C-AXXX-XXX-366-BIA-June-12-2020-...,,0
3,447698812-G-L-P-AXXX-XXX-275-BIA-Jan-22-2020-o...,,0
4,386773053-A-A-H-AXXX-XXX-492-BIA-July-18-2018-...,,0


#### The list of hard-coded judges, searched for in our documents via the current implementation, results in finding 69 panel members for 170 cases.

In [18]:
df_local.num_members.sum()

69

# Test for a combined list of judges for more searchable patterns.

In [20]:
# ** Change `names` to `combined_members` list in GetJudge() **
combined_dict = {}

for file in filenames:
    f = open(f"./hrfCases/{file}", "r")
    case = BIACase(f.read())
    combined_dict[file] = case.get_panel()
    f.close()


In [21]:
df_combined = pd.DataFrame(combined_dict.items(), columns=['case', 'members'])
df_combined['num_members'] = df_combined['members'].apply(num_members)
df_combined.head()

Unnamed: 0,case,members,num_members
0,414384746-W-S-Y-AXXX-XXX-857-BIA-June-6-2019-o...,Hugh Mullane; Michael P. Baird,2
1,437078509-Y-V-P-AXXX-XXX-977-BIA-Nov-6-2019-ou...,Michael P. Baird,1
2,470811646-D-A-C-AXXX-XXX-366-BIA-June-12-2020-...,,0
3,447698812-G-L-P-AXXX-XXX-275-BIA-Jan-22-2020-o...,Beth Liebmann; Hugh Mullane,2
4,386773053-A-A-H-AXXX-XXX-492-BIA-July-18-2018-...,Charles Adkins-Blanch,1


#### The combined list of judges, searched for in our documents via the current implementation, results in finding 165 panel members for 170 cases.

In [22]:
df_combined.num_members.sum()

155

# Retrieve correct manually extracted data for comparison.

In [23]:
df_csv = pd.read_csv('cleaner_judges.csv')

In [24]:
# Drop rows that didn't put last name first (drops observation count from 170 to 110)
df_csv = df_csv[df_csv['YOUR NAME HERE'] != 'Josiah']
df_csv = df_csv[df_csv['YOUR NAME HERE'] != 'tori']
df_csv = df_csv[df_csv['YOUR NAME HERE'] != 'River']
df_csv = df_csv[df_csv['YOUR NAME HERE'] != 'Kevin']

In [25]:
len(df_csv)

110

In [26]:
df_csv = df_csv[['UUID', 'panel members']]
df_csv['num_members'] = df_csv['panel members'].apply(num_members)
df_csv.columns = ['case', 'panel_members', 'num_panel_members']
df_csv.head()

Unnamed: 0,case,panel_members,num_panel_members
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-20...,"Creppy, Michael J.",1
1,165227167-K-O-A-BIA-Aug-27-2013.pdf,"Pauley, Roger; Wendtland, Linda S.; Donovan, T...",3
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,"Greer, Anne J.; Pauley, Roger; Wendtland, Lind...",3
3,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,"Greer, Anne J.; Pauley, Roger; Wendtland, Lind...",3
4,202216334-Francisco-Hernandez-Pina-A073-976-63...,"Cole, Patricia A.; Greer, Anne J.; Pauley, Roger",3


#### The csv of manual extracted data has 281 panel members for 110 cases.

In [27]:
df_csv.num_panel_members.sum()

281

# Test Phrase Matcher
#### Create `matcher` and add patterns. There are two patterns for each name, the first is the exact name as it is in the `members` list, which grabs 398 matches, and the other removes any periods, which grabs another 29 matches, for a total of 427.

In [28]:
matcher = PhraseMatcher(nlp.vocab)
counter = 0

for judge in members:
    matcher.add(f'PATTERN_{counter}', [nlp(judge)])
    matcher.add(f'PATTERNX_{counter}', [nlp(judge.replace(".",""))])
    counter += 1

In [29]:
matcher_dict = {}

for file in filenames:
    f = open(f"./hrfCases/{file}", "r")
    doc = nlp(f.read())
    f.close()
    
    matches = matcher(doc)
    p_members = set()
    
    if len(matches) > 0:
        for match_id, start, end in matches:
            judge = doc[start:end]
            p_members.add(judge.text)

    matcher_dict[file] = "; ".join(p_members)

In [30]:
df_matcher = pd.DataFrame(matcher_dict.items(), columns=['case', 'members'])
df_matcher['num_members'] = df_matcher['members'].apply(num_members)
df_matcher.head()

Unnamed: 0,case,members,num_members
0,414384746-W-S-Y-AXXX-XXX-857-BIA-June-6-2019-o...,"Malphrus, Garry D.; Mullane, Hugh G.",2
1,437078509-Y-V-P-AXXX-XXX-977-BIA-Nov-6-2019-ou...,"Malphrus, Garry D.; Liebowitz, Ellen C",2
2,470811646-D-A-C-AXXX-XXX-366-BIA-June-12-2020-...,"Donovan, Teresa L.",1
3,447698812-G-L-P-AXXX-XXX-275-BIA-Jan-22-2020-o...,"Liebmann, Beth S.; Mullane, Hugh G.; Kelly, Ed...",3
4,386773053-A-A-H-AXXX-XXX-492-BIA-July-18-2018-...,"Snow, Thomas G; Adkins-Blanch, Charles K.; Kel...",3


### The phrase matcher finds 427 panel member for the 170 cases! Is this the right amount? To find out we merge the lists on case number to compare against the correct manually extracted data.

In [31]:
df_matcher.num_members.sum()

427

#### Strip `case` down to number

In [32]:
df_matcher.case = df_matcher.case.apply(lambda x: x[:9])
df_csv.case = df_csv.case.apply(lambda x: x[:9])

#### Merge on `case`, resulting in 109 observations 

In [33]:
df_merged = df_matcher.merge(df_csv, on='case')
df_merged.shape

(109, 5)

#### `num_members` is how many panel members the phrase matcher found, while `num_panel_members` is how many panel members were manually extracted for that case, for comparison.

In [34]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
df_merged

Unnamed: 0,case,members,num_members,panel_members,num_panel_members
0,470811646,"Donovan, Teresa L.",1,"Donovan, Teresa L.",1
1,386773053,"Snow, Thomas G; Adkins-Blanch, Charles K.; Kel...",3,"Adkins-Blanch, Charles K; Kelly, Edward F.; S...",3
2,481527271,"Creppy, Michael J.",1,"Creppy, Michael J.; MONSKY, MEGAN FOOTE; Hunsu...",3
3,398004870,"Donovan, Teresa L.; Cole, Patricia A.; Greer, ...",3,"Cole, Patricia A.; Greer, Anne J.; Donovan, Te...",3
4,470812246,"Gemoets, Marcos; Morris, Daniel",2,"Hunsucker, Keith; Gemoets, Marcos; Morris, Daniel",3
5,175361890,"Pauley, Roger; Wendtland, Linda S.; Greer, Ann...",3,"Greer, Anne J.; Pauley, Roger; Wendtland, Lind...",3
6,457375787,"Greer, Anne J.; Swanwick, Daniel L.; Donovan, ...",3,"Donavan, Teresa L; Swanwick, Daniel L; Greer, ...",3
7,225423441,"Wendtland, Linda S.; Greer, Anne J.; Pauley, R...",3,"Wendtland, Linda S.; Greer, Anne J.; Pauley, R...",3
8,334139459,"Adkins-Blanch, Charles K.; Greer, Anne J.; O'H...",3,"Greer, Anne J.; Adkins-Blanch, Charles K.; O'H...",3
9,400015107,"Kelly, Edward F.",1,"Kelly, Edward F.",1


# The Phrase Matcher with two patterns for each name is able two find 271 of the 280 panel members, or approximately 97%

In [35]:
df_merged.num_members.sum(), df_merged.num_panel_members.sum()

(271, 280)