In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token

In [2]:
nlp = spacy.load("en_core_web_sm")

### There are very few initial cases to work with, here there are 17. Most of our cases are appeals cases, where you can search for panel members, not initial case judges. Cases 175361890, 309306110, 318845018, and 349320269, are initial cases from our current set of cases, while the other 13 came from [here](https://drive.google.com/drive/folders/1_1MLtL3pm-05cff7piUFMLKsMUeubVmX).

In [3]:
filenames = os.listdir('./HRF_original_decisions_txt') # Wherever files are located

### This scrapes a link for a list of immigration judges (not panel members). This is unreliable, and a list of judges should probably be handled with the backend.

In [4]:
judges_url = 'https://www.justice.gov/eoir/eoir-immigration-court-listing#MP' # url containing judge names
html = requests.get(judges_url).text # get html text
soup = BeautifulSoup(html, 'html.parser') # parse html with bs4
tables = soup.find_all("tbody") # get all tables

In [5]:
judge_list = [] # initialize judge name list
for table in tables: # iterate through each table
    for judges in table.find_all('tr')[2:]: # iterate through each row of each table skipping first two rows
        judge_list.extend(list(judges)[4].get_text().strip().replace('\t', '').split('\n')) # get and clean text from judge names cell in each row
        
len(judge_list)

491

### Reformat the names to have the first name first since this is how they are found in the case documents. 

In [6]:
judges = []
for judge in judge_list:
    judges.append(' '.join(reversed(judge.split(', '))))
    print(' '.join(reversed(judge.split(', '))))

John W. Cortes
Jennifer I. Gaz
Paul Habich
Jose L. Penalosa Jr.
Molly S. Frazer
Marni Guerrero
Bruce A. Taylor
Robert C. Bartlemay Sr.
Elizabeth A. Cottor
LaMonte S. Freerks
Joseph S. Imburgia
Ken Josephson
Melissa B. Karlen
Munish Sharda
Linda Spencer-Walters
John W. Davis
Kathryn L. DeAngelis
Sean H. Keenan
Lauren Mathon
Ravit R. Halperin
Steven Marcus
Nathaniel B. Walker
Curtis G. White
Jeffrey V. Muñoz
Christian Pressman
Eugene H. M. Robinson
Nathan N. Aina
Lily C. Hsu
James M. Left
Kristin Piepmeier
Frank Travieso
Janette L. Allen
Joyce Bakke Varzandeh
Ira Bank
Audra Behne
Hye Y. Chon
Philip Costa
Jankhana Desai
Timothy Everett
Leon J. Francis
Andrea H. Hong
Natalie B. Huddleston
Jaime Jasso
Carlos R. Juelle
Jan D. Latimore
Edward F. Lee
Wilbur Lee
Daniel H. Malvin
Nancy E. Miller
Jeannette Lim Park
Sebastian T. Patti
Rachel Ann Ruane
Anita L. Simons
Christine E. Stancill
Gita Vahid-Tehrani
Veronica S. Villegas
Bridget Virchis
Jason R. Waterloo
Brian H. Burke
David Burke
Arlene Do

### Instantiate a matcher, and add each judge's name as a pattern to search for in a document.

In [7]:
matcher = PhraseMatcher(nlp.vocab)

for judge in judges:
    # Add the pattern to the matcher
    matcher.add(f'PATTERN_{judge}', [nlp(judge)])
    

### Go through the documents and search for matches with judges in our list, using the same logic as `get_panel()` does to search for panel members, except with a different list of judges.

In [8]:
matcher_dict = {}

for file in filenames:
    if file != '.DS_Store': # mac added this, ugh, so make sure it isn't this file
        f = open(f"./HRF_original_decisions_txt/{file}", "r")
        print(file)
        doc = nlp(f.read())
        f.close()

        matches = matcher(doc)
        possible_judges = set()

        if len(matches) > 0:
            for match_id, start, end in matches:
                judge = doc[start:end]
                possible_judges.add(judge.text)

        matcher_dict[file] = "; ".join(possible_judges)

175361890-Jose-Zacaria-Quinteros-A088-239-850-BIA-Mar-31-2011-output-1-to-9.txt
IJ Decision in another UB Case w serious non-political crime issues_REDACTED-output-1-to-22.txt
Honduran Women, Redacted Asylum Grant, Arlington VA-output-1-to-16.txt
Redacted Grant, PSG Guatemalan Women-output-1-to-19.txt
Chicago Immigration Judge - Post-AB Domestic Violence Based Grant of Asylum-output-1-to-9.txt
Nicaragua - DV - IJ Rubin Newark, NJ (2020)-output-1-to-12.txt
2020.02.19 Asylum grant_Redacted-output-1-to-15.txt
Redacted IJ Decision %22single Salvadoran mothers lacking male protection%2225072018 (1)-output-1-to-10.txt
Redacted_IJ Decision_Asylum grant for false gang alleg PSG (1)-output-1-to-17.txt
Family PSGAnti-gangPO - IJ Feder 9.18.19_Redacted (1)-output-1-to-14.txt
El Salvador - US labeled gang members - IJ Rubin Newark, NJ (2020)-output-1-to-17.txt
Honduras.Women in domestic relationships asylum grant -Redacted-output-1-to-28.txt
Asylum Grant Honduran Business Owners-output-1-to-50.txt

In [9]:
df_matcher = pd.DataFrame(matcher_dict.items(), columns=['case', 'judge'])
df_matcher

Unnamed: 0,case,judge
0,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,
1,IJ Decision in another UB Case w serious non-p...,
2,"Honduran Women, Redacted Asylum Grant, Arlingt...",Wynne P. Kelly
3,"Redacted Grant, PSG Guatemalan Women-output-1-...",
4,Chicago Immigration Judge - Post-AB Domestic V...,
5,"Nicaragua - DV - IJ Rubin Newark, NJ (2020)-ou...",Shifra Rubin
6,2020.02.19 Asylum grant_Redacted-output-1-to-1...,David C. Koelsch
7,Redacted IJ Decision %22single Salvadoran moth...,
8,Redacted_IJ Decision_Asylum grant for false ga...,
9,Family PSGAnti-gangPO - IJ Feder 9.18.19_Redac...,


# 7 out of 17 is pretty bad!