In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token


[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load the small English spaCy pipeline downloaded in the first cell; BIACase
# parses case text through this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sorted because os.listdir returns names in arbitrary order; a fixed order
# keeps the per-file loop output and the resulting DataFrame rows reproducible.
filenames = sorted(os.listdir('./hrfCases')) # Wherever files are located

In [4]:
def similar(a: str, return_b: str, min_score: float) -> Union[str, None]:
    """
    • Scores how alike the two strings are with difflib's SequenceMatcher.
    • Hands back `return_b` when that score reaches `min_score`; otherwise
    the result is None.
    """
    score = SequenceMatcher(None, a, return_b).ratio()
    return return_b if score >= min_score else None


def similar_in_list(lst: Union[List[str], Iterator[str]]) -> Callable:
    """
    • Uses a closure on the supplied candidates to return a function that
    searches them, in order, for the first term similar to a given item.
    It's used widely in the scraper.
    • The candidates are copied into a tuple up front, so a one-shot
    iterator can be supplied and the returned function can still be called
    any number of times.
    """
    # Snapshot once: an iterator would otherwise be exhausted by the first
    # lookup, making every later lookup silently return None.
    candidates = tuple(lst)

    def impl(item: str, min_score: float) -> Union[str, None]:
        """
        • Returns the first candidate whose similarity to `item` is at least
        `min_score`, or None when no candidate qualifies.
        """
        for candidate in candidates:
            match = similar(item, candidate, min_score)
            if match:
                return match
        return None

    return impl

In [13]:
class BIACase:
    """
    • Holds the spaCy parse of one BIA case and exposes methods that scrape
    individual fields from it.
    """

    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """
        self.doc: Doc = nlp(text)
        self.ents: Tuple[Span, ...] = self.doc.ents
        self.state = None
        self.city = None

    def get_surrounding_sents(self, token: "Token") -> "Span":
        """
        • Returns the span made of the sentence containing `token` plus the
        sentence immediately before and the sentence immediately after it,
        when those exist.
        """
        sentence = token.sent
        start, end = sentence.start, sentence.end
        if start > 0:
            # The token just before this sentence belongs to the previous one.
            start = self.doc[start - 1].sent.start
        if end < len(self.doc):
            # `end` is exclusive, so it indexes the next sentence's first token.
            end = self.doc[end].sent.end
        return self.doc[start:end]

    def get_interpreter(self):
        """
        • Baseline extractor used for comparison: always reports False.
        """
        return False

    def get_interpreter_new(self):
        """
        • If the terms "interpreter" or "translator" appear in the document,
        the field will return whether the asylum seeker had access to an
        interpreter during their hearings. Currently, the field's output is
        dependent on occurrence of specific tokens in the document; this method
        needs to be fine-tuned and validated.
        • Only the first keyword whose surrounding sentences mention
        "requested" decides the result: True when those sentences also contain
        "granted" or "was present", False otherwise. Returns False when no
        such keyword is found.
        """
        min_score = 0.9
        # Built once per call rather than once per token.
        find_keyword = similar_in_list(['interpreter', 'translator'])

        for token in self.doc:
            if not find_keyword(token.text.lower(), min_score):
                continue

            context = self.get_surrounding_sents(token).text.lower()
            if 'requested' in context:
                return 'granted' in context or 'was present' in context
        return False
    


In [14]:
# Run the old and new interpreter extractors over every case file, keeping
# each result keyed by file name and reporting the cases where they disagree.
new_interpreter_dict = {}
old_interpreter_dict = {}

for file in filenames:
    # `with` closes the handle even if reading or parsing the case raises.
    with open(f"./hrfCases/{file}", "r", encoding='utf-8') as f:
        case = BIACase(f.read())

    old_interpreter = case.get_interpreter()
    old_interpreter_dict[file] = old_interpreter
    new_interpreter = case.get_interpreter_new()
    new_interpreter_dict[file] = new_interpreter

    if old_interpreter != new_interpreter:
        # Name the file so a disagreement can be traced back to its case.
        print('case: ', file)
        print('new outcome: ', new_interpreter)
        print('old outcome: ', old_interpreter)

# One row per file; list() hands pandas concrete (file name, result) pairs.
new_interpreter_df = pd.DataFrame(list(new_interpreter_dict.items()), columns=['UUID', 'new_interpreter'])
old_interpreter_df = pd.DataFrame(list(old_interpreter_dict.items()), columns=['UUID', 'old_interpreter'])

NameError: name 'old_outcome' is not defined

# Retrieve the correct, manually extracted data for comparison.

In [7]:
# Load the manually extracted interpreter labels that the scraper output is
# compared against.
df_csv = pd.read_csv('manually_scrapped.csv')
# .copy() so the UUID assignment below edits an independent frame rather than
# a slice of the full CSV.
df_csv = df_csv[['UUID', 'interpreter']].copy()

#remove .pdf
df_csv['UUID'] = df_csv['UUID'].str[0:-4]


def _strip_output_suffix(name: str) -> str:
    """
    • Drops everything from the character before "output-1-to-" onward
    (e.g. a "-output-1-to-11.txt" ending) from a converted file name.
    • Names without that marker come back unchanged, which keeps this cell
    safe to re-run.
    """
    marker = name.find('output-1-to-')
    # marker - 1 also removes the separator character just before "output".
    return name[:marker - 1] if marker > 0 else name


#remove different ending of .txt file names
new_interpreter_df['UUID'] = new_interpreter_df['UUID'].apply(_strip_output_suffix)
old_interpreter_df['UUID'] = old_interpreter_df['UUID'].apply(_strip_output_suffix)

# Outer merges keep cases that appear in only one of the three sources.
combined_df = df_csv.merge(new_interpreter_df, on='UUID', how='outer').merge(old_interpreter_df, on='UUID', how='outer')
combined_df.head()

140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013.pdf
140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013-output-1-to-11.txt
140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013-output-1-to-11.txt
140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013
140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013
140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013


Unnamed: 0,UUID,outcome,new_outcome,old_outcome
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,Dismissed,[dismissed],"[dismissed, granted]"
1,165227167-K-O-A-BIA-Aug-27-2013,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"
3,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,Remanded,[remanded],[remanded]
4,202216334-Francisco-Hernandez-Pina-A073-976-63...,Sustained; remanded,"[remanded, sustained]","[remanded, sustained]"


In [9]:
# Flag each row as accurate or not, once per extractor version; the version
# label is handed to compare_outcomes as its extra positional argument.
# NOTE(review): compare_outcomes is not defined in the visible cells — confirm
# it is in scope before running.
combined_df['old_accurate'] = combined_df.apply(compare_outcomes, axis=1, args=['old'])
combined_df['new_accurate'] = combined_df.apply(compare_outcomes, axis=1, args=['new'])

# Share of rows flagged accurate, as a percentage of all rows.
old_accuracy, new_accuracy = (
    combined_df[column].sum() / len(combined_df) * 100
    for column in ('old_accurate', 'new_accurate')
)

print('old accuracy: ', old_accuracy, "%")
print('new accuracy: ', new_accuracy, "%")
print("improvement:   ", new_accuracy - old_accuracy, "%")

old accuracy:  91.76470588235294 %
new accuracy:  100.0 %
improvement:    8.235294117647058 %


In [10]:
# Rows where the new extractor is flagged as not accurate, with their count.
new_is_wrong = combined_df['new_accurate'].eq(False)
diff_df = combined_df.loc[new_is_wrong]
print(len(diff_df))
diff_df.head(20)

0


Unnamed: 0,UUID,outcome,new_outcome,old_outcome,old_accurate,new_accurate


In [11]:
# Rows whose accuracy flag differs between the old and new extractor, with
# their count.
flag_changed = combined_df['new_accurate'].ne(combined_df['old_accurate'])
changes_df = combined_df.loc[flag_changed]
print(len(changes_df))
changes_df

14


Unnamed: 0,UUID,outcome,new_outcome,old_outcome,old_accurate,new_accurate
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,Dismissed,[dismissed],"[dismissed, granted]",False,True
6,208167027-J-M-S-B-W-AXX-XXX-109-BIA-Apr-4-2003,denied; granted; remanded,"[denied, granted, remanded]","[denied, granted]",False,True
18,349320269-S-D-AXXX-XXX-230-BIA-April-26-2017,sustained; terminated,"[sustained, terminated]","[sustained, terninated]",False,True
22,362583855-Eric-Omari-Thorpe-A047-924-686-BIA-S...,granted; vacated; terminated,"[granted, terminated, vacated]",[granted],False,True
43,393704112-M-Z-AXXX-XXX-502-BIA-Nov-1-2018,sustained; vacated; terminated,"[sustained, terminated, vacated]","[remanded, sustained, terminated, vacated]",False,True
49,398004870-R-A-M-R-AXXX-XXX-851-BIA-Dec-17-2018,remanded; sustained,"[remanded, sustained]","[denied, remanded, sustained]",False,True
62,402310836-M-F-H-AXXX-XXX-149-BIA-Feb-13-2019,dismissed; remanded; affirmed,"[affirmed, dismissed, remanded]","[dismissed, remanded]",False,True
65,402316607-R-R-P-AXXX-XXX-272-BIA-Feb-22-2019,Sustained; remanded,"[remanded, sustained]","[denied, remanded, sustained]",False,True
109,440769960-I-R-M-AXXX-XXX-308-BIA-Nov-22-2019,dismissed; sustained; remanded,"[dismissed, remanded, sustained]","[dismissed, sustained]",False,True
113,443776154-C-T-AXXX-XXX-676-BIA-Dec-18-2019,sustained; remanded; vacated,"[remanded, sustained, vacated]","[denied, remanded, sustained, vacated]",False,True
