In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, Union, Callable, Dict, Iterator
from collections import defaultdict
from difflib import SequenceMatcher
import spacy 
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token


[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load spaCy's small English pipeline once at module level; BIACase.__init__
# calls this shared `nlp` object to parse every case text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Wherever files are located. os.listdir() returns entries in arbitrary
# order, so sort them to make processing order (and the printed output of
# later cells) reproducible from one run to the next.
filenames = sorted(os.listdir('./hrfCases'))

In [4]:
def similar(a: str, return_b: str, min_score: float) -> Union[str, None]:
    """
    • Compares the two strings with difflib's SequenceMatcher.
    • Returns the 2nd string when the similarity ratio reaches the supplied
    minimum score. Otherwise, returns None.
    """
    score = SequenceMatcher(None, a, return_b).ratio()
    return return_b if score >= min_score else None


def similar_in_list(lst: Union[List[str], Iterator[str]]) -> Callable:
    """
    • Uses a closure on the supplied candidates to return a function that
    searches them, in order, for the first term similar to a given item.
    It's used widely in the scraper.
    • The returned function takes (item, min_score) and returns the first
    candidate for which `similar` yields a truthy result, or None when no
    candidate qualifies.
    • A list or tuple is kept by reference. Any other iterable (for example
    a generator) is copied once up front, because a one-shot iterator would
    be exhausted by the first search and every later search would silently
    find nothing.
    """
    candidates = lst if isinstance(lst, (list, tuple)) else tuple(lst)

    def impl(item: str, min_score: float) -> Union[str, None]:
        for candidate in candidates:
            match = similar(item, candidate, min_score)
            if match:
                return match
        return None

    return impl

In [11]:
class BIACase:
    """
    Holds the spaCy parse of a single BIA case text and exposes
    keyword-based checks that run against it.
    """

    def __init__(self, text: str):
        """
        • Input will be text from a BIA case pdf file, after the pdf has
        been converted from PDF to text.
        • Scraping works utilizing spaCy, tokenizing the text, and iterating
        token by token searching for matching keywords.
        """
        # Parse once with the module-level pipeline; every check reuses it
        self.doc: Doc = nlp(text)
        self.ents: Tuple[Span] = self.doc.ents
        self.state = None
        self.city = None

    def check_for_one_year_original(self) -> bool:
        """
        Checks whether or not the asylum-seeker argued to be exempt from the
        one-year guideline.  Specifically, it checks to see if the document
        contains the lemma pair "change circumstance" or
        "extraordinary circumstance".  If it does, and one of the five
        context lemmas ("year", "delay", "time", "period", "deadline")
        falls in the window that runs from 10 lemmas before the phrase to
        9 lemmas after it, the function returns True.  Otherwise, it
        returns False.
        """
        terms = ('year', 'delay', 'time', 'period', 'deadline')
        triggers = ('change', 'extraordinary')
        lemma_list = [token.lemma_.lower() for token in self.doc]

        # Stop one short of the end so that idx + 1 is always a valid index
        for idx in range(len(lemma_list) - 1):
            if lemma_list[idx] in triggers \
                    and lemma_list[idx + 1] == 'circumstance':
                # Build the window around THIS occurrence of the phrase.
                # The start is clamped at 0 because a negative slice start
                # would wrap around to the end of the list.
                search_list = lemma_list[max(0, idx - 10): idx + 11]
                if any(term in search_list for term in terms):
                    return True

        return False

    def check_for_one_year(self) -> bool:
        """
        Checks whether or not the asylum-seeker argued to be exempt from the
        one-year guideline.

        Returns True if one of the "within one year" phrases appears in the
        document, or if a primary phrase ("change circumstance",
        "extraordinary circumstance", "untimely application") appears in
        the same sentence as a secondary term.  Otherwise, returns False.
        """
        # If any of these terms appear in the document, return True
        terms_len_3 = {'within one year', 'within 1 year'}
        terms_len_4 = {'within 1 - year', 'within one - year'}

        # If a primary term and a secondary term appear in the same
        # sentence, return True
        primary_terms_len_2 = {'change circumstance',
                               'extraordinary circumstance',
                               'untimely application'}
        primary_terms_len_3 = {'change \" circumstance',
                               'extraordinary \" circumstance'}
        secondary_terms = {'year', 'delay', 'time', 'period', 'deadline'}

        # NOTE(review): the set lookups below rely on Span.lemma_ separating
        # token lemmas with a single space - confirm this holds for the
        # installed spaCy version.
        for i in range(len(self.doc)):
            if self.doc[i:i+2].lemma_ in primary_terms_len_2 \
                    or self.doc[i:i+3].lemma_ in primary_terms_len_3:
                # Scan the whole sentence containing the phrase's first token
                if any(token.lemma_ in secondary_terms
                       for token in self.doc[i].sent):
                    return True

            if self.doc[i:i+3].lemma_ in terms_len_3 \
                    or self.doc[i:i+4].lemma_ in terms_len_4:
                return True

        return False

    def check_for_one_year_new(self) -> bool:
        """
        Checks whether or not the asylum-seeker argued to be exempt from the
        one-year guideline.

        Returns True if the phrase "within one-year" appears in the document.
        Also returns True if a time-based word appears in the same sentence
        with "extraordinary circumstances" or "changed circumstances" or
        "untimely application". Otherwise, returns False.
        """
        # "within", then "1" or "one", an optional hyphen token, then "year"
        year_pattern = [
            [{'LOWER':'within'}, {'LOWER': {'IN':['1', 'one']}},
             {'LOWER': '-', 'OP': '?'}, {'LOWER': 'year'}]
        ]
        matcher = Matcher(nlp.vocab)
        matcher.add('year pattern', year_pattern)
        matches = matcher(self.doc, as_spans=True)
        if matches:
            return True
        # Drop the year pattern so the same matcher can be reused for the
        # phrase patterns below
        matcher.remove('year pattern')

        secondary_terms = {'year', 'delay', 'time', 'period', 'deadline'}
        # The optional middle token allows a closing quote between the
        # two words of the phrase
        circumstance_pattern = [
            [{'LEMMA': {'IN':['change', 'extraordinary']}},
             {'LOWER': {'IN':['"', '”']}, 'OP': '?'}, {'LEMMA': 'circumstance'}]
        ]
        application_pattern = [
            [{'LOWER':'untimely'}, {'LOWER':'application'}]
        ]
        matcher.add('circumstance pattern', circumstance_pattern)
        matcher.add('application pattern', application_pattern)
        matches = matcher(self.doc, as_spans=True)
        for match in matches:
            # match is a Span (as_spans=True); look through its sentence
            for token in match.sent:
                if token.lemma_ in secondary_terms:
                    return True
        return False

In [6]:
# Run the old and new one-year checks on every case file, record each
# result keyed by file name, and print the files where the two disagree.
new_outcome_dict = {}
old_outcome_dict = {}

for file in filenames:
    # The context manager closes the file even if parsing raises
    with open(f"./hrfCases/{file}", "r", encoding='utf-8') as f:
        case = BIACase(f.read())
    old_outcome = case.check_for_one_year()
    old_outcome_dict[file] = old_outcome
    new_outcome = case.check_for_one_year_new()
    new_outcome_dict[file] = new_outcome
    if old_outcome != new_outcome:
        print('case: ', file)
        print('new outcome: ', new_outcome)
        print('old outcome: ', old_outcome)

# One row per file: (file name, boolean outcome). The UUID column still
# holds the raw file name at this point.
new_outcome_df = pd.DataFrame(list(new_outcome_dict.items()), columns=['UUID', 'new_check_for_one_year'])
old_outcome_df = pd.DataFrame(list(old_outcome_dict.items()), columns=['UUID', 'old_check_for_one_year'])

0
1
0
0
0
2
0
0
0
0
0
0
0
3
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
8
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [14]:
import time

# Time the new check, then the old one, over every case file and print the
# elapsed seconds for each (new first, old second).
# Note that each timed loop also reads the file and builds a BIACase, which
# runs the spaCy pipeline, so both figures include the parsing cost.
for method_name in ('check_for_one_year_new', 'check_for_one_year'):
    # perf_counter is a monotonic, high-resolution clock meant for
    # measuring short durations
    start_time = time.perf_counter()

    for file in filenames:
        # The context manager closes the file even if parsing raises
        with open(f"./hrfCases/{file}", "r", encoding='utf-8') as f:
            case = BIACase(f.read())
        getattr(case, method_name)()

    end_time = time.perf_counter()
    print(end_time - start_time)

44.68866777420044
49.58596181869507


# Retrieve correct manually extracted data for comparison.

In [7]:
import numpy as np

# Manually extracted labels used as ground truth. Only the ID and label
# columns are needed; .copy() makes the UUID assignment below write to an
# independent frame rather than to a slice of the original.
df_csv = pd.read_csv('manually_scrapped.csv')
df_csv = df_csv[['UUID', 'check for one year']].copy()

# Remove the trailing ".pdf" (drops the last 4 characters - this assumes
# every UUID in the CSV ends with a 4-character extension).
df_csv['UUID'] = df_csv['UUID'].str[0:-4]


def _strip_output_suffix(name: str) -> str:
    """
    Cuts a file name just before the separator that precedes
    'output-1-to-'. Names without that marker are returned unchanged, so
    re-running this cell on already-trimmed names leaves them as they are
    instead of chopping characters off the end.
    """
    marker_at = name.find('output-1-to-')
    return name[0:marker_at - 1] if marker_at > 0 else name


# Remove the different ending of the .txt file names
new_outcome_df['UUID'] = new_outcome_df['UUID'].apply(_strip_output_suffix)
old_outcome_df['UUID'] = old_outcome_df['UUID'].apply(_strip_output_suffix)

combined_df = df_csv.merge(new_outcome_df, on='UUID', how='outer').merge(old_outcome_df, on='UUID', how='outer')
# Keep only rows whose manual label is the string '0' or '1'; .copy() so
# the new column below is added to an independent frame.
has_label = (combined_df['check for one year'] == '0') | (combined_df['check for one year'] == '1')
combined_df = combined_df[has_label].copy()
# Convert the string label to a boolean for comparison with the checks
combined_df['check_for_one_year'] = combined_df['check for one year'] == '1'
combined_df = combined_df.drop('check for one year', axis=1)
combined_df.head()

Unnamed: 0,UUID,new_check_for_one_year,old_check_for_one_year,check_for_one_year
0,140194281-Ali-Fares-A047-654-200-BIA-Apr-30-2013,False,False,False
1,165227167-K-O-A-BIA-Aug-27-2013,True,True,True
2,171952033-Luis-Narciso-Sedeno-Trujillo-A088-19...,False,False,False
3,175361890-Jose-Zacaria-Quinteros-A088-239-850-...,False,False,False
4,202216334-Francisco-Hernandez-Pina-A073-976-63...,False,False,False


In [8]:
# Flag, per case, whether each implementation agrees with the manual label
manual_label = combined_df['check_for_one_year']
combined_df['old_accurate'] = (manual_label == combined_df['old_check_for_one_year'])
combined_df['new_accurate'] = (manual_label == combined_df['new_check_for_one_year'])

# Accuracy = share of cases that agree with the manual label, as a percentage
total_cases = len(combined_df)
old_hits = combined_df['old_accurate'].sum()
new_hits = combined_df['new_accurate'].sum()
old_accuracy = old_hits / total_cases * 100
new_accuracy = new_hits / total_cases * 100

print('old accuracy: ', old_accuracy, "%")
print('new accuracy: ', new_accuracy, "%")
print("improvement:   ", new_accuracy - old_accuracy, "%")

old accuracy:  100.0 %
new accuracy:  100.0 %
improvement:    0.0 %


In [9]:
# Cases where the new check disagrees with the manual label
new_is_wrong = ~combined_df['new_accurate']
diff_df = combined_df.loc[new_is_wrong]
print(diff_df.shape[0])
diff_df.head(20)

0


Unnamed: 0,UUID,new_check_for_one_year,old_check_for_one_year,check_for_one_year,old_accurate,new_accurate


In [10]:
# Cases where exactly one of the two implementations matches the manual label
accuracy_differs = combined_df['old_accurate'] != combined_df['new_accurate']
changes_df = combined_df.loc[accuracy_differs]
print(changes_df.shape[0])
changes_df

0


Unnamed: 0,UUID,new_check_for_one_year,old_check_for_one_year,check_for_one_year,old_accurate,new_accurate
