In [1]:
# Dependencies go in this cell.
# The script env.sh in this directory should create an Anaconda environment with
# all these dependencies installed

# Python built-in packages
import functools
import importlib
from typing import *

# Libraries
import numpy as np
import pandas as pd
import regex
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# TEMPORARY until we can require Python 3.8 and use the built-in functools.cached_property
from memoized_property import memoized_property

# [re]import our local library code
import pandas_text as pt
importlib.reload(pt)

<module 'pandas_text' from '/Users/freiss/pd/pandas_text/__init__.py'>

In [2]:
# Root of the SystemT extractor library checkout. Only the alternate test
# document (commented out below) lives under this directory.
SYSTEMT_ROOT = "./SystemT/SystemT/biginsights-extractor-library"
#TEST_DOC_FILE = (SYSTEMT_ROOT + 
#                "/data/unlabeled/ace2005trainingDoc/converted_20030303.1900.00.CNN_CF_sgm.txt")
TEST_DOC_FILE = "example_doc.txt"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the rules below work in character offsets into TEST_TEXT, so the
# decoded text must be the same on every machine that runs this notebook.
with open(TEST_DOC_FILE, "r", encoding="utf-8") as f:
    TEST_TEXT = f.read()

# NOTE(review): this constant is not referenced by any cell visible in this
# notebook; Resources.GlobalLastNameDict spells out the same file name itself.
LAST_NAMES_DICT_FILE = "last_name.dict"

In [4]:
# Utility functions start in this cell and eventually migrate to the pandas_text package.


In [5]:
class Resources:
    """
    Data structures that are loaded once, as opposed to recreated on
    every document.

    This category includes tokenizers, dictionaries, and compiled regexes.

    Everything in this class is a cached property: each value is built on
    first access and the same object is returned on every later access.
    """

    @memoized_property
    def Tokenizer(self):
        """
        SpaCy tokenizer for English.
        """
        # Use the tokenizer that the `English` language object builds for
        # itself rather than calling `nlp.Defaults.create_tokenizer(nlp)`:
        # that factory exists only in spaCy 2.x, whereas the `tokenizer`
        # attribute is available in both spaCy 2.x and 3.x.
        return English().tokenizer


    ############################
    # DICTIONARIES

    @memoized_property
    def GlobalFirstNameDict(self):
        """
        Dictionary loaded from "first_name.dict" with `pt.load_dict`, using
        this object's tokenizer.
        """
        return pt.load_dict("first_name.dict", self.Tokenizer)

    @memoized_property
    def GlobalLastNameDict(self):
        """
        Dictionary loaded from "last_name.dict" with `pt.load_dict`, using
        this object's tokenizer.
        """
        return pt.load_dict("last_name.dict", self.Tokenizer)

    ############################
    # REGEXES

    @memoized_property
    def CapsWordRegex(self):
        """
        Compiled regex: one ASCII capital letter followed by zero or more
        ASCII lowercase letters.
        """
        return regex.compile("[A-Z][a-z]*")
    

In [6]:
class Document:
    """
    Rules that define the fields of the current document.

    In this case the document is a single string.
    """
    def __init__(self, doc_text: str, resources: Resources):
        """
        Args:
            doc_text: Full text of the document
            resources: Shared resources; only its `Tokenizer` is used here
        """
        self._text = doc_text
        self._resources = resources

    @property
    def Text(self) -> str:
        """The document text, exactly as passed to the constructor."""
        return self._text

    def token_substr(self, begin: int, end: int) -> str:
        """
        Retrieve a substring via token boundaries.

        Args:
            begin: First token in the substring
            end: 1 past the last token in the substring

        Returns tokens begin through (end-1), inclusive, of the document text,
        with the original whitespace between tokens. An empty token range
        (end <= begin) yields the empty string.
        """
        # Handle the empty range up front. Without this guard, begin == end == 0
        # would look up token "-1" and fail, while every other empty range
        # already produced "" through an empty slice.
        if end <= begin:
            return ""
        # The "char_offsets" column keeps the DataFrame's default 0..n-1 index,
        # so a token's label is also its position.
        char_offsets = self.Tokens["char_offsets"]
        begin_char_offset = char_offsets[begin].left
        end_char_offset = char_offsets[end - 1].right
        return self.Text[begin_char_offset:end_char_offset]

    @memoized_property
    def Tokens(self) -> pd.DataFrame:
        """
        The tokens of the document, one row per token, computed once and cached.

        Columns:
            token_id: Position of the token in the document, starting at 0
            char_offsets: Left-closed interval [begin, end) of the token's
                character offsets in the document text
            text: Text of the token
        """
        # Tokenize with SpaCy. SpaCy tokenizer returns a Document object.
        spacy_doc = self._resources.Tokenizer(self._text)
        tok_begins = [t.idx for t in spacy_doc]
        tok_ends = [t.idx + len(t) for t in spacy_doc]
        tok_intervals = pd.IntervalIndex.from_arrays(tok_begins, tok_ends, closed="left")
        tok_texts = [t.text for t in spacy_doc]
        num_toks = len(tok_texts)
        # Keep the default index: token_substr() looks tokens up by label and
        # relies on labels being equal to token positions.
        return pd.DataFrame({"token_id": pd.Series(range(num_toks)),
                             "char_offsets": tok_intervals,
                             "text": tok_texts})

In [7]:
class Dictionaries:
    """
    Dictionary-matching rules, evaluated over the raw tokens of one document.

    Each rule is a cached property: the match runs on first access only.
    """
    def __init__(self, d: Document, resources: Resources):
        """
        Args:
            d: Document whose tokens are matched
            resources: Source of the loaded dictionaries
        """
        self._resources = resources
        self._d = d

    @memoized_property
    def GlobalFirstName(self):
        """Result of `pt.extract_dict` for the first-name dictionary."""
        doc_tokens = self._d.Tokens
        first_names = self._resources.GlobalFirstNameDict
        return pt.extract_dict(doc_tokens, first_names)

    @memoized_property
    def GlobalLastName(self):
        """Result of `pt.extract_dict` for the last-name dictionary."""
        doc_tokens = self._d.Tokens
        last_names = self._resources.GlobalLastNameDict
        return pt.extract_dict(doc_tokens, last_names)



In [8]:
class Regexes:
    """
    Regular-expression rules, evaluated over the raw tokens of one document.
    """
    def __init__(self, d: Document, resources: Resources):
        """
        Args:
            d: Document whose text and tokens are matched
            resources: Source of the compiled regexes
        """
        self._resources = resources
        self._d = d

    @property
    def CapsWord(self):
        """
        A single token that begins with a capital letter and has no further
        capital letters after it.

        This is a plain property, so the extraction is re-run on every access.
        """
        offsets = self._d.Tokens["char_offsets"]
        text = self._d.Text
        caps_pattern = self._resources.CapsWordRegex
        return pt.extract_regex_tok(token_offsets=offsets,
                                    target_str=text,
                                    compiled_regex=caps_pattern)
    


In [9]:
class PersonName:
    """
    Rules that extract potential person name entities.

    Both rules pair a first-name match with a match that directly follows it
    (gap of exactly 0); they differ only in what may follow.
    """
    def __init__(self, doc: Document, dicts: Dictionaries, regexes: Regexes):
        """
        Args:
            doc: The document being processed
            dicts: Dictionary matches over the document
            regexes: Regex matches over the document
        """
        self._doc = doc
        self._dicts = dicts
        self._regexes = regexes

    @staticmethod
    def _join_adjacent(first_matches, last_matches):
        """
        Join two match series with `pt.adjacent_join`, requiring a gap of
        exactly 0 and labelling the outputs "first_name" and "last_name".
        """
        return pt.adjacent_join(first_series=first_matches,
                                second_series=last_matches,
                                first_name="first_name",
                                second_name="last_name",
                                min_gap=0,
                                max_gap=0)

    @property
    def Person1(self):
        """
        <match of GlobalFirstName dict> <match of GlobalLastName dict>
        """
        first_hits = self._dicts.GlobalFirstName
        last_hits = self._dicts.GlobalLastName
        return self._join_adjacent(first_hits["matches"], last_hits["matches"])

    @property
    def Person2(self):
        """
        <match of GlobalFirstName dict> <capitalized word>
        """
        first_hits = self._dicts.GlobalFirstName
        caps_hits = self._regexes.CapsWord
        return self._join_adjacent(first_hits["matches"], caps_hits["matches"])



In [10]:
# Wire the rule classes together for the test document. These constructors
# only store references; tokenization and matching happen lazily, when a
# rule property is first read in the cells below.
resources = Resources()
doc = Document(TEST_TEXT, resources)
dicts = Dictionaries(doc, resources)
regexes = Regexes(doc, resources)
persons = PersonName(doc, dicts, regexes)

In [11]:
# Display the capitalized-word matches found in the test document.
regexes.CapsWord

Unnamed: 0,matches
0,"[0, 1)"
4,"[4, 5)"
5,"[5, 6)"
6,"[6, 7)"
7,"[7, 8)"
...,...
967,"[967, 968)"
968,"[968, 969)"
979,"[979, 980)"
988,"[988, 989)"


In [12]:
# Display candidates of the form <first-name dict match> <last-name dict match>.
persons.Person1

  result = result.to_dense()


Unnamed: 0,first_name,last_name
0,"[173, 174)","[174, 175)"
1,"[575, 576)","[576, 577)"
2,"[549, 550)","[550, 551)"
3,"[299, 300)","[300, 301)"
4,"[4, 5)","[5, 8)"
5,"[9, 10)","[10, 12)"
6,"[305, 306)","[306, 307)"


In [13]:
# Display candidates of the form <first-name dict match> <capitalized word>.
persons.Person2

Unnamed: 0,first_name,last_name
0,"[173, 174)","[174, 175)"
1,"[283, 284)","[284, 285)"
2,"[575, 576)","[576, 577)"
3,"[549, 550)","[550, 551)"
4,"[289, 290)","[290, 291)"
5,"[589, 590)","[590, 591)"
6,"[299, 300)","[300, 301)"
7,"[279, 280)","[280, 281)"
8,"[429, 430)","[430, 431)"
9,"[491, 492)","[492, 493)"


In [None]:
# Show the pandas version this notebook is running against.
pd.__version__