In [1]:
# Dependencies go in this cell.
# The script env.sh in this directory should create an Anaconda environment with
# all these dependencies installed

# Python built-in packages
import functools
import importlib
from typing import *

# Libraries
import numpy as np
import pandas as pd
import regex
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# TEMPORARY until we can use Python 3.8 functools' built-in memoized property
from memoized_property import memoized_property

# [re]import our local library code
import pandas_text as pt
importlib.reload(pt)

<module 'pandas_text' from '/Users/freiss/pd/pandas_text/__init__.py'>

In [2]:
# Path of the document to analyze. To use the other sample file, set this to
# "resources/example_doc.txt" instead.
TEST_DOC_FILE = "resources/short_example_doc.txt"

# Read the whole document into a single string (text mode is open()'s default).
with open(TEST_DOC_FILE) as doc_file:
    TEST_TEXT = doc_file.read()


In [3]:
class Resources:
    """
    Data structures that are loaded once, as opposed to recreated on
    every document.

    This category includes tokenizers, dictionaries, and compiled regexes.

    Everything in this class is a cached property: each value is built on
    first access and reused on later accesses of the same instance.
    """

    @memoized_property
    def Tokenizer(self):
        """
        Tokenizer of a blank spaCy English pipeline.

        Reads the pipeline's own ``tokenizer`` attribute rather than calling
        ``nlp.Defaults.create_tokenizer(nlp)``: that factory no longer exists
        in spaCy 3, while ``nlp.tokenizer`` is available in spaCy 2 and 3.
        """
        return English().tokenizer


    ############################
    # DICTIONARIES

    @memoized_property
    def GlobalFirstNameDict(self):
        """
        Dictionary loaded by ``pt.load_dict`` from ``resources/first_name.dict``,
        using the shared tokenizer.
        """
        return pt.load_dict("resources/first_name.dict", self.Tokenizer)

    @memoized_property
    def GlobalLastNameDict(self):
        """
        Dictionary loaded by ``pt.load_dict`` from ``resources/last_name.dict``,
        using the shared tokenizer.
        """
        return pt.load_dict("resources/last_name.dict", self.Tokenizer)

    ############################
    # REGEXES

    @memoized_property
    def CapsWordRegex(self):
        """
        Compiled pattern: one uppercase ASCII letter followed by zero or more
        lowercase ASCII letters.
        """
        return regex.compile("[A-Z][a-z]*")
    

class Document:
    """
    Rules that define the fields of the current document.

    The document here is one string; every field is derived from it.
    """
    def __init__(self, doc_text: str, resources: Resources):
        # Raw document text, exposed through the Text property.
        self._text = doc_text
        # Shared load-once resources; Tokens takes its tokenizer from here.
        self._resources = resources

    @property
    def Text(self):
        """The document text that was passed to the constructor."""
        return self._text

    @memoized_property
    def Tokens(self):
        """
        Result of ``pt.make_tokens`` on Text with the shared tokenizer.
        Computed on first access, then cached on this instance.
        """
        text = self.Text
        tokenizer = self._resources.Tokenizer
        return pt.make_tokens(text, tokenizer)

In [4]:
class Dictionaries:
    """
    Rules that evaluate dictionaries against the document's raw tokens.

    Each rule is cached after its first evaluation.
    """
    def __init__(self, d: Document, resources: Resources):
        # Source of the dictionaries to evaluate.
        self._resources = resources
        # Document whose tokens the dictionaries are evaluated against.
        self._d = d

    @memoized_property
    def GlobalFirstName(self):
        """Result of ``pt.extract_dict`` for the first-name dictionary."""
        tokens = self._d.Tokens
        first_names = self._resources.GlobalFirstNameDict
        return pt.extract_dict(tokens, first_names)

    @memoized_property
    def GlobalLastName(self):
        """Result of ``pt.extract_dict`` for the last-name dictionary."""
        tokens = self._d.Tokens
        last_names = self._resources.GlobalLastNameDict
        return pt.extract_dict(tokens, last_names)

class Regexes:
    """
    Rules that evaluate regular expressions against the document's raw tokens.
    """
    def __init__(self, d: Document, resources: Resources):
        # Source of the compiled regexes.
        self._resources = resources
        # Document supplying the text and token offsets.
        self._d = d

    @property
    def CapsWord(self):
        """
        A single token that starts with a capital letter, with subsequent letters not
        capitalized.

        This is a plain property, so the extraction is re-run on every access.
        """
        offsets = self._d.Tokens["char_offsets"]
        text = self._d.Text
        caps_regex = self._resources.CapsWordRegex
        return pt.extract_regex_tok(
            token_offsets=offsets,
            target_str=text,
            compiled_regex=caps_regex,
        )
    


In [5]:
class PersonName:
    """
    Rules that extract potential person name entities.

    Both rules are plain properties, so each access re-runs the join.
    """
    def __init__(self, doc: Document, dicts: Dictionaries, regexes: Regexes):
        self._doc = doc
        self._dicts = dicts
        self._regexes = regexes

    @staticmethod
    def _adjacent_name_pair(first, last):
        """
        Join the "matches" of `first` with the "matches" of `last` through
        ``pt.adjacent_join``, labelling them "first_name" and "last_name".

        NOTE(review): min_gap = max_gap = 0 presumably means the two matches
        must be directly adjacent -- confirm against pt.adjacent_join.
        """
        return pt.adjacent_join(
            first_series=first["matches"],
            second_series=last["matches"],
            first_name="first_name",
            second_name="last_name",
            min_gap=0,
            max_gap=0,
        )

    @property
    def Person1(self):
        """
        <match of GlobalFirstName dict> <match of GlobalLastName dict>
        """
        return self._adjacent_name_pair(
            self._dicts.GlobalFirstName,
            self._dicts.GlobalLastName,
        )

    @property
    def Person2(self):
        """
        <match of GlobalFirstName dict> <capitalized word>
        """
        return self._adjacent_name_pair(
            self._dicts.GlobalFirstName,
            self._regexes.CapsWord,
        )



In [6]:
# Build the shared resources first, then the document and the rule sets
# that are evaluated against it.
resources = Resources()
doc = Document(doc_text=TEST_TEXT, resources=resources)
dicts = Dictionaries(d=doc, resources=resources)
regexes = Regexes(d=doc, resources=resources)
persons = PersonName(doc=doc, dicts=dicts, regexes=regexes)

In [7]:
# Display the first-name dictionary matches found in the document.
dicts.GlobalFirstName

  result = result.to_dense()


Unnamed: 0,matches
0,"[173, 174)"
1,"[283, 284)"
2,"[289, 290)"
3,"[299, 300)"
4,"[279, 280)"
5,"[4, 5)"
6,"[122, 123)"
7,"[269, 270)"
8,"[9, 10)"
9,"[286, 287)"


In [None]:
# Wrap the document's tokens in a Series, then list that Series' index labels
# in a one-column frame under "token_id".
tokseries = pd.Series(doc.Tokens)
tokdf = pd.DataFrame({"token_id": tokseries.index})
tokdf

In [None]:
# Inspect the index of the token Series.
tokseries.index

In [None]:
# Plain-text repr of tokdf, as a string rather than the rich notebook display.
repr(tokdf)

In [None]:
# Peek at the first 20 entries of the document's tokens.
doc.Tokens[0:20]

In [None]:
# Display the capitalized-word regex matches. The Regexes instance in this
# notebook is bound to `regexes`; `regeTokens` is never defined and would
# raise NameError on a fresh kernel.
regexes.CapsWord

In [None]:
# Candidate names: first-name dictionary match joined with a last-name dictionary match.
persons.Person1

In [None]:
# Candidate names: first-name dictionary match joined with a capitalized-word match.
persons.Person2

In [None]:
# Show the pandas version in use by this kernel.
pd.__version__