In [1]:
# Dependencies go in this cell.
# The script env.sh in this directory should create an Anaconda environment with
# all these dependencies installed

# Python built-in packages
import functools
import importlib
from typing import *

# Libraries
import numpy as np
import pandas as pd
import regex
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# TEMPORARY until we can use Python 3.8 functools' built-in memoized property
from memoized_property import memoized_property

# Document that the rest of the notebook runs its rules against.
# To use the longer example document instead, enable the first assignment
# and disable the second.
#TEST_DOC_FILE = "resources/example_doc.txt"
TEST_DOC_FILE = "resources/short_example_doc.txt"

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so the decoded text could differ from one machine to the next.
with open(TEST_DOC_FILE, "r", encoding="utf-8") as f:
    TEST_TEXT = f.read()


# [re]import our local library code. Usually doesn't work, but worth a try.
# importlib.reload() re-executes the already-imported module and returns it;
# as the last expression of the cell, that module object is what gets displayed.
import pandas_text as pt
importlib.reload(pt)

<module 'pandas_text' from '/Users/freiss/pd/pandas_text/__init__.py'>

In [2]:
class Resources:
    """
    Data structures that are loaded once, as opposed to recreated on
    every document.

    This category includes tokenizers, dictionaries, and compiled regexes.

    Everything in this class is a cached property: the value is computed on
    first access and reused on later accesses of the same instance. The cached
    properties all have capitalized names, which is how `preload_all` finds
    them.
    """

    def preload_all(self):
        """
        Preload all cached values in this class so it won't be necessary to
        load them lazily later on.

        Every attribute whose name starts with an upper-case letter is read
        once; reading a cached property is what triggers its computation.
        """
        for name in dir(self):
            if name[0].isupper():
                getattr(self, name)

    @memoized_property
    def LanguageModel(self):
        """
        :return: the spaCy pipeline loaded from the "en_core_web_sm" package.
        """
        return spacy.load("en_core_web_sm")

    @memoized_property
    def Tokenizer(self):
        """
        :return: a spaCy tokenizer that goes with `LanguageModel`.
        """
        nlp = self.LanguageModel
        create_tokenizer = getattr(nlp.Defaults, "create_tokenizer", None)
        if create_tokenizer is not None:
            # spaCy 2.x: build a tokenizer from the language defaults.
            return create_tokenizer(nlp)
        # spaCy 3.x no longer has Defaults.create_tokenizer; fall back to the
        # tokenizer that the loaded pipeline already carries.
        return nlp.tokenizer

    ############################
    # DICTIONARIES

    @memoized_property
    def GlobalFirstNameDict(self):
        """
        :return: the first-name dictionary, read from its Parquet file.
        """
        # Use the "pre-compiled" Parquet version of the dictionary, which we
        # produced by running:
        #    pt.load_dict("resources/first_name.dict", self.Tokenizer) \
        #      .to_parquet("resources/first_name.dict.parquet")
        return pd.read_parquet("resources/first_name.dict.parquet")

    @memoized_property
    def GlobalLastNameDict(self):
        """
        :return: the last-name dictionary, read from its Parquet file.
        """
        return pd.read_parquet("resources/last_name.dict.parquet")

    ############################
    # REGEXES

    @memoized_property
    def CapsWordRegex(self):
        """
        :return: compiled pattern for one upper-case ASCII letter followed by
            zero or more lower-case ASCII letters.
        """
        return regex.compile("[A-Z][a-z]*")
       
# Build the shared resources once and load all of them up front, so that
# later cells do not pay the loading cost on first access.
resources = Resources()
resources.preload_all()
# Debugging aids for inspecting the first-name dictionary; left disabled.
#resources.GlobalFirstNameDict[~resources.GlobalFirstNameDict["toks_1"].isna()]
#resources.GlobalFirstNameDict

In [3]:
class Document:
    """
    Holds the fields of the document currently being processed.

    Here a document is just one string of text; the remaining fields are
    derived from that string.
    """
    def __init__(self, doc_text: str, resources: Resources):
        self._text = doc_text
        self._resources = resources

    @property
    def Text(self):
        """
        :return: the document text that was passed to the constructor.
        """
        return self._text

    @memoized_property
    def TokenFeatures(self):
        """
        :return: the result of `pt.make_tokens_and_features` for this
            document's text and the shared language model. Computed on first
            access and cached afterwards.
        """
        doc_text = self._text
        language_model = self._resources.LanguageModel
        return pt.make_tokens_and_features(doc_text, language_model)

    @memoized_property
    def Sentence(self):
        """
        :return: a one-column DataFrame ("sentence") holding the distinct
            values of the "sentence" column of `TokenFeatures`.
        """
        distinct_sentences = self.TokenFeatures["sentence"].unique()
        return pd.DataFrame({"sentence": distinct_sentences})

    @property
    def Tokens(self):
        """
        :return: tokens as a `pd.Series` backed by a `CharSpanArray`.
        """
        token_features = self.TokenFeatures
        return token_features["char_span"]
    
    

In [4]:
# Create an example Document object and show its per-token features as a DataFrame
doc = Document(TEST_TEXT, resources)
doc.TokenFeatures

Unnamed: 0,id,char_span,token_span,lemma,pos,tag,dep,head,shape,is_alpha,is_stop,sentence
0,0,"[0, 4): 'Some'","[0, 1): 'Some'",some,DET,DT,det,1,Xxxx,True,True,"[0, 4): 'Some example names:'"
1,1,"[5, 12): 'example'","[1, 2): 'example'",example,NOUN,NN,nsubj,2,xxxx,True,False,"[0, 4): 'Some example names:'"
2,2,"[13, 18): 'names'","[2, 3): 'names'",name,NOUN,NNS,ROOT,2,xxxx,True,False,"[0, 4): 'Some example names:'"
3,3,"[18, 19): ':'","[3, 4): ':'",:,PUNCT,:,punct,2,:,False,False,"[0, 4): 'Some example names:'"
4,4,"[20, 23): 'Joe'","[4, 5): 'Joe'",Joe,PROPN,NNP,compound,7,Xxx,True,False,"[4, 15): 'Joe Van Der Wals, Mary Van Trapp  B..."
...,...,...,...,...,...,...,...,...,...,...,...,...
334,334,"[1656, 1657): ','","[334, 335): ','",",",PUNCT,",",punct,324,",",False,False,"[283, 339): 'Bruce Babbitt, Mike Espy, Dan Gli..."
335,335,"[1658, 1662): 'Bill'","[335, 336): 'Bill'",Bill,PROPN,NNP,compound,336,Xxxx,True,False,"[283, 339): 'Bruce Babbitt, Mike Espy, Dan Gli..."
336,336,"[1663, 1667): 'Daly'","[336, 337): 'Daly'",Daly,PROPN,NNP,appos,324,Xxxx,True,False,"[283, 339): 'Bruce Babbitt, Mike Espy, Dan Gli..."
337,337,"[1667, 1668): '.'","[337, 338): '.'",.,PUNCT,.,punct,314,.,False,False,"[283, 339): 'Bruce Babbitt, Mike Espy, Dan Gli..."


### `Document.Sentence`: spans from the "sentence" column in `doc.TokenFeatures`
```python
class Document:
    [...]
    @memoized_property
    def Sentence(self):
        return pd.DataFrame({"sentence": self.TokenFeatures["sentence"].unique()})
```

In [5]:
# Show the distinct sentence spans found in doc.TokenFeatures, one per row
doc.Sentence

Unnamed: 0,sentence
0,"[0, 4): 'Some example names:'"
1,"[4, 15): 'Joe Van Der Wals, Mary Van Trapp  B..."
2,"[15, 27): 'Well, we'll debate that later on in..."
3,"[27, 47): 'We'll have a couple of experts come..."
4,"[47, 94): 'Even as the secretary of homeland s..."
5,"[94, 120): 'Now, why has our president placed ..."
6,"[120, 133): 'Attorney General John Ashcroft, f..."
7,"[133, 143): 'He lost an election to a dead man.'"
8,"[143, 167): 'Secretary of Homeland Security To..."
9,"[167, 192): 'And Deputy Secretary of Homeland ..."


In [6]:
# Contents of the seventh sentence in the document: keep only the token rows
# whose "sentence" span equals the entry labeled 6 in the sentence table.
token_features = doc.TokenFeatures
in_seventh_sentence = token_features["sentence"] == doc.Sentence["sentence"][6]
seventh_sentence = token_features[in_seventh_sentence]
seventh_sentence

Unnamed: 0,id,char_span,token_span,lemma,pos,tag,dep,head,shape,is_alpha,is_stop,sentence
120,120,"[564, 572): 'Attorney'","[120, 121): 'Attorney'",Attorney,PROPN,NNP,compound,121,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
121,121,"[573, 580): 'General'","[121, 122): 'General'",General,PROPN,NNP,compound,123,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
122,122,"[581, 585): 'John'","[122, 123): 'John'",John,PROPN,NNP,compound,123,Xxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
123,123,"[586, 594): 'Ashcroft'","[123, 124): 'Ashcroft'",Ashcroft,PROPN,NNP,nsubj,128,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
124,124,"[594, 595): ','","[124, 125): ','",",",PUNCT,",",punct,128,",",False,False,"[120, 133): 'Attorney General John Ashcroft, f..."
125,125,"[596, 599): 'for'","[125, 126): 'for'",for,ADP,IN,prep,128,xxx,True,True,"[120, 133): 'Attorney General John Ashcroft, f..."
126,126,"[600, 607): 'example'","[126, 127): 'example'",example,NOUN,NN,pobj,125,xxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
127,127,"[607, 608): ','","[127, 128): ','",",",PUNCT,",",punct,128,",",False,False,"[120, 133): 'Attorney General John Ashcroft, f..."
128,128,"[609, 611): 'is'","[128, 129): 'is'",be,AUX,VBZ,ROOT,128,xx,True,True,"[120, 133): 'Attorney General John Ashcroft, f..."
129,129,"[612, 613): 'a'","[129, 130): 'a'",a,DET,DT,det,131,x,True,True,"[120, 133): 'Attorney General John Ashcroft, f..."


In [7]:
# Show the dependency parse of the seventh sentence, using the token rows
# selected above
pt.render_parse_tree(seventh_sentence)

In [8]:
# Pretty-print the tokens in the sentence: `.values` exposes the array behind
# the "token_span" column, which displays as one row per span
seventh_sentence["token_span"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,564,572,120,121,Attorney
1,573,580,121,122,General
2,581,585,122,123,John
3,586,594,123,124,Ashcroft
4,594,595,124,125,","
5,596,599,125,126,for
6,600,607,126,127,example
7,607,608,127,128,","
8,609,611,128,129,is
9,612,613,129,130,a


In [9]:
# Use a Gremlin query to find all compound nouns in the sentence:
# take every token whose "dep" is "compound" (labeled "src"), step along its
# outgoing edges (labeled "dest"), and report each pair as token spans.
g = pt.token_features_to_traversal(seventh_sentence)
compound_edges = g.V().has("dep", "compound").as_("src").out().as_("dest")
compound_nouns = compound_edges.select("src", "dest").by("token_span").toDataFrame()
compound_nouns

Unnamed: 0,src,dest
0,"[120, 121): 'Attorney'","[121, 122): 'General'"
1,"[121, 122): 'General'","[123, 124): 'Ashcroft'"
2,"[122, 123): 'John'","[123, 124): 'Ashcroft'"
3,"[130, 131): 'career'","[131, 132): 'politician'"


In [10]:
# Filter down the seventh sentence to just the tokens that take part in compound nouns

# Stack the columns of compound_nouns on top of each other and keep each span once.
endpoint_columns = [compound_nouns[col_name] for col_name in compound_nouns]
all_tokens_df = pd.DataFrame({"token_span": pd.concat(endpoint_columns).unique()})

# Join on the shared column, then index the result by token id while keeping
# "id" available as a regular column as well.
compound_noun_tokens = seventh_sentence.merge(all_tokens_df).set_index("id", drop=False)
compound_noun_tokens

Unnamed: 0_level_0,id,char_span,token_span,lemma,pos,tag,dep,head,shape,is_alpha,is_stop,sentence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
120,120,"[564, 572): 'Attorney'","[120, 121): 'Attorney'",Attorney,PROPN,NNP,compound,121,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
121,121,"[573, 580): 'General'","[121, 122): 'General'",General,PROPN,NNP,compound,123,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
122,122,"[581, 585): 'John'","[122, 123): 'John'",John,PROPN,NNP,compound,123,Xxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
123,123,"[586, 594): 'Ashcroft'","[123, 124): 'Ashcroft'",Ashcroft,PROPN,NNP,nsubj,128,Xxxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
130,130,"[614, 620): 'career'","[130, 131): 'career'",career,NOUN,NN,compound,131,xxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."
131,131,"[621, 631): 'politician'","[131, 132): 'politician'",politician,NOUN,NN,attr,128,xxxx,True,False,"[120, 133): 'Attorney General John Ashcroft, f..."


In [11]:
# Render the partial parse trees of just those tokens, i.e. the rows of the
# seventh sentence that take part in a compound noun
pt.render_parse_tree(compound_noun_tokens)

In [12]:
# Build some business rules that define some text features.
# The rules are organized into Python classes.
# The output of each rule is a Pandas DataFrame.

class Dictionaries:
    """
    Dictionary-matching rules, evaluated over the raw tokens of one document.

    Each rule is a cached property, so a dictionary is matched against the
    document at most once per instance.
    """
    def __init__(self, d: Document, resources: Resources):
        self._d = d
        self._resources = resources

    @memoized_property
    def GlobalFirstName(self):
        """
        :return: result of `pt.extract_dict` for the document's tokens and the
            global first-name dictionary.
        """
        doc_tokens = self._d.Tokens
        first_names = self._resources.GlobalFirstNameDict
        return pt.extract_dict(doc_tokens, first_names)

    @memoized_property
    def GlobalLastName(self):
        """
        :return: result of `pt.extract_dict` for the document's tokens and the
            global last-name dictionary.
        """
        doc_tokens = self._d.Tokens
        last_names = self._resources.GlobalLastNameDict
        return pt.extract_dict(doc_tokens, last_names)

class Regexes:
    """
    Regular-expression rules, evaluated over the raw tokens of one document.
    """
    def __init__(self, d: Document, resources: Resources):
        self._d = d
        self._resources = resources

    @property
    def CapsWord(self):
        """
        A single token that starts with a capital letter, with subsequent
        letters not capitalized.

        This is a plain property, so the extraction runs again on each access.
        """
        doc_tokens = self._d.Tokens
        caps_word_regex = self._resources.CapsWordRegex
        return pt.extract_regex_tok(tokens=doc_tokens, compiled_regex=caps_word_regex)
    

class Morphology:
    """
    Rules that pick out tokens based on shallow linguistic features.
    """
    def __init__(self, d: Document):
        self._d = d

    @property
    def ProperNounToken(self):
        """
        Tokens that the part of speech tagger tagged as proper nouns.

        Concretely: the "token_span" values of the rows whose "tag" is "NNP",
        returned in a DataFrame with the single column "match".
        """
        feats = self._d.TokenFeatures
        token_spans = feats["token_span"]
        is_proper_noun = feats["tag"] == "NNP"
        return pd.DataFrame({"match": token_spans[is_proper_noun]})



In [13]:
# Show the tokens labeled as proper nouns.
# `doc` is rebound to a fresh Document for the same text, and the morphology
# rules are built on top of it.
doc = Document(TEST_TEXT, resources)
morph = Morphology(doc)
morph.ProperNounToken

Unnamed: 0,match
4,"[4, 5): 'Joe'"
5,"[5, 6): 'Van'"
6,"[6, 7): 'Der'"
7,"[7, 8): 'Wals'"
9,"[9, 10): 'Mary'"
...,...
319,"[319, 320): 'Chairman'"
320,"[320, 321): 'Ron'"
321,"[321, 322): 'Brown'"
335,"[335, 336): 'Bill'"


In [14]:
# Pretty-print the spans in ProperNounToken: `.values` exposes the array
# behind the "match" column, which displays as one row per span
morph.ProperNounToken["match"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,20,23,4,5,Joe
1,24,27,5,6,Van
2,28,31,6,7,Der
3,32,36,7,8,Wals
4,38,42,9,10,Mary
5,43,46,10,11,Van
6,47,52,11,12,Trapp
7,55,61,13,14,BEGALA
8,203,212,51,52,secretary
9,216,224,53,54,homeland


In [15]:
# Write some additional business rules that define a person extractor.
# Note the use of a Python method to avoid duplicate code in the rules.
    
class PersonName:
    """
    Rules that extract potential person name entities.

    Every rule pairs a source of first-name candidates with a source of
    last-name candidates through `first_last_name`.
    """
    def __init__(self, doc: Document, dicts: Dictionaries, regexes: Regexes,
                 morphology: Morphology):
        self._doc = doc
        self._dicts = dicts
        self._regexes = regexes
        self._morphology = morphology

    @staticmethod
    def first_last_name(first: pd.DataFrame, last: pd.DataFrame):
        """
        Shared <first name> <last name> pattern match used by the rules below.

        :param first: DataFrame of first names; the spans are read from its
            "match" column.

        :param last: DataFrame of last names; the spans are read from its
            "match" column.

        :returns: A DataFrame with all <first name> <last name> matches,
            including the columns "first_name", "last_name", and "name"
            (span that covers both first and last names)
        """
        first_spans = first["match"]
        last_spans = last["match"]
        pairs = pt.adjacent_join(
            first_series=first_spans,
            second_series=last_spans,
            first_name="first_name",
            second_name="last_name")
        # Add the span that stretches over both halves of each pair.
        pairs["name"] = pt.combine_spans(pairs["first_name"], pairs["last_name"])
        return pairs

    @property
    def Person1(self):
        """
        <match of GlobalFirstName dict> <match of GlobalLastName dict>
        """
        first_names = self._dicts.GlobalFirstName
        last_names = self._dicts.GlobalLastName
        return PersonName.first_last_name(first_names, last_names)

    @property
    def Person2(self):
        """
        <match of GlobalFirstName dict> <capitalized word>
        """
        first_names = self._dicts.GlobalFirstName
        caps_words = self._regexes.CapsWord
        return PersonName.first_last_name(first_names, caps_words)

    @property
    def Person3(self):
        """
        <token labeled as proper noun> <match of GlobalLastName dict>
        """
        proper_nouns = self._morphology.ProperNounToken
        last_names = self._dicts.GlobalLastName
        return PersonName.first_last_name(proper_nouns, last_names)


In [16]:
# Instantiate our rules for a document.
# All rule objects are built around the same Document, and PersonName is
# handed the other three rule sets.
doc = Document(TEST_TEXT, resources)
dicts = Dictionaries(doc, resources)
regexes = Regexes(doc, resources)
morph = Morphology(doc)
persons = PersonName(doc, dicts, regexes, morph)

In [17]:
# Show one of the output DataFrames:
# <token labeled as proper noun> <match of GlobalLastName dict>
persons.Person3

Unnamed: 0,first_name,last_name,name
0,"[4, 5): 'Joe'","[5, 8): 'Van Der Wals'","[4, 8): 'Joe Van Der Wals'"
1,"[9, 10): 'Mary'","[10, 12): 'Van Trapp'","[9, 12): 'Mary Van Trapp'"
2,"[10, 11): 'Van'","[11, 12): 'Trapp'","[10, 12): 'Van Trapp'"
3,"[173, 174): 'Asa'","[174, 175): 'Hutchinson'","[173, 175): 'Asa Hutchinson'"
4,"[276, 277): 'S.'","[277, 278): 'Cohen'","[276, 278): 'S. Cohen'"
5,"[295, 296): 'Henry'","[296, 297): 'Cisneros'","[295, 297): 'Henry Cisneros'"
6,"[299, 300): 'Federico'","[300, 301): 'Pena'","[299, 301): 'Federico Pena'"
7,"[302, 303): 'Bill'","[303, 304): 'Richardson'","[302, 304): 'Bill Richardson'"
8,"[305, 306): 'Richard'","[306, 307): 'Riley'","[305, 307): 'Richard Riley'"
9,"[335, 336): 'Bill'","[336, 337): 'Daly'","[335, 337): 'Bill Daly'"


In [18]:
# Show a detailed view of the "name" column of the above DataFrame.
# Person3 is a plain property, so this access evaluates the rule again.
persons.Person3["name"].values

Unnamed: 0,begin,end,begin_token,end_token,covered_text
0,20,36,4,8,Joe Van Der Wals
1,38,52,9,12,Mary Van Trapp
2,43,52,10,12,Van Trapp
3,837,851,173,175,Asa Hutchinson
4,1385,1393,276,278,S. Cohen
5,1462,1476,295,297,Henry Cisneros
6,1478,1491,299,301,Federico Pena
7,1493,1508,302,304,Bill Richardson
8,1510,1523,305,307,Richard Riley
9,1658,1667,335,337,Bill Daly
