### Useful Links
Information extraction 
<ul>
    <li><a href="https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da">NLTK</a></li>
    <li><a href="https://www.analyticsvidhya.com/blog/2020/06/nlp-project-information-extraction/">Finding Patterns</a></li>
    <li><a href="https://www.analyticsvidhya.com/blog/2019/09/introduction-information-extraction-python-spacy/?utm_source=blog&utm_medium=nlp-project-information-extraction">Hearst Patterns</a></li>
</ul>

In [1]:
# perform standard imports
from io import StringIO

import numpy as np
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from spacy import displacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Span

# load the `en_core_web_sm` pipeline once; the cells below call `nlp(...)` on text to build Doc objects
nlp = spacy.load('en_core_web_sm')

In [2]:
# run a short example sentence through the pipeline to get a Doc
doc = nlp("The children love cream biscuits")

### Part of Speech (POS)
There are eight different parts of speech (POS) in the English language: noun, pronoun, verb, adjective, adverb, preposition, conjunction, and interjection.

In [3]:
# one row per token: text, coarse POS tag, fine-grained tag, and the tag's description
for token in doc:
    print(
        f'{token.text:10} {token.pos_:8} {token.tag_:6} '
        f'{spacy.explain(token.tag_)}'
    )

The        DET      DT     determiner
children   NOUN     NNS    noun, plural
love       VERB     VBP    verb, non-3rd person singular present
cream      NOUN     NN     noun, singular or mass
biscuits   NOUN     NNS    noun, plural


In [4]:
# print only the tokens whose coarse POS tag is NOUN
for token in doc:
    # skip everything that is not a noun
    if token.pos_ != 'NOUN':
        continue
    print(token.text)

children
cream
biscuits


### Coarse-grained Part-of-speech Tags
Every token is assigned a POS Tag from the following list:


<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>
    
<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>
<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>
<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>
<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>
<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>
<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>
<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>
<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>
<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>
<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>
<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>
<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>
<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>
<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>
<tr><td>SPACE</td><td>space</td><td></td></tr>
</table>

___
### Fine-grained Part-of-speech Tags
Tokens are subsequently given a fine-grained tag as determined by morphology:
<table>
<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>
<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>
<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>
<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>
<tr><td>ADJ</td><td></td><td>PRP\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>
<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>
<tr><td>ADJ</td><td></td><td>WP\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>
<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>
<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>
<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>
<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>
<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>
<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>
<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>
<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>
<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>
<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>
<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>
<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>
<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sing</td></tr>
<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>
<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>
<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>
<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>""</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>
<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>
<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>
<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>
<tr><td>SYM</td><td></td><td>\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>
<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>
<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary "be"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>HVS</td><td>forms of "have"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>
<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>
<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>
<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>
<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>
<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>
<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>
<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>
<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>
<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>
<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>
<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>
<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>
</table>

### Dependency Graph
The arrows carry a lot of significance:
<ul>
    <li>Each arrow starts at a head word and its arrowhead points to a word that depends on that head</li>
    <li>The dependent word is referred to as the child node of the head word. For example, “children” is the child node of “love”</li>
    <li>The word which has no incoming arrow is called the root node of the sentence</li>
</ul>

In [5]:
# draw the dependency parse inline in the notebook;
# the 'distance' option controls the spacing between words
displacy.render(
    doc,
    style='dep',
    options={'distance': 110},
    jupyter=True,
)

In [6]:
# print the nominal subject ('nsubj') and direct object ('dobj') tokens, in sentence order
for token in doc:
    if token.dep_ in ('nsubj', 'dobj'):
        print(token.text)

children
biscuits


### Named Entity Recognition (NER)


In [7]:
# function to display basic entity info:
def show_ents(doc):
    """Print one line per named entity in `doc`: "text - label - explanation".

    The explanation comes from `spacy.explain` applied to the entity label.
    When `doc.ents` is empty, prints 'No named entities found.' instead.
    """
    if not doc.ents:
        print('No named entities found.')
        return
    for ent in doc.ents:
        print(f'{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}')

In [8]:
# parse a sentence in which "May" and "Washington" each occur twice, then list its entities
doc = nlp('May I go to Washington, DC next May to see the Washington Monument?')

show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


### NER Tags
Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

### Visualising Named Entities

In [9]:
# parse the example text and bind it to `doc`; without the assignment the loop
# below would iterate over whatever `doc` an earlier cell left behind
doc = nlp(u'credits will be allocated on 1 Jan of every year '
          u'carried forward up to one credit year.')

# alternative example text (disabled):
# doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
#           u'By contrast, Sony sold only 7 thousand Walkman music players.')

# handle the text one sentence at a time: each sentence is re-parsed on its own,
# rendered with entity highlighting if it has entities, otherwise printed as plain text
for sent in doc.sents:
    docx = nlp(sent.text)
    if docx.ents:
        displacy.render(docx, style='ent', jupyter=True)
    else:
        print(docx.text)

### Pattern Matching

In [10]:
# parse the example sentence used for pattern matching
doc = nlp('credits will be allocated on 1 Jan of every year '
          'carried forward up to one credit year.')

# one row per token: text, coarse POS tag, dependency label, fine-grained tag description
for token in doc:
    print(
        f'{token.text:10} {token.pos_:8} {token.dep_:6} '
        f'{spacy.explain(token.tag_)}'
    )

credits    NOUN     nsubjpass noun, plural
will       VERB     aux    verb, modal auxiliary
be         AUX      auxpass verb, base form
allocated  VERB     ROOT   verb, past participle
on         ADP      prep   conjunction, subordinating or preposition
1          NUM      nummod cardinal number
Jan        PROPN    pobj   noun, proper singular
of         ADP      prep   conjunction, subordinating or preposition
every      DET      det    determiner
year       NOUN     pobj   noun, singular or mass
carried    VERB     advcl  verb, past participle
forward    ADV      advmod adverb
up         ADP      prep   conjunction, subordinating or preposition
to         ADP      prep   conjunction, subordinating or preposition
one        NUM      nummod cardinal number
credit     NOUN     compound noun, singular or mass
year       NOUN     pobj   noun, singular or mass
.          PUNCT    punct  punctuation mark, sentence closer


In [11]:
# entity patterns: the first entry matches the exact string "church",
# the second matches a three-token sequence described attribute by attribute
pattern = [
    {"label": "NOUN", "pattern": "church"},
    {
        "label": "ORG",
        "pattern": [
            {"lower": "the"},
            {"lower": {"IN": ["first", "second", "third"]}},
            {"ORTH": "Estate"},
        ],
    },
]

'''
# the key ‘OP’: ‘?’ in the pattern means that the modifier ('amod') can occur once or not at all.
advancedPattern = [{'DEP':'amod', 'OP':"?"}, # adjectival modifier
                   {'POS':'NOUN'},
                   {'LOWER': 'such'},
                   {'LOWER': 'as'},
                   {'POS': 'PROPN'}]
'''

'\n# the key ‘OP’: ‘?’ in the pattern means that the modifier (\'amod\') can occur once or not at all.\nadvancedPattern = [{\'DEP\':\'amod\', \'OP\':"?"}, # adjectival modifier\n                   {\'POS\':\'NOUN\'},\n                   {\'LOWER\': \'such\'},\n                   {\'LOWER\': \'as\'},\n                   {\'POS\': \'PROPN\'}]\n'

In [12]:
# instantiate an object of EntityRuler class
ruler = EntityRuler(nlp)

# add the patterns to the ruler
ruler.add_patterns(pattern)

# add the ruler as a new pipe to the model
nlp.add_pipe(ruler)

# re-run the pipeline on the same text: `doc` was parsed before the ruler was
# added, so its entities cannot reflect the new pipe until it is processed again
doc = nlp(doc.text)

# print the entities in the sentence after adding the EntityRuler
print([(ent.text, ent.label_) for ent in doc.ents])

'''
# matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", None, pattern) 

matches = matcher(doc)
for match in matches:
    span = doc[match[1]:match[2]] 
    print(span.text)
'''

[('1 Jan of every year', 'DATE'), ('one credit year', 'DATE')]


'\n# matcher class object \nmatcher = Matcher(nlp.vocab) \nmatcher.add("matching_1", None, pattern) \n\nmatches = matcher(doc)\nfor match in matches:\n    span = doc[match[1]:match[2]] \n    print(span.text)\n'

In [13]:
# disabled Matcher example kept as a bare string literal; because the string is
# the cell's last expression, the notebook echoes it back as the cell output
'''
# matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_2", None, advancedPattern) 

matches = matcher(doc)
for match in matches:
    span = doc[match[1]:match[2]] 
    print(span.text)
'''

'\n# matcher class object \nmatcher = Matcher(nlp.vocab) \nmatcher.add("matching_2", None, advancedPattern) \n\nmatches = matcher(doc)\nfor match in matches:\n    span = doc[match[1]:match[2]] \n    print(span.text)\n'

### Reading HTML

In [14]:
# read the whole HTML document into memory, closing the file right after
with open('htmlWithTables.txt') as txt:
    htmlText = txt.read()

# parse the markup with the lxml parser
soup = BeautifulSoup(htmlText, 'lxml')

In [15]:
# print the text of the first <h2>, <head> and <li> element, in that order
for tag_name in ('h2', 'head', 'li'):
    print(getattr(soup, tag_name).text)

Content

Header


Contract Staff


In [16]:
# print "<tag name>: <text>" for every <li> element in the document
for tag in soup.find_all('li'):
    print('{}: {}'.format(tag.name, tag.text))

li: Contract Staff
li: Temporary Staff


In [17]:
# convert the HTML tables into DataFrames; read_html returns a list with one
# DataFrame per table, and header=0 uses the first row as column names.
# The markup is wrapped in StringIO because passing literal HTML as a plain
# string is deprecated in recent pandas releases.
df = pd.read_html(StringIO(htmlText), header=0)

# first table found in the document
firstTable = df[0]
print(firstTable)

print("\n")

# first row of that table
print(firstTable.loc[0])

   Tier                   Length       Credits
0     1  0 to 1 years of service     1 credits
1     2  2 to 3 years of service    10 credits
2     3  4 to 5 years of service   100 credits
3     4  6 to 7 years of service  1000 credits


Tier                             1
Length     0 to 1 years of service
Credits                  1 credits
Name: 0, dtype: object


In [18]:
# create a Doc object from the contents of the HTML file
# NOTE(review): `htmlText` is the raw file content, markup included; if only the
# visible text should be analysed, `soup.get_text()` may be intended — confirm
doc = nlp(htmlText)