In [1]:
%%time
import spacy
nlp = spacy.load("en_core_web_trf")
nlp.add_pipe("merge_entities")

CPU times: user 5.94 s, sys: 2.21 s, total: 8.16 s
Wall time: 11.7 s


<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [2]:
tags = ['$', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX']
for tag in tags:
    print(tag, '=>', spacy.explain(str(tag)))

$ => symbol, currency
-LRB- => left round bracket
-RRB- => right round bracket
. => punctuation mark, sentence closer
: => punctuation mark, colon or ellipsis
ADD => email
AFX => affix
CC => conjunction, coordinating
CD => cardinal number
DT => determiner
EX => existential there
FW => foreign word
HYPH => punctuation mark, hyphen
IN => conjunction, subordinating or preposition
JJ => adjective (English), other noun-modifier (Chinese)
JJR => adjective, comparative
JJS => adjective, superlative
LS => list item marker
MD => verb, modal auxiliary
NFP => superfluous punctuation
NN => noun, singular or mass
NNP => noun, proper singular
NNPS => noun, proper plural
NNS => noun, plural
PDT => predeterminer
POS => possessive ending
PRP => pronoun, personal
PRP$ => pronoun, possessive
RB => adverb
RBR => adverb, comparative
RBS => adverb, superlative
RP => adverb, particle
SYM => symbol
TO => infinitival "to"
UH => interjection
VB => verb, base form
VBD => verb, past tense
VBG => verb, gerund or p

In [3]:
spacy.explain("advcl")

'adverbial clause modifier'

In [4]:
%%time

from spacy.tokens import Token
from spacy.symbols import NOUN, PROPN, PRON, ADJ, ADV, ADP, VERB


def get_left_edge(word):
    
    nouns = [NOUN, PROPN, PRON]
    if word.pos not in nouns:
        return word.i
    
    if word.left_edge.pos == PRON:
        return word.i
    
    return word.left_edge.i

def get_right_edge(word):
    
    nouns = [NOUN, PROPN, PRON]
    if word.pos not in nouns:
        return word.i
    
    next_token = next(word.rights, False)
    
    if next_token and next_token.pos in (ADJ, ADV):
        return next_token.i
    
    if next_token and next_token.pos == ADP:   
        for right_token in next_token.rights:
            if right_token.dep_ == "pobj" and right_token.head.i == next_token.i:
                return right_token.i
            
    return word.i

Token.set_extension("get_left_edge", getter=get_left_edge, force=True)
Token.set_extension("get_right_edge", getter=get_right_edge, force=True)
print(Token.has_extension("get_left_edge"))
print(Token.has_extension("get_right_edge"))

True
True
CPU times: user 136 µs, sys: 40 µs, total: 176 µs
Wall time: 174 µs


In [5]:
from LoadText import load_text

bush_filename = "20010920-Address to Joint Session of Congress Following 911 Attacks.txt"
laden_filename = "19960823-Declaration of Jihad Against the Americans Occupying the Land of the Two Holiest Sites.txt"

bush = load_text(bush_filename)
binLaden = load_text(laden_filename)
print(f'Bush: {len(bush)} words')
print(f'bin Laden: {len(binLaden)} words')

Bush: 17321 words
bin Laden: 48741 words


In [None]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


from dependency_patterns import _object, _subject, _subject_passive, _adjective
from dependency_patterns import coordination_deps, _prepositional_object, _adposition, nominal_deps, adverbial_deps, complements_deps
from spacy.matcher import DependencyMatcher
from DependencyMatchers import CustomDependencyMatcher
import pandas as pd
from spacy import displacy

# dependency_matcher = DependencyMatcher(nlp.vocab)

NOUN_POS = {"POS": {"IN": ["NOUN", "PROPN", "PRON"]}}
VERB_POS = {"POS": {"IN": ["VERB", "AUX"]}}
CLAUSE = {"POS": "VERB", "MORPH": {"INTERSECTS": ["Tense=Past"]}}

travel_verbs = {"LEMMA": {"IN": ["traffic", "smuggle", "bring", "travel", "take"]}}
live_verbs = {"LEMMA": {"IN": ["traffic", "bring", "travel", "take"]}}
mode_lemma = {"LEMMA": {"IN": ["on"]}}
to_lemma = {"LEMMA": "to"}
from_lemma = {"LEMMA": "from"}
by_lemma = {"LEMMA": "by"}
VERB = {"POS": {"IN": ["VERB"]}}
AUX = {"POS": {"IN": ["AUX"]}}
GPE = {"ENT_TYPE": "GPE"}
agent_dep = {"DEP": {"IN": ["agent"]}}

PAST_TENSE_VERB_TAGS = ["VBD", "VBN"]
CLAUSE = {"TAG": {"IN": PAST_TENSE_VERB_TAGS}}
VERB_TAGS = ['VB', 'VBG', 'VBP', 'VBZ']
VERBS = {"TAG": {"IN": VERB_TAGS}}

patterns = [
    {
        "pattern_name": "NarrativeClause",
        "pattern":
            [
                {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": CLAUSE},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "DIRECTOBJECT", "RIGHT_ATTRS": _object},
            ],
        "category": "hasEvent", "inverse": "isEventOf"
    },
    {
        "pattern_name": "NarrativeClauseNominals",
        "pattern":
            [
                {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": CLAUSE},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "DIRECTOBJECT", "RIGHT_ATTRS": _object},
                {"LEFT_ID": "DIRECTOBJECT", "REL_OP": ">", "RIGHT_ID": "NOMINAL", "RIGHT_ATTRS": nominal_deps},
                
            ],
        "category": "hasEvent", "inverse": "isEventOf"
    },
    {
        "pattern_name": "PrepositionalNarrativeClause",
        "pattern":
            [
                {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": CLAUSE},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
                {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
                {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "PREPOSITIONALOBJECT", "RIGHT_ATTRS": _prepositional_object},
            ],
        "category": "hasEvent", "inverse": "isEventOf"
    },
    {
    "pattern_name": "PassivePrepositionalNarrativeClause",
    "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": CLAUSE},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject_passive},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
            {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "PREPOSITIONALOBJECT", "RIGHT_ATTRS": _prepositional_object},
        ],
    "category": "hasEvent", "inverse": "isEventOf"
    },
    {
    "pattern_name": "hasAttribute",
    "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": AUX},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "MODIFIER", "RIGHT_ATTRS": _adjective},
        ],
    "category": "hasAttribute", "inverse": "isAttributeOf"
    },
    {
    "pattern_name": "hasPrepositionalAttribute",
    "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": AUX},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
            {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "MODIFIER", "RIGHT_ATTRS": _adjective},
        ],
    "category": "hasAttribute", "inverse": "isAttributeOf"
    },
    {
        "pattern_name": "VerbClause",
        "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": VERBS},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "DIRECTOBJECT", "RIGHT_ATTRS": _object},
        ],
        "category": "hasLinkTo", "inverse": "isLinkedTo"
    },
    {
        "pattern_name": "PrepositionalVerbClause",
        "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": VERBS},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
            {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "PREPOSITIONALOBJECT", "RIGHT_ATTRS": _prepositional_object},
        ],
        "category": "hasLinkTo", "inverse": "isLinkedTo"
    },
    {
        "pattern_name": "AdverbialModifierVerbClause",
        "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": VERBS},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREDICATEMODIFIER", "RIGHT_ATTRS": adverbial_deps},
            {"LEFT_ID": "PREDICATEMODIFIER", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
            {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "PREPOSITIONALOBJECT", "RIGHT_ATTRS": _prepositional_object},
        ],
        "category": "hasLinkTo", "inverse": "isLinkedTo"
    },
    {
        "pattern_name": "ComplementVerbClause",
        "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": VERBS},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "COMPLEMENTVERB", "RIGHT_ATTRS": complements_deps},
            {"LEFT_ID": "COMPLEMENTVERB", "REL_OP": ">", "RIGHT_ID": "DIRECTOBJECT", "RIGHT_ATTRS": _object},

        ],
        "category": "hasLinkTo", "inverse": "isLinkedTo"
    },
    {
        "pattern_name": "SinglePassivePrepositionalClause",
        "pattern":
        [
            {"RIGHT_ID": "PREDICATE", "RIGHT_ATTRS": VERBS},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "SUBJECT", "RIGHT_ATTRS": _subject_passive},
            {"LEFT_ID": "PREDICATE", "REL_OP": ">", "RIGHT_ID": "PREPOSITION", "RIGHT_ATTRS": _adposition},
            {"LEFT_ID": "PREPOSITION", "REL_OP": ">", "RIGHT_ID": "PREPOSITIONALOBJECT", "RIGHT_ATTRS": _prepositional_object},
        ],
        "category": "hasLinkTo", "inverse": "isLinkedTo"
    },
]

extract_clauses = CustomDependencyMatcher(nlp, patterns)
org_doc = nlp(bush)
# text = "The terrorists are traitors to their own faith, trying, in effect, to hijack Islam itself."
# org_doc = nlp(text)

for sent in org_doc.sents:

    doc = nlp(sent.text)

    clauses = extract_clauses(doc)
    if clauses:
        displacy.render(doc, style='dep')
        # print([chunk for chunk in doc.noun_chunks])
        print(doc)
        display(pd.DataFrame(clauses))
        print('-----')
    
    # tag_list = [token.tag_ for token in doc if token.pos_ == "VERB"]
    # if not set(tag_list).isdisjoint(set(PAST_TENSE_VERB_TAGS)):
    #     print([token for token in doc if token.tag_ in PAST_TENSE_VERB_TAGS])
    #     displacy.render(doc, style='dep')
    #     df = [(token, token.pos_, token.tag_, token.dep_, token.morph) for token in doc]
    #     display(pd.DataFrame(df).T)
    #     clauses = extract_clauses(doc)
    #     display(pd.DataFrame(clauses))
            

In [17]:
import pandas as pd
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

pattern = [{"POS": {"IN": ["VERB", "AUX"]}, "MORPH": {"INTERSECTS": ["Tense=Past"]}}]

matcher.add("Verb_Past_Tense", [pattern])

text = """Your brothers and sons, the sons of the two holy mosques, have launched the jihad for the sake of God's cause to expel the occupying enemy from the country of the two holy mosques."""
doc = nlp(binLaden)

def display_df(doc_obj, match_function):
    
    df = []
    
    for match_id, start, end in match_function(doc_obj):
        verb = doc_obj[start:end]
        sentence = verb.root.sent.text.strip()
        pos = verb.root.pos_
        tag = verb.root.tag_
        explain = spacy.explain(tag)
        morph = verb.root.morph
        df.append((sentence, verb, pos, tag, explain, morph))

    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('max_colwidth', None)
    display(pd.DataFrame(df))
    
display_df(doc, matcher)

Unnamed: 0,0,1,2,3,4,5
0,Whoever is guided by God cannot be misled and whoever misleads can find no guidance.,(guided),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
1,Whoever is guided by God cannot be misled and whoever misleads can find no guidance.,(misled),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
2,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(feared),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
3,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(created),VERB,VBD,"verb, past tense","(Tense=Past, VerbForm=Fin)"
4,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(created),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
5,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(scattered),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
6,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(bore),VERB,VBD,"verb, past tense","(Tense=Past, VerbForm=Fin)"
7,"""O ye who believe, fear God as he should be feared and die not except in a state of Islam,"" ""O mankind, reverence your Guardian-Lord who created you from a single person, created, of like nature, his mate and from them twain scattered (like seeds) countless men and women; reverence God, through whom ye demand your mutual (rights) and (reverence) the wombs (that bore you) for God ever watches over you,"" ""O ye who believe, fear God and (always) say a word directed to the right, that he may make your conduct whole and sound and forgive you your sins.",(directed),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
8,"He that obeys God and his apostle has already attained the highest achievement.""",(attained),VERB,VBN,"verb, past participle","(Aspect=Perf, Tense=Past, VerbForm=Part)"
9,"[Koranic verses] Thank God who said: ""And reform is my only aim and only God can grant me success.",(said),VERB,VBD,"verb, past tense","(Tense=Past, VerbForm=Fin)"


In [18]:
text = "Your blood has been spilled in Palestine and Iraq."
# text = "Your brothers and sons, the sons of the two holy mosques, have launched the jihad for the sake of God's cause to expel the occupying enemy from the country of the two holy mosques"
text = "On my orders, the United States military has begun strikes against Al Qaeda terrorist training camps and military installations of the Taliban regime in Afghanistan"
text = "It was the occupation of the land of the two holy mosques [Saudi Arabia], the cradle of Islam, the scene of the revelation, the source of the message, and the site of the holy Ka'bah, the qiblah [to which Muslims turn in prayer] of all Muslims, by Christian armies of the Americans and their allies"

doc = nlp(text)
print(doc[0].pos_)
displacy.render(doc, style='dep')
display_df(doc, matcher)



PRON


Unnamed: 0,0,1,2,3,4,5
0,"It was the occupation of the land of the two holy mosques [Saudi Arabia], the cradle of Islam, the scene of the revelation, the source of the message, and the site of the holy Ka'bah, the qiblah [to which Muslims turn in prayer] of all Muslims, by Christian armies of the Americans and their allies",(was),AUX,VBD,"verb, past tense","(Mood=Ind, Number=Sing, Person=3, Tense=Past, VerbForm=Fin)"
