In [1]:
%%time
import pandas as pd
import spacy
# Load spaCy's `en_core_web_md` English pipeline once; every later cell reuses `nlp`.
nlp = spacy.load("en_core_web_md")
# Append the built-in `merge_entities` component so a multi-word named entity
# comes out as a single token (e.g. "Usama bin Laden" in the tables below).
# As the cell's last expression, this call also echoes the component's repr.
nlp.add_pipe("merge_entities")

CPU times: user 3.18 s, sys: 967 ms, total: 4.15 s
Wall time: 3.18 s


<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [2]:
import pandas as pd
from spacy.attrs import ORTH, POS, DEP, LEMMA, ENT_TYPE
from spacy.symbols import NOUN, PROPN, PRON, VERB

def display_simple(doc):
    """Summarise a parsed document as one table row per token.

    Input:
        doc: iterable of tokens exposing ``pos_``, ``dep_`` and ``lemma_``
             (in this notebook, the result of ``nlp(text)``)

    Returns:
        pandas.DataFrame with the columns 'tokens' (the token objects
        themselves), 'Part of Speech tag', 'Dependency tag' and 'Lemma'.
        Both tag columns append the ``spacy.explain`` gloss in parentheses.
    """
    token_col, pos_col, dep_col, lemma_col = [], [], [], []

    # Single pass over the document, filling all four columns together.
    for tok in doc:
        token_col.append(tok)
        pos_col.append(f"{tok.pos_} ({spacy.explain(tok.pos_)})")
        dep_col.append(f"{tok.dep_}, ({spacy.explain(tok.dep_)})")
        lemma_col.append(tok.lemma_)

    table = {
        'tokens': token_col,
        'Part of Speech tag': pos_col,
        'Dependency tag': dep_col,
        'Lemma': lemma_col,
    }
    return pd.DataFrame(table)
    
def display_complex(doc):
    """Tabulate token attributes (including entity type) for a parsed document.

    The attributes are exported in bulk with ``doc.to_array`` as integer IDs
    and then resolved back to strings through the document's own vocabulary
    (``doc.vocab.strings``), so the function does not depend on any
    notebook-level pipeline variable.

    Input:
        doc: parsed document providing ``to_array(attr_ids)`` and
             ``vocab.strings`` (in this notebook, the result of ``nlp(text)``)

    Returns:
        pandas.DataFrame with one row per token and the columns 'Token',
        'Part of Speech', 'Dependency', 'Lemma' and 'Entity Type'. The
        'Part of Speech' and 'Dependency' cells also carry the
        ``spacy.explain`` gloss in parentheses.
    """
    strings = doc.vocab.strings

    columns = ['Token', 'Part of Speech', 'Dependency', 'Lemma', 'Entity Type']
    # Columns whose tag names get a human-readable explanation appended.
    explained_columns = ('Part of Speech', 'Dependency')

    df = pd.DataFrame(doc.to_array([ORTH, POS, DEP, LEMMA, ENT_TYPE]), columns = columns)

    def as_text(attr_id):
        # Resolve an integer ID back to its string form.
        return f"{strings[attr_id]}"

    def as_text_with_gloss(attr_id):
        # Look the tag up once, then reuse it for both the name and its gloss.
        tag = strings[attr_id]
        return f"{tag} ({spacy.explain(tag)})"

    for col in columns:
        formatter = as_text_with_gloss if col in explained_columns else as_text
        df[col] = df[col].apply(formatter)

    return df

def get_lexical_units(doc):
    """Unfinished stub: only builds the list of part-of-speech symbols.

    NOTE(review): `doc` is never read and there is no return statement, so
    every call currently yields None. Presumably this was meant to filter
    `doc` down to tokens with these POS values — confirm the intended
    behaviour before relying on it.
    """
    
    lexical_units = [NOUN, PROPN, PRON, VERB]
    
    

In [3]:
# Sample sentences analysed throughout the notebook.
texts = [
    """The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda.""",
    """This group and its leader -- a person named Usama bin Laden -- are linked to many other organizations in different countries, including the Egyptian Islamic Jihad and the Islamic Movement of Uzbekistan""",
]

# Parse each sentence in turn and render its token-level table.
for text in texts:
    token_table = display_simple(nlp(text))
    display(token_table)

Unnamed: 0,tokens,Part of Speech tag,Dependency tag,Lemma
0,The,DET (determiner),"det, (determiner)",the
1,evidence,NOUN (noun),"ROOT, (root)",evidence
2,we,PRON (pronoun),"nsubj, (nominal subject)",we
3,have,AUX (auxiliary),"aux, (auxiliary)",have
4,gathered,VERB (verb),"relcl, (relative clause modifier)",gather
5,all,DET (determiner),"det, (determiner)",all
6,points,NOUN (noun),"dobj, (direct object)",point
7,to,ADP (adposition),"prep, (prepositional modifier)",to
8,a,DET (determiner),"det, (determiner)",a
9,collection,NOUN (noun),"pobj, (object of preposition)",collection


Unnamed: 0,tokens,Part of Speech tag,Dependency tag,Lemma
0,This,DET (determiner),"det, (determiner)",this
1,group,NOUN (noun),"nsubjpass, (nominal subject (passive))",group
2,and,CCONJ (coordinating conjunction),"cc, (coordinating conjunction)",and
3,its,PRON (pronoun),"poss, (possession modifier)",its
4,leader,NOUN (noun),"conj, (conjunct)",leader
5,--,PUNCT (punctuation),"punct, (punctuation)",--
6,a,DET (determiner),"det, (determiner)",a
7,person,NOUN (noun),"appos, (appositional modifier)",person
8,named,VERB (verb),"acl, (clausal modifier of noun (adjectival cla...",name
9,Usama bin Laden,PROPN (proper noun),"oprd, (object predicate)",Usama bin Laden


In [4]:
from IPython.core.display import display, HTML

def display_side_by_side(dfs:list, captions:list, last = False):
    
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, paired with `dfs` by position
                  (if the two lists differ in length, the extra items are ignored)
        last: when truthy, the final row of every table is rendered in bold

    Each table is emitted as an inline-styled HTML table followed by three
    non-breaking spaces, and the concatenated markup is shown with a single
    `display(HTML(...))` call. Nothing is returned.
    """
    
    pieces = []

    # Pair captions and frames positionally. Going through a dict keyed on the
    # caption would silently drop every table that shares a caption with a
    # later one.
    for caption, df in zip(captions, dfs):

        styler = df.style.set_table_attributes("style='display:inline'").\
            set_caption(caption)

        # Bold the final row by its actual index label; an empty frame has no
        # last row to highlight.
        if last and len(df.index):
            styler = styler.set_properties(
                subset=pd.IndexSlice[df.index[-1:], :],
                **{'font-weight': 'bold'}
            )

        pieces.append(styler._repr_html_() + "\xa0\xa0\xa0")
            
    display(HTML("".join(pieces)))

  from IPython.core.display import display, HTML


In [7]:
from spacy.symbols import NOUN, PROPN, PRON, VERB

# Part-of-speech symbols that count as "lexical units" in the tables below.
lexical_units = [NOUN, PROPN, PRON, VERB]

index = ["Lexical Unit", "word"]

def df(doc):
    """Return a two-column table (POS gloss, token) for the lexical-unit tokens of `doc`."""
    rows = []
    for token in doc:
        # Compare the integer POS id against the spaCy symbols listed above.
        if token.pos not in lexical_units:
            continue
        rows.append([spacy.explain(token.pos_).title(), token])
    return pd.DataFrame(rows, columns = index)

columns = ["Noun Chunk"]

def noun_chunks(doc):
    """Return a one-column table holding the text of every noun chunk in `doc`."""
    chunk_texts = [span.text for span in doc.noun_chunks]
    return pd.DataFrame(chunk_texts, columns = columns)

# For each sentence: its lexical-unit table followed by its noun-chunk table.
dfs = []
for text in texts:
    doc = nlp(text)
    dfs.extend([df(doc), noun_chunks(doc)])

captions = ["Sentence 2", "Sentence 2 Noun Chunks", "Sentence 3", "Sentence 3 Noun Chunks"]
display_side_by_side(dfs, captions = captions)

Unnamed: 0,Lexical Unit,word
0,Noun,evidence
1,Pronoun,we
2,Verb,gathered
3,Noun,points
4,Noun,collection
5,Verb,affiliated
6,Noun,organizations
7,Verb,known
8,Proper Noun,al Qaeda

Unnamed: 0,Noun Chunk
0,The evidence
1,we
2,all points
3,a collection
4,loosely affiliated terrorist organizations
5,al Qaeda

Unnamed: 0,Lexical Unit,word
0,Noun,group
1,Pronoun,its
2,Noun,leader
3,Noun,person
4,Verb,named
5,Proper Noun,Usama bin Laden
6,Verb,linked
7,Noun,organizations
8,Noun,countries
9,Verb,including

Unnamed: 0,Noun Chunk
0,This group
1,its leader
2,a person
3,Usama bin Laden
4,many other organizations
5,different countries
6,the Egyptian Islamic Jihad
7,the Islamic Movement of Uzbekistan


In [6]:
# Show the noun chunks of the first sample sentence as a one-column table.
doc = nlp(texts[0])
display(pd.DataFrame(list(map(str, doc.noun_chunks))))

Unnamed: 0,0
0,The evidence
1,we
2,all points
3,a collection
4,loosely affiliated terrorist organizations
5,al Qaeda
