In [1]:
#!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ")

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

In [3]:
import pandas as pd
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_,
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_,'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
            df = pd.DataFrame(rows).set_index('token')
            df.index.name = None
            return df
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O


In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

In [8]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [19]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://abcnews.go.com/Politics/trump-address-west-point-graduates-dei-crackdown/story?id=122116685')
article = nlp(ny_bb)
len(article.ents)

167

In [20]:
displacy.render(article, style='ent', jupyter=True)

In [21]:
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 59,
         'GPE': 22,
         'PERSON': 27,
         'DATE': 29,
         'ORDINAL': 4,
         'LOC': 2,
         'FAC': 2,
         'NORP': 6,
         'EVENT': 4,
         'CARDINAL': 11,
         'TIME': 1})

In [22]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('West Point', 13),
 ('Trump', 12),
 ('US', 6),
 ('Donald Trump', 6),
 ('America', 3)]

In [23]:
sentences = [x for x in article.sents]
print(sentences[20])

"President Donald Trump and US Military Academy Superintendent Lt. Gen. Steven Gilland listen to the national anthem before Trump delivers the commencement address at the 2025 graduation ceremony at the US Military Academy West Point ,on May 24, 2025, in West Point, New York.


In [24]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [25]:
[(x.orth_,x.pos_, x.lemma_) for x in [y
                                      for y
                                      in nlp(str(sentences[20]))
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('President', 'PROPN', 'President'),
 ('Donald', 'PROPN', 'Donald'),
 ('Trump', 'PROPN', 'Trump'),
 ('Military', 'PROPN', 'Military'),
 ('Academy', 'PROPN', 'Academy'),
 ('Superintendent', 'PROPN', 'Superintendent'),
 ('Lt', 'PROPN', 'Lt'),
 ('.', 'PROPN', '.'),
 ('Gen.', 'PROPN', 'Gen.'),
 ('Steven', 'PROPN', 'Steven'),
 ('Gilland', 'PROPN', 'Gilland'),
 ('listen', 'VERB', 'listen'),
 ('national', 'ADJ', 'national'),
 ('anthem', 'NOUN', 'anthem'),
 ('Trump', 'PROPN', 'Trump'),
 ('delivers', 'VERB', 'deliver'),
 ('commencement', 'NOUN', 'commencement'),
 ('address', 'NOUN', 'address'),
 ('2025', 'NUM', '2025'),
 ('graduation', 'NOUN', 'graduation'),
 ('ceremony', 'NOUN', 'ceremony'),
 ('Military', 'PROPN', 'Military'),
 ('Academy', 'PROPN', 'Academy'),
 ('West', 'PROPN', 'West'),
 ('Point', 'PROPN', 'Point'),
 ('24', 'NUM', '24'),
 ('2025', 'NUM', '2025'),
 ('West', 'PROPN', 'West'),
 ('Point', 'PROPN', 'Point'),
 ('New', 'PROPN', 'New'),
 ('York', 'PROPN', 'York')]

In [26]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True,
options = {'distance': 120})