#                                       Named Entity Recognition

# Importing Libraries

In [58]:
# Importing required libraries
import spacy
import pandas as pd
from spacy import displacy
from bs4 import BeautifulSoup
import requests
import re
from collections import Counter

# Loading SpaCy Language Model

In [59]:
# Loading the English language model for spaCy
nlp = spacy.load("en_core_web_lg")

# NER on Sample Text

In [60]:
# Sample text for Named Entity Recognition (NER)
text = "My best friend Ryan Peters likes fancy adventure games."
# NLP processing of the sample text
doc = nlp(text)
# Iterating through tokens in the processed text and printing them
for token in doc:
    print(token, end=" | ")        

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

# Displaying NLP Tokens in DataFrame

In [61]:
# Defining a function to display NLP tokens in a DataFrame
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
# Displaying NLP tokens in a DataFrame
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# Filtering Non-Stop and Non-Punctuation Tokens

In [62]:
# Another text for NER
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)
# Filtering out non-stop and non-punctuation tokens
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


# Extracting Nouns and Proper Nouns

In [63]:
# Yet another text for NER
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
# Extracting nouns and proper nouns from the text
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


# Extracting Named Entities

In [64]:
# Iterating through named entities in the text and printing them
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

# Named Entities in a Complex Sentence

In [65]:
# More complex sentence for NER
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)
# Extracting and printing named entities in the complex sentence
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualizing Named Entities

In [66]:
# Rendering the visualization of named entities
displacy.render(doc, style='ent', jupyter=True)

# Extracting Named Entities from Web Page

In [67]:
# Function to extract text from a URL
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Extracting text from a URL
ny_bb = url_to_string('https://crypto.com/price')
article = nlp(ny_bb)
# Counting named entities in the extracted text
len(article.ents)

43

# Visualizing Named Entities from Web Page

In [68]:
# Rendering the visualization of named entities from the web page
displacy.render(article, style='ent', jupyter=True)

# Counting and Displaying Label Counts

In [69]:
# Get labels of named entities
labels = [x.label_ for x in article.ents]

# Count occurrences of each label
label_counts = Counter(labels)

# Print label counts
print(label_counts)


Counter({'CARDINAL': 9, 'ORG': 8, 'PERSON': 7, 'PRODUCT': 7, 'GPE': 4, 'MONEY': 2, 'PERCENT': 2, 'DATE': 2, 'NORP': 1, 'EVENT': 1})


# Extracting Most Common Named Entities

In [70]:
# Extracting and displaying the most common named entities
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('50', 1), ('Cap', 1), ('2,674.04', 1), ('113.31', 1), ('46.87%', 1)]

# Splitting Text into Sentences

In [71]:
# Splitting the text into sentences
sentences = [x for x in article.sents]
print(sentences[0])

Top 50 Cryptocurrency Prices, Coin Market Cap, Price Charts And Historical Data | Crypto.comCoins: 20,459Market Cap: $2,674.04 B USD24H Change: -1.99%24H


# Visualizing Named Entities in a Sentence

In [72]:
# Rendering the visualization of named entities in a sentence
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

# Token Analysis in a Sentence

In [73]:
# Analyzing tokens in a sentence and filtering out stop words and punctuation
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('50', 'NUM', '50'),
 ('Cryptocurrency', 'PROPN', 'Cryptocurrency'),
 ('Prices', 'NOUN', 'price'),
 ('Coin', 'PROPN', 'Coin'),
 ('Market', 'PROPN', 'Market'),
 ('Cap', 'PROPN', 'Cap'),
 ('Price', 'NOUN', 'price'),
 ('Charts', 'PROPN', 'Charts'),
 ('Historical', 'PROPN', 'Historical'),
 ('Data', 'PROPN', 'Data'),
 ('Crypto.comCoins', 'NOUN', 'crypto.comcoin'),
 ('\xa0', 'SPACE', '\xa0'),
 ('20,459Market', 'PROPN', '20,459Market'),
 ('Cap', 'PROPN', 'Cap'),
 ('\xa0', 'SPACE', '\xa0'),
 ('$', 'SYM', '$'),
 ('2,674.04', 'NUM', '2,674.04'),
 ('B', 'PROPN', 'B'),
 ('USD24H', 'NOUN', 'usd24h'),
 ('Change', 'NOUN', 'change'),
 ('\xa0', 'SPACE', '\xa0'),
 ('-1.99%24H', 'X', '-1.99%24h')]

# Visualizing Dependency Parsing in a Sentence

In [74]:
# Rendering the dependency parsing visualization of a sentence
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})

In [75]:
# The code utilizes spaCy for Named Entity Recognition (NER), processing sample texts and web-scraped data. It identifies entities, visualizes results, and extracts insights. Through tokenization, filtering, and analysis, it showcases spaCy's capability for NER tasks, aiding in information extraction and natural language understanding.