## Load spaCy's English model

In [3]:
# One-time setup: uncomment the next line if the model is not installed yet.
#!python -m spacy download en_core_web_lg 
import spacy

# Name of the spaCy pipeline package used throughout the notebook.
MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(MODEL_NAME)

## Trying it out on a sample text

In [6]:
# Sample sentence to run through the pipeline.
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
# Write every token followed by the " | " separator, all on one line.
print("".join(f"{token} | " for token in doc), end="")

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

In [7]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens.

    Args:
        doc: Iterable of token objects exposing ``text``, ``lemma_``,
            ``is_stop``, ``is_alpha``, ``pos_``, ``dep_``, ``ent_type_``,
            ``ent_iob_`` and ``is_punct``.
        include_punct: When False (default), tokens whose ``is_punct`` is
            true are left out of the table.

    Returns:
        A DataFrame with one row per kept token. The index is the token's
        position in ``doc`` (so it has gaps where punctuation was dropped)
        and carries no name. An empty ``doc``, or one made up only of
        skipped punctuation, yields an empty frame with the same columns.
    """
    # Declaring the columns up front keeps set_index('token') working even
    # when no rows were collected (otherwise pandas raises KeyError).
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    df = pd.DataFrame(rows, columns=columns).set_index('token')
    df.index.name = None  # drop the 'token' label so the index renders unnamed
    return df
# Show the token table for the current `doc`; punctuation rows are omitted
# because include_punct keeps its default of False.
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


## Removing stop words using spaCy

In [8]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"  # sample text
doc = nlp(text)

# Keep a token only when it is neither a stop word nor punctuation.
non_stop = [token for token in doc if not (token.is_stop or token.is_punct)]
print(non_stop)  # the tokens that survived the filter

[Dear, Ryan, need, sit, talk, Regards, Pete]


## Finding all nouns 

In [10]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Part-of-speech tags that count as a noun for this cell.
noun_tags = ('NOUN', 'PROPN')
nouns = [token for token in doc if token.pos_ in noun_tags]
print(nouns)

[friend, Ryan, Peters, adventure, games]


## Named Entity Recognition

In [11]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# One "(text, label)" pair per entity, each followed by a single space.
print("".join(f"({ent.text}, {ent.label_}) " for ent in doc.ents), end="")

(Ryan Peters, PERSON) 

In [12]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

# Format each entity as "(text, label)" first, then print them space-separated.
entity_pairs = [f"({ent.text}, {ent.label_})" for ent in doc.ents]
for pair in entity_pairs:
    print(pair, end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

## Visualizing named entities

In [13]:
from spacy import displacy

# Render `doc` (assigned by the preceding cell in notebook order) with
# displaCy's 'ent' style.
displacy.render(doc, style='ent', jupyter=True)

## Pulling an article from the web

In [31]:
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
import re

# Function to extract text content from a URL
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs collapsed into one
        space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is never analysed as if it were the article.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text is not part of the readable article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Page analysed in the remaining cells.
ARTICLE_URL = 'https://medium.com/@kai-waehner/apache-kafka-vector-database-llm-real-time-genai-4b5b6e687d85'
ny_bb = url_to_string(ARTICLE_URL)
article = nlp(ny_bb)
len(article.ents)  # cell result: how many entities the pipeline found

26

## Visualizing the article's named entities

In [32]:
# Render the parsed `article` with displaCy's 'ent' style.
displacy.render(article, style='ent', jupyter=True)

## Most frequent entity types

In [33]:
from collections import Counter
# Label of every entity in the article, in document order; the Counter
# shown as the cell result tallies how often each label occurs.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PRODUCT': 5,
         'PERSON': 8,
         'DATE': 2,
         'CARDINAL': 1,
         'ORG': 9,
         'GPE': 1})

## Most frequent named entities

In [34]:
# Surface text of every entity, then the five most frequent ones.
items = [ent.text for ent in article.ents]
entity_counts = Counter(items)
entity_counts.most_common(5)

[('Kafka', 3),
 ('Kai Waehner', 2),
 ('AI', 2),
 ('Apache Kafka', 2),
 ('Apache Kafka + Vector Database', 1)]

In [45]:
# Materialise the sentence spans so they can be indexed by position.
sentences = list(article.sents)
print(sentences[12])

Very different scenarios are possible:Data streaming as data fabric for the entire machine learning infrastructureModel scoring with stream processing for real-time predictions and generation of contentGeneration of streaming data pipelines with input text, speech, or imagesReal-time online training of large language modelsI explored these use cases, including real-world examples like Expedia, BMW and Tinder, in the blog post “Apache Kafka as Mission Critical Data Fabric for GenAI”.----7FollowWritten by Kai Waehner2.1K FollowersTechnology Evangelist — www.kai-waehner.de → Big Data Analytics, Data Streaming, Apache Kafka, Middleware, Microservices => linkedin.com/in/kaiwaehnerFollowHelpStatusAboutCareersBlogPrivacyTermsText to speechTeams


## Named entities in a single sentence

In [42]:
# Parse the text of sentence 12 on its own, then render its entities.
sample_doc = nlp(str(sentences[12]))
displacy.render(sample_doc, jupyter=True, style='ent')

## Part-of-speech tags and lemmas of the words in the sentence

In [43]:
# (text, part-of-speech, lemma) for each token of sentence 12 that is
# neither a stop word nor tagged as punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[12]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]


[('different', 'ADJ', 'different'),
 ('scenarios', 'NOUN', 'scenario'),
 ('possible', 'ADJ', 'possible'),
 ('Data', 'NOUN', 'datum'),
 ('streaming', 'NOUN', 'streaming'),
 ('data', 'NOUN', 'datum'),
 ('fabric', 'NOUN', 'fabric'),
 ('entire', 'ADJ', 'entire'),
 ('machine', 'NOUN', 'machine'),
 ('learning', 'VERB', 'learn'),
 ('infrastructureModel', 'PROPN', 'infrastructureModel'),
 ('scoring', 'VERB', 'score'),
 ('stream', 'NOUN', 'stream'),
 ('processing', 'NOUN', 'processing'),
 ('real', 'ADJ', 'real'),
 ('time', 'NOUN', 'time'),
 ('predictions', 'NOUN', 'prediction'),
 ('generation', 'NOUN', 'generation'),
 ('contentGeneration', 'NOUN', 'contentgeneration'),
 ('streaming', 'VERB', 'stream'),
 ('data', 'NOUN', 'datum'),
 ('pipelines', 'NOUN', 'pipeline'),
 ('input', 'NOUN', 'input'),
 ('text', 'NOUN', 'text'),
 ('speech', 'NOUN', 'speech'),
 ('imagesReal', 'ADJ', 'imagesreal'),
 ('time', 'NOUN', 'time'),
 ('online', 'ADJ', 'online'),
 ('training', 'NOUN', 'training'),
 ('large', 'ADJ'

## Sentence dependency tree

In [44]:
# Options handed to displaCy for the dependency rendering.
dep_options = {'distance': 120}
displacy.render(nlp(str(sentences[12])), style='dep', jupyter=True, options=dep_options)