In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")
# A doc object is a sequence of tokens
doc = nlp("My name is Yusuf Solomon and I live in Lagos.")

In [3]:
# Reading from a file
import pathlib
file_path = pathlib.Path("example.txt")
text = file_path.read_text(encoding="utf-8")
doc = nlp(text)
print([token.text for token in doc[:20]])  # Print first 20 tokens

['My', 'Name', 'is', 'Yusuf', 'Solomon', ',', 'and', 'i', 'am', 'the', 'fastest', 'man', 'Alive', '.', '\n', 'Five', 'years', 'ago', ',', 'i']


In [4]:
# Sentence detection
sentences = list(doc.sents)
print(f"Number of sentences: {len(sentences)}")
for i, sent in enumerate(sentences):
    print(f"Sentence {i+1}: {sent.text}")

Number of sentences: 2
Sentence 1: My Name is Yusuf Solomon, and i am the fastest man Alive. 

Sentence 2: Five years ago, i was struck by lightining; it modified my anatomy and gave me super-human speed.


In [5]:
## Making custom boundaries : Sentence detection with eplipses

ellipsis_text = "This is might be a sentence... And here is another one. Yet another sentence is created."

from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before = "parser")
custom_elipsis_doc = custom_nlp(ellipsis_text)
custom_sentences = list(custom_elipsis_doc.sents)
for sentence in custom_sentences:
    print(sentence)


This is might be a sentence...
And here is another one.
Yet another sentence is created.


In [6]:
# Tokens have other attributes

for token in doc[:5]:
    print(f"Token: {token.text}, index: {token.idx}, Lemma: {token.lemma_}, POS: {token.pos_}, Tag: {token.tag_}, Shape: {token.shape_}, is_alpha: {token.is_alpha}, is_stop: {token.is_stop}")

Token: My, index: 0, Lemma: my, POS: PRON, Tag: PRP$, Shape: Xx, is_alpha: True, is_stop: True
Token: Name, index: 3, Lemma: name, POS: NOUN, Tag: NN, Shape: Xxxx, is_alpha: True, is_stop: True
Token: is, index: 8, Lemma: be, POS: AUX, Tag: VBZ, Shape: xx, is_alpha: True, is_stop: True
Token: Yusuf, index: 11, Lemma: Yusuf, POS: PROPN, Tag: NNP, Shape: Xxxxx, is_alpha: True, is_stop: False
Token: Solomon, index: 17, Lemma: Solomon, POS: PROPN, Tag: NNP, Shape: Xxxxx, is_alpha: True, is_stop: False


.text_with_ws prints the token text along with any trailing space, if present.
.is_alpha indicates whether the token consists of alphabetic characters or not.
.is_punct indicates whether the token is a punctuation symbol or not.
.is_stop indicates whether the token is a stop word or not. Youâ€™ll be covering stop words a bit later in this tutorial.

Stop words are common words in a language
the is_stop attribute indicates if a token is a stop word

Lemmatization is the breakdown of a word to its root form, called lemma
The .lemma attribute gets the lemma of that token

In [7]:
# Most common words excluding stop words and punctuations
import spacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)
complete_doc = nlp(complete_text)

words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(5))

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


Part of Speech Tagging POS

Noun, Pronoun, Adjective, Verb, Adverb, preposition, Conjuction, interjection

That is what .pos_ attribute of a token is 

In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
 "Gus Proto is a Python developer currently"
 " working for a London-based Fintech"
 " company. He is interested in learning"
 " Natural Language Processing."
)
about_doc = nlp(about_text)
for token in about_doc:
 print(
     f""" TOKEN: {str(token)}
    =====
    TAG: {str(token.tag_):10} POS: {token.pos_}
    EXPLANATION: {spacy.explain(token.tag_)}"""
    )


 TOKEN: Gus
    =====
    TAG: NNP        POS: PROPN
    EXPLANATION: noun, proper singular
 TOKEN: Proto
    =====
    TAG: NNP        POS: PROPN
    EXPLANATION: noun, proper singular
 TOKEN: is
    =====
    TAG: VBZ        POS: AUX
    EXPLANATION: verb, 3rd person singular present
 TOKEN: a
    =====
    TAG: DT         POS: DET
    EXPLANATION: determiner
 TOKEN: Python
    =====
    TAG: NNP        POS: PROPN
    EXPLANATION: noun, proper singular
 TOKEN: developer
    =====
    TAG: NN         POS: NOUN
    EXPLANATION: noun, singular or mass
 TOKEN: currently
    =====
    TAG: RB         POS: ADV
    EXPLANATION: adverb
 TOKEN: working
    =====
    TAG: VBG        POS: VERB
    EXPLANATION: verb, gerund or present participle
 TOKEN: for
    =====
    TAG: IN         POS: ADP
    EXPLANATION: conjunction, subordinating or preposition
 TOKEN: a
    =====
    TAG: DT         POS: DET
    EXPLANATION: determiner
 TOKEN: London
    =====
    TAG: NNP        POS: PROPN
    EXPLANA

In [9]:
from spacy import displacy
## Displaying named entities
displacy.serve(about_doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [13/Dec/2025 21:32:49] "GET / HTTP/1.1" 200 1762
127.0.0.1 - - [13/Dec/2025 21:32:52] "GET /favicon.ico HTTP/1.1" 200 1762
127.0.0.1 - - [13/Dec/2025 21:34:27] "GET / HTTP/1.1" 200 1762
127.0.0.1 - - [13/Dec/2025 21:34:31] "GET /favicon.ico HTTP/1.1" 200 1762


Shutting down server on port 5000.


In [13]:
# preprocessing functions

def is_token_allowed(token):
    return bool(
        token and str(token).strip() and not token.is_stop and not token.is_punct
    ) # return flase id the token is a stop word or punctuation
    
def preprocess_token(token):
    return token.lemma_.strip().lower()
    
filtered_tokens = [preprocess_token(token) for token in about_doc if is_token_allowed(token)]
print(filtered_tokens)

['gus', 'proto', 'python', 'developer', 'currently', 'work', 'london', 'base', 'fintech', 'company', 'interested', 'learn', 'natural', 'language', 'processing']


In [14]:
# Rule based mathcing can be used to extract information from text
# Using particular semantic or grammatical filters, we can extraxt information from text
# We can extract full names, which are proper nouns followed by proper nouns
# We can also extract dates, phone numbers, etc. using patterns

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_full_names(doc):
    pattern = [
        {"POS": "PROPN"},
        {"POS": "PROPN"}
    ]
    matcher.add("FULL_NAME", [pattern])
    matches = matcher(doc)
    full_names = []
    for match_id, start, end in matches:
        span = doc[start:end]
        yield span.text
    
next(extract_full_names(about_doc))  # Get the first full name found

'Gus Proto'

In [24]:
conference_org_text = ("There is a developer conference"
    " happening on 21 July 2019 in London. It is titled"
    "Applications of  Processing"
    " There is a helpline number available"
    " at (123) 456-7891")

def extract_phone_number(nlp_doc):
    pattern = [
        {"ORTH": "("},
        {"SHAPE": "ddd"},
        {"ORTH": ")"},
        {"SHAPE": "ddd"},
        {"ORTH": "-", "OP": "?"},
        {"SHAPE": "dddd"},
    ]
    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)


'(123) 456-7891'

In [25]:
# Displaying dependencies
# This shows us the relationship between tokens in a sentence

displacy.serve(about_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [26]:
# Named Entity Recognition
# This allows is to find named entities in text
# And then classif them into categories

for ent in about_doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Explanation: {spacy.explain(ent.label_)}")

Entity: Gus Proto, Label: PERSON, Explanation: People, including fictional
Entity: London, Label: GPE, Explanation: Countries, cities, states
Entity: Fintech, Label: ORG, Explanation: Companies, agencies, institutions, etc.
Entity: Natural Language Processing, Label: ORG, Explanation: Companies, agencies, institutions, etc.
