In [3]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# Select the first token
first_token = doc[0]

# Print the first token's text
print(first_token.text)

I


In [6]:
import spacy

# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_)

for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)
    
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

Apple PROPN
is AUX
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM
Apple PROPN nsubj looking
is AUX aux looking
looking VERB ROOT looking
at ADP prep looking
buying VERB pcomp at
U.K. PROPN dobj buying
startup NOUN dep looking
for ADP prep startup
$ SYM quantmod billion
1 NUM compound billion
billion NUM pobj for
Apple ORG
U.K. GPE
$1 billion MONEY


In [None]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

In [7]:
import spacy

# Load the "en_core_web_sm" pipeline
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Print the document text
print(doc.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


In [9]:
type(doc)

spacy.tokens.doc.Doc

In [10]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
’s          VERB      ccomp     
official    NOUN      acomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [11]:
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


iPhone X

In [15]:
# Matcher

# Import the Matcher
from spacy.matcher import Matcher

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

matches

[(9528407286733565721, 1, 3)]

In [None]:
# Using matcher

In [16]:

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", [pattern])

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [36]:
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS two and iOS 8. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper iOS 15."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 4
Match found: iOS 8
Match found: iOS 11
Match found: iOS 10
Match found: iOS 15


In [37]:
type(matches)

list

In [38]:
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [39]:
doc = nlp("The new iPhone X is expected to be released in September")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

September 47 56 DATE


In [40]:
import random
from spacy.util import minibatch, compounding

TRAIN_DATA = [
    ("New iPhone X release date leaked", {"entities": [(4, 12, "PRODUCT")]}),
    ("The iPhone 8 is coming soon", {"entities": [(4, 12, "PRODUCT")]}),
    ("Should I wait for the iPhone X?", {"entities": [(23, 29, "PRODUCT")]}),
    ("iPhone 11 leaks reveal triple-camera setup", {"entities": [(0, 7, "PRODUCT"), (13, 18, "PRODUCT")]}),
    ("The iPhone 9 will be the most affordable iPhone ever", {"entities": [(4, 12, "PRODUCT")]}),
    ("Apple announces the release of the new iPad Pro", {"entities": [(32, 41, "PRODUCT")]}),
    ("I'm excited to upgrade to the new MacBook Pro", {"entities": [(32, 44, "PRODUCT")]}),
    ("The new Apple Watch Series 5 is now available", {"entities": [(4, 18, "PRODUCT"), (19, 30, "PRODUCT")]}),
    ("Apple unveils new AirPods Pro with noise cancellation", {"entities": [(16, 22, "PRODUCT"), (23, 28, "PRODUCT")]}),
]

nlp = spacy.load("en_core_web_sm")

# Add the new entity label to the entity recognizer
ner = nlp.get_pipe("ner")
ner.add_label("PRODUCT")

# Only train the ner component
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Train the ner component
with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)

# Test the trained model


ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

In [None]:
doc = nlp("The new iPhone X is expected to be released in September")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [56]:
pip install spacy-lookups-data

Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [57]:
pip install -U spacy spacy-lookups-data


Note: you may need to restart the kernel to use updated packages.


In [73]:
python -m spacy download en_lookups

SyntaxError: invalid syntax (2949279678.py, line 1)

In [75]:
doc = nlp("The new iPhone X is expected to be released in September")
for ent in doc.ents:
    if ent.label_ == "PRODUCT":
        print(ent.text)

In [77]:
import random
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = [
    ("New iPhone X release date leaked", {"entities": [(4, 12, "PRODUCT")]}),
    ("The iPhone 8 is coming soon", {"entities": [(4, 12, "PRODUCT")]}),
    ("Should I wait for the iPhone X?", {"entities": [(23, 29, "PRODUCT")]}),
    ("iPhone 11 leaks reveal triple-camera setup", {"entities": [(0, 7, "PRODUCT"), (13, 18, "PRODUCT")]}),
    ("The iPhone 9 will be the most affordable iPhone ever", {"entities": [(4, 12, "PRODUCT")]}),
    ("Apple announces the release of the new iPad Pro", {"entities": [(32, 41, "PRODUCT")]}),
    ("I'm excited to upgrade to the new MacBook Pro", {"entities": [(32, 44, "PRODUCT")]}),
    ("The new Apple Watch Series 5 is now available", {"entities": [(4, 18, "PRODUCT"), (19, 30, "PRODUCT")]}),
    ("Apple unveils new AirPods Pro with noise cancellation", {"entities": [(16, 22, "PRODUCT"), (23, 28, "PRODUCT")]})]

nlp = spacy.load("en_core_web_sm")

# Add the new entity label to the entity recognizer
ner = nlp.get_pipe("ner")
ner.add_label("PRODUCT")

# Only train the ner component
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Train the ner component
with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)



ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

In [None]:
# Test the trained model
doc = nlp("The new iPhone X is expected to be released in September")
for ent in doc.ents:
    if ent.label_ == "PRODUCT":
        print(ent.text)