Reference: https://course.spacy.io/en/chapter1

conda install spacy --channel conda-forge
python -m spacy download en_core_web_sm 

In [1]:
# C1 - E2
# Create a blank English pipeline, run one sentence through it and echo the text.

import spacy

# Pipeline object for the "en" language code
nlp = spacy.blank("en")

# Calling the pipeline on a string gives back the processed document
sentence = "This is a sentence."
doc = nlp(sentence)

print(doc.text)

This is a sentence.


Here `nlp` is a blank English (`"en"`) `Language` object. Calling `nlp` on a string returns a `Doc` object.

In [2]:
# C1 - E3
# Tour of the basic objects: the pipeline, the Doc, a single token and slices.
import spacy

# Create the English nlp object
nlp = spacy.blank("en")
print(f"nlp is an object of: {type(nlp)}")

# Run the sample sentence through the pipeline
sample_text = "I like tree kangaroos and narwhals."
doc = nlp(sample_text)
print(f"doc is an object of: {type(doc)}")

# Indexing the Doc with an integer selects one token
first_token = doc[0]
print(f"first_token is an object of: {type(first_token)}")

# The token's text, and the type of that text
print(first_token.text)
print(f"first_token.text is an object of: {type(first_token.text)}")

# Slices use token indices and exclude the end index.
# Tokens 2 and 3 -> "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# Tokens 2 through 5 -> "tree kangaroos and narwhals" (the final "." is left out)
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

# Last expression of the cell: number of tokens in the Doc
len(doc)

nlp is an object of: <class 'spacy.lang.en.English'>
doc is an object of: <class 'spacy.tokens.doc.Doc'>
first_token is an object of: <class 'spacy.tokens.token.Token'>
I
first_token.text is an object of: <class 'str'>
tree kangaroos
tree kangaroos and narwhals


7

In [8]:
# Build a Doc with the trained pipeline; the next cell prints the
# part-of-speech tags and named entities from `doc`.

# Imported here, like in the other cells, so this cell does not rely on an
# earlier cell having already imported spaCy.
import spacy

string_object = "I love Bangladesh."

nlp = spacy.load("en_core_web_sm")

doc = nlp(string_object)

The `nlp` pipeline processes a string object: it tokenizes the text, runs several analytical components on it, and returns a Doc object that carries the resulting metadata (for example part-of-speech tags and named entities) for the individual parts of the text.

In [14]:
# Print one row per token (text | part-of-speech tag), followed by one row per
# named entity (text | entity label). `doc` is not created in this cell; it
# must already exist from a previously executed cell.
# GPE = Geopolitical Entity

# "<12" left-aligns each value in a 12-character-wide column.
for token in doc:
    print(f"{token.text:<12} | {token.pos_:<12}")

# The leading "\n" prints a blank line before every entity row.
for ent in doc.ents:
    print(f"\n{ent.text:<12} | {ent.label_:<12}")


I            | PRON        
love         | VERB        
Bangladesh   | PROPN       
.            | PUNCT       

Bangladesh   | GPE         


In [8]:
# C1 - E4
# Find percentages: a number-like token that is immediately followed by "%".

import spacy

nlp = spacy.blank("en")

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Only a number-like token that has a successor can start a percentage.
    # Without the bounds check, a text whose last token is a number would make
    # doc[token.i + 1] raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


In [10]:
# C1 - E7
# Load a trained pipeline and look at the types of the pipeline, the Doc and its text.
import spacy

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# The en_core_web_sm trained pipeline
nlp = spacy.load("en_core_web_sm")
print(f"nlp is an object of: {type(nlp)}\n")

# Run the pipeline on the headline
doc = nlp(text)
print(f"doc is an object of: {type(doc)}\n")

# The Doc's text and the type of that text
print(f"doc.text-> {doc.text}\n")
print(f"doc.text is an object of: {type(doc.text)}")

nlp is an object of: <class 'spacy.lang.en.English'>

doc is an object of: <class 'spacy.tokens.doc.Doc'>

doc.text-> It’s official: Apple is the first U.S. public company to reach a $1 trillion market value

doc.text is an object of: <class 'str'>


In [15]:
# C1 - E8 - P1
# Tabulate every token with its part-of-speech tag and dependency label.

import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Header row and underline of the table.
# Formatting note: a spec such as "<12" left-aligns the value in a field that
# is 12 characters wide; when the value is shorter than that, the remaining
# slots are filled with spaces so the columns line up.
print(f"{'Text':<12}{'PartOfSpeech':<14}{'Dependency':<10}")
print(f"{'----':<12}{'------------':<14}{'----------':<10}")
for token in doc:
    # Token text, part-of-speech tag and dependency label, in one unpacking
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(f"{token_text:<12}{token_pos:<14}{token_dep:<10}")

Text        PartOfSpeech  Dependency
----        ------------  ----------
It          PRON          nsubj     
’s          VERB          ROOT      
official    ADJ           acomp     
:           PUNCT         punct     
Apple       PROPN         nsubj     
is          AUX           ROOT      
the         DET           det       
first       ADJ           amod      
U.S.        PROPN         nmod      
public      ADJ           amod      
company     NOUN          attr      
to          PART          aux       
reach       VERB          relcl     
a           DET           det       
$           SYM           quantmod  
1           NUM           compound  
trillion    NUM           nummod    
market      NOUN          compound  
value       NOUN          dobj      


In [21]:
# C1 - E8 - P2
# Compare part-of-speech tags and predicted entities for two texts.
import spacy

nlp = spacy.load("en_core_web_sm")

text_1 = "It’s official: Apple is the first U.S. public company to reach a 1 trillion market value"

text_2 = "I Love Apple"

# Process both texts
doc_1 = nlp(text_1)
doc_2 = nlp(text_2)

# Same report for each document, in order: first the token / part-of-speech
# rows, then the predicted entities.
for processed in (doc_1, doc_2):
    for token in processed:
        print(f"{token.text:<12} | {token.pos_:<12}")

    for ent in processed.ents:
        # Print the entity text and its label
        print(ent.text, ent.label_)

It           | PRON        
’s           | VERB        
official     | ADJ         
:            | PUNCT       
Apple        | PROPN       
is           | AUX         
the          | DET         
first        | ADJ         
U.S.         | PROPN       
public       | ADJ         
company      | NOUN        
to           | PART        
reach        | VERB        
a            | DET         
1            | NUM         
trillion     | NUM         
market       | NOUN        
value        | NOUN        
Apple ORG
first ORDINAL
U.S. GPE
1 trillion CARDINAL
I            | PRON        
Love         | VERB        
Apple        | PROPN       


In [17]:
# C1-E9
# List the entities the pipeline predicts and print a hand-picked span next to them.

import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Span for "iPhone X": tokens 1 and 2 (the slice end is exclusive)
iphone_x = doc[1:3]

# Text and label of every predicted entity
for ent in doc.ents:
    print(ent.text, ent.label_)

# Text of the manually selected span
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


In [18]:
# C1 - E11
# Find "iPhone X" with a rule-based token pattern.

import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
headline = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(headline)

# The Matcher is initialized with the pipeline's shared vocabulary
matcher = Matcher(nlp.vocab)

# Two-token pattern: the exact text "iPhone" followed by the exact text "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Register the pattern under a name; add() receives a list of patterns
matcher.add("IPHONE_X_PATTERN", [pattern])

# Each match is a (match_id, start, end) triple; slice the doc to get its text
matches = matcher(doc)
matched_texts = [doc[start:end].text for match_id, start, end in matches]
print("Matches:", matched_texts)

Matches: ['iPhone X']


In [19]:
# C1 - E12 - P1
# Match full iOS version mentions such as "iOS 7".

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact text "iOS" followed by a token consisting of digits
ios_name = {"TEXT": "iOS"}
version_number = {"IS_DIGIT": True}
pattern = [ios_name, version_number]

# Register the pattern, then run the matcher over the doc
matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [20]:
# C1 - E12 - P2
# Match any form of "download" that is followed by a proper noun.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Pattern: a token whose lemma is "download", then a token tagged as proper noun
download_form = {"LEMMA": "download"}
proper_noun = {"POS": "PROPN"}
pattern = [download_form, proper_noun]

# Register the pattern, then run the matcher over the doc
matcher.add("DOWNLOAD_THINGS_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [22]:
# C1 - E12 - P3
# Match an adjective followed by one or two nouns.

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Pattern: adjective, noun, and a second noun that may be absent ("OP": "?").
# Because the second noun is optional, an adjective followed by two nouns is
# reported both with and without the second noun.
adjective = {"POS": "ADJ"}
noun = {"POS": "NOUN"}
optional_noun = {"POS": "NOUN", "OP": "?"}
pattern = [adjective, noun, optional_noun]

# Register the pattern, then run the matcher over the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Print the text of every matched span
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print("Match found:", matched_span.text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses
