In [1]:
import spacy

# Name of the spaCy pipeline package that every later cell processes text with.
SPACY_MODEL = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL)

In [8]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Run the loaded pipeline over the sentence.
doc = nlp(text)

# --- Token table: text, part-of-speech tag and dependency label per token ---
# Formatting note: a spec such as `<12` left-aligns the value in a field that is
# 12 characters wide; if the value is shorter than 12 characters, the remaining
# slots are filled with spaces.
print(f"{'Text':<12}{'PartOfSpeech':<14}{'Dependency':<10}")
print(f"{'----':<12}{'------------':<14}{'----------':<10}")
for token in doc:
    token_text, token_pos, token_dep = token.text, token.pos_, token.dep_
    print(f"{token_text:<12}{token_pos:<14}{token_dep:<10}")

# --- Entity table: one row per entity span in doc.ents, with its label ---
print(f"\n{'Entity':<15}{'Label':<12}")
print(f"{'------':<15}{'-----':<12}")
for ent in doc.ents:
    print(f"{ent.text:<15}{ent.label_:<12}")

Text        PartOfSpeech  Dependency
----        ------------  ----------
It          PRON          nsubj     
’s          VERB          ccomp     
official    NOUN          acomp     
:           PUNCT         punct     
Apple       PROPN         nsubj     
is          AUX           ROOT      
the         DET           det       
first       ADJ           amod      
U.S.        PROPN         nmod      
public      ADJ           amod      
company     NOUN          attr      
to          PART          aux       
reach       VERB          relcl     
a           DET           det       
$           SYM           quantmod  
1           NUM           compound  
trillion    NUM           nummod    
market      NOUN          compound  
value       NOUN          dobj      

Entity         Label       
------         -----       
Apple          ORG         
first          ORDINAL     
U.S.           GPE         
$1 trillion    MONEY       


In [14]:
text_2 = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc_2 = nlp(text_2)

# Entities the pipeline found in the headline, one per row.
print(f"\n{'Entity':<15}{'Label':<12}")
print(f"{'------':<15}{'-----':<12}")
for ent in doc_2.ents:
    print(f"{ent.text:<15}{ent.label_:<12}")

# Every token next to its position in the doc, so that a token range can be
# picked out by index afterwards.
print(f"\n{'Index':<15}{'Token Text':<12}")
print(f"{'------':<15}{'----------':<12}")
for index in range(len(doc_2)):
    token = doc_2[index]
    print(f"{index:<15}{token.text:<15}")


Entity         Label       
------         -----       
Apple          ORG         

Index          Token Text  
------         ----------  
0              Upcoming       
1              iPhone         
2              X              
3              release        
4              date           
5              leaked         
6              as             
7              Apple          
8              reveals        
9              pre            
10             -              
11             orders         


In [15]:
# Slice out tokens 1 and 2 of doc_2 (the end index 3 is exclusive); the rows
# printed below confirm which tokens the span holds.
missed_entity = doc_2[1:3]

# Indices printed here are positions within the span, not within doc_2.
print(f"\n{'Index':<15}{'Token Text':<12}")
print(f"{'------':<15}{'----------':<12}")
for index in range(len(missed_entity)):
    token = missed_entity[index]
    print(f"{index:<15}{token.text:<15}")


Index          Token Text  
------         ----------  
0              iPhone         
1              X              


In [19]:
from spacy.matcher import Matcher

# Rule-based token matcher built on the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Two adjacent tokens whose verbatim text is "iPhone" followed by "X".
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_X_PATTERN", [pattern])

# Calling the matcher on a doc yields (ID, start, end) triples; start/end are
# used below as a token slice into doc_2.
matches = matcher(doc_2)
print(f"(ID, Match_Start, Match_End) -> {matches}")

matched_texts = [doc_2[start:end].text for _match_id, start, end in matches]
print("Matches:", matched_texts)

(ID, Match_Start, Match_End) -> [(15275443334065424288, 1, 3)]
Matches: ['iPhone X']


In [21]:
doc_3 = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# An "iOS" token immediately followed by a token consisting only of digits.
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]
matcher.add("IOS_VERSION_PATTERN", [pattern])

# `matcher` is shared with earlier cells and still holds the patterns they
# registered, so calling it returns matches for every rule added so far.
# Keep only the matches that belong to the rule added in this cell; a match ID
# is the string-store hash of the rule name.
ios_rule_id = nlp.vocab.strings["IOS_VERSION_PATTERN"]
matches = [match for match in matcher(doc_3) if match[0] == ios_rule_id]

print("Total matches found:", len(matches))

# The ID is unpacked as `match_id` rather than `_`, which IPython uses for the
# most recent cell output.
for match_id, start, end in matches:
    print("Match found:", doc_3[start:end].text)

Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [23]:
doc_4 = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# A token whose lemma is "download" followed by a token tagged as a proper noun.
pattern = [{"LEMMA": "download"}, {"POS": "PROPN"}]
matcher.add("DOWNLOAD_PATTERN", [pattern])

# `matcher` is shared with earlier cells and still holds the patterns they
# registered, so calling it returns matches for every rule added so far.
# Keep only the matches that belong to the rule added in this cell; a match ID
# is the string-store hash of the rule name.
download_rule_id = nlp.vocab.strings["DOWNLOAD_PATTERN"]
matches = [match for match in matcher(doc_4) if match[0] == download_rule_id]

print(f"Total matches found: {len(matches)}")

# Print the text of each matched span. The ID is unpacked as `match_id` rather
# than `_`, which IPython uses for the most recent cell output.
for match_id, start, end in matches:
    print("Match found:", doc_4[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip
