# spacy
04-08-2021

## basics

In [3]:
%store -r Nightvale_df

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# import the English library from spacy
from spacy.lang.en import English

In [5]:
# initialize nlp obect
nlp = English()

# nlp object analyzes text
# - contains proccesing pipeline
# - includes language-specific rules for tokenization

In [6]:
# when you pass a string of text through the nlp object,
#     it becomes a "document" object
doc = nlp(Nightvale_df.Text[0][:100])

In [7]:
for token in doc:
    print(token)


CECIL
:
As
a
matter
of
fact
,
the
facts
do
n’t
matter
.
Welcome
to
Night
Vale
 
Listeners
,
it
is
upon
us
.


In [8]:
# Indexing
doc[1]

:

In [9]:
# span
# spanning DOESN'T create new information, it's just a view of the doc

span = doc[-4:-2]
print(span.text)

is upon


In [10]:
# token attributes (aka lexical attributes)

doc = nlp('It costs $5.')
print('Index:     ', [token.i for token in doc])
print('Text:      ', [token.text for token in doc])
print('is_alpha:  ', [token.is_alpha for token in doc])
print('is_punct:  ', [token.is_punct for token in doc])
print('like_num:  ', [(token, token.like_num) for token in doc])

Index:      [0, 1, 2, 3, 4]
Text:       ['It', 'costs', '$', '5', '.']
is_alpha:   [True, True, False, False, False]
is_punct:   [False, False, False, False, True]
like_num:   [(It, False), (costs, False), ($, False), (5, True), (., False)]


In [11]:
import spacy

# "small english model"
# en_core_web_sm is a nlp object trained on lots of language data
#     that can predict part of speech and other language metadata
nlp = spacy.load('en_core_web_sm')

In [12]:
string = 'She ate the pizza'

In [13]:
doc = nlp(string)
for token in doc:
    print(token.text, token.pos_)
    print(token.text, token.pos_, token.dep_, token.head.text)
    # syntactic dependency info
    
# attributes that return strings end with an underscore.  
# attributes that return IDs end without an underscore.

She PRON
She PRON nsubj ate
ate VERB
ate VERB ROOT ate
the DET
the DET det pizza
pizza NOUN
pizza NOUN dobj ate


In [14]:
string = u'Apple is looking at buying a U.K. startup for $1 billion.'

In [15]:
doc = nlp(string)

# named entities (proper nouns)
# returns an iterator of span objects
for ent in doc.ents:
    print(ent.text, ent.label_)
    spacy.explain(ent.label_) # spacy.explain('label')!!!!

Apple ORG


'Companies, agencies, institutions, etc.'

U.K. GPE


'Countries, cities, states'

$1 billion MONEY


'Monetary values, including unit'

In [16]:
string = u'Why does Aidan insist on chewing on his sleeves 24/7?'

In [17]:
doc = nlp(string)
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))

Why         ADV       advmod    
does        AUX       aux       
Aidan       PROPN     nsubj     
insist      VERB      ROOT      
on          ADP       prep      
chewing     VERB      pcomp     
on          ADP       prep      
his         PRON      poss      
sleeves     NOUN      pobj      
24/7        NUM       appos     
?           PUNCT     punct     


In [18]:
# Iterate over the predicted entities
for ent in doc.ents:
    # print the entity text and its label
    print(ent)
    print(ent.text, ent.label_)
    spacy.explain(ent.label_)

24/7
24/7 CARDINAL


'Numerals that do not fall under another type'

In [19]:
# the model didn't predict "Aidan".  Manually add it
name = doc[2:3]
print('Missing entity: {}'.format(name))

Missing entity: Aidan


## matcher

### Rule-based

In [20]:
# import and initialize matcher
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

In [21]:
def basic_pattern(string):
    
    doc = nlp(string)
    matcher = Matcher(nlp.vocab)
    
    patterns = []
    
    add_items = True
    add_dict = True    
    
    while add_dict:        
        match_single = {}                    
            
        while add_items:       
            tag = input('enter tag(lowercase):  ').upper()
            if tag == '0':
                add_dict = False
                break

            string = input('enter string:  ')
            if string == 'true' or string == 'false':
                string = bool(string)

            if tag == 'POS':
                string = string.upper()
            
            match_single[tag] = string
            
            add_items = input('add more to this dict?(y/n)  ')
            if add_items == 'n':
                patterns.append(match_single)
                break
        
        continue
    
    if len(patterns) == 0:
        return
        
    for p in patterns:
        print(p)
   
    matcher.add('pattern', [patterns])
    matches = matcher(doc)
    
    
    match_strings = []
    for match_id, start, end in matches:
        matched_span = doc[start:end]
        match_strings.append(matched_span)
    print('{} matches found'.format(len(match_strings)))
    return match_strings

In [25]:
basic_pattern('The dog went there.')

enter tag(lowercase):  pos
enter string:  det
add more to this dict?(y/n)  n
enter tag(lowercase):  pos
enter string:  noun
add more to this dict?(y/n)  n
enter tag(lowercase):  pos
enter string:  verb
add more to this dict?(y/n)  n
enter tag(lowercase):  0
{'POS': 'DET'}
{'POS': 'NOUN'}
{'POS': 'VERB'}
1 matches found


[The dog went]

In [21]:
# using matcher to look for specific tokens
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]

matcher.add('FIFA_WC', [pattern])

In [22]:
doc = nlp('2018 FIFA World Cup: France won!')
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

2018 FIFA World Cup:


In [23]:
pattern = [
    {'LEMMA':'love', 'POS':'VERB'},
    {'POS':'NOUN'}
]

In [24]:
doc = nlp('I loved dogs but now I love cats more.')
    
# the matcher is more inclusive than regex because it will include 
#      words with the same root but different affixes -- i.e. 'loved'

In [25]:
# operators and quanitifiers

pattern = [
    {'LEMMA':'buy'},
    {'POS':'DET','OP':'?'},
    {'POS':'NOUN'}
]

In [26]:
basic_pattern('I bought a smartphone.  Now I\'m buying apps.')

enter tag(lowercase):  0


operators and quanitifers
(include these in the dictionary of the desired match pattern)

- {'OP':'!'} Negation - match 0 times
- {'OP':'?'} Optional - match 0 or 1 times
- {'OP':'+'} Match 1 or more times
- {'OP':'*'} Match 0 or more times

In [27]:
string = "After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper."

basic_pattern(string)
# pattern = [{'TEXT': 'iOS'}, {'IS_DIGIT': True}]

enter tag(lowercase):  0


In [28]:
string = 'Why won\'t Aidan stop eating his sleeves??!!'

basic_pattern(string)
# {'POS': 'PROPN'} matches proper nouns

enter tag(lowercase):  0


In [29]:
string = 'Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.'

basic_pattern(string)
# returns an adjective followed by one or two nouns (one noun and one optional noun)
# {'POS': 'ADJ'}
# {'POS': 'NOUN'}
# {'POS': 'NOUN', 'OP': '?'}

enter tag(lowercase):  0


### Statistical

In [30]:
matcher = Matcher(nlp.vocab)
basic_pattern('I have a Golden Retriever')[0].span.root.head.text

enter tag(lowercase):  0


TypeError: 'NoneType' object is not subscriptable

In [31]:
# phrase matcher
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
pattern = nlp('Golden Retriever')

pattern = [
    {'LOWER':'golden'},
    {'LOWER':'retriever'}
]

matcher.add('dog', [pattern])


TypeError: unhashable type: 'dict'

## Shared vocab and string storage

In [None]:
doc = nlp('My kitties Leo and Piccolo are the cutest.')


In [None]:
# strings are stored with a "hash" number.  spacy must "see" both to store it.  Lookup goes in both directions
pickles_hash = nlp.vocab.strings['Piccolo']
pickles_string = nlp.vocab.strings[pickles_hash]
pickles_lexeme = nlp.vocab['Piccolo']

In [None]:
print('Piccolo hash #: ', pickles_hash)
print('Piccolo string: ', pickles_string)
print('Piccolo lexeme\'s text: ', pickles_lexeme.text)
print('Piccolo lexeme\'s orth: ', pickles_lexeme.orth)
print('Is the Piccolo lexeme alpha? ', pickles_lexeme.is_alpha)
print('Lexemes have the same attributes as tokens, except for POS tags, dependency info, or entity labels.')

## The Doc and Span objects

In [None]:
# create an nlp object
from spacy.lang.en import English
nlp = English()

# Import the Doc class
from spacy.tokens import Doc

# doc elements
words = ['Hello','world','!']
spaces = [True, False, False] # indicate whether the word is followed by a space

# manually create doc from 3 arguments
doc = Doc(nlp.vocab, words=words, spaces=spaces)

In [None]:
# import doc and span classes
from spacy.tokens import Doc, Span # need a doc to have a span!

# create a span from a doc manually
span = Span(doc, 0, 2)

# add an optional label
labeled_span = Span(doc, 1, 3, label='CURRENT_PLANET') # convention is to write label names in caps

# add span to the doc.ents
doc.ents = [labeled_span]

#### Best practices when using Docs and Spans
- Doc and span are mainly used to hold references and relationships of words and sentences
    - convert result to string as LATE as possible
    - use token attributes if possible (like token.i for the token's index)
- Don't forget to pass in the shared vocab as the first argument

#### practice

In [None]:
words = ['Only', 'two', 'more', 'weeks', '!']
spaces = [True, True, True, False, False]

time_left = Doc(nlp.vocab, words=words, spaces=spaces)
time_left
time_left.text


In [None]:
words = ['I', 'like','Leo', 'the','Lion','and','Piccolo','the','Princess','.']
spaces = [True, True, True, True, True, True, True, True, False, False]

doc = Doc(nlp.vocab, words=words, spaces=spaces)
leo = Span(doc, 2, 5, label='LEO')
piccolo = Span(doc, 6, 9, label='PICCOLO')

print(leo.text, leo.label_)
print(piccolo.text, piccolo.label_)

In [None]:
# write my span into ents
doc.ents = [leo, piccolo]
print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
for (token, token.pos)

In [None]:
words = ['Alex', 'likes','Leo', 'the','Lion','and','Piccolo','the','Princess','.']
spaces = [True, True, True, True, True, True, True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)

for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == 'PROPN':
        # Check if the next token is a verb
        if doc[token.i + 1].pos_ == 'VERB':
            print('Found a verb after a proper noun!')

## Comparing semantic similarity

- spaCy can compare two objects (of type token, doc, or span) and predict similarity with . . .
    - Doc.similarity()
    - Span.similarity()
    - Token.similarity()
    
- return similarity score as float between 0 and 1

- TO DO THIS, YOU MUST BE USING EITHER MEDIUM OR LARGE LANGUAGE MODELS because they contain word vector data
    - CAN use: en_core_web_md (medium english model)
    - CAN use: en_core_web_md (large english model)
    - can NOT use: en_core_web_sm (small english model)

In [None]:
# load model
nlp = spacy.load('en_core_web_md')

In [None]:
doc1 = nlp('I like fast food')
doc2 = nlp('I like pizza')

# order doesn't matter
doc1.similarity(doc2)
doc2.similarity(doc1)

In [None]:
doc = nlp('I like pizza and pasta')
token1 = doc[2] # pizza
token2 = doc[4] # pasta

token1.similarity(token2)

In [None]:
# compare b/t different object types
doc = nlp('I like pizza')
token = nlp('soap')[0]
doc.similarity(token)

span = nlp('I like pizza and pasta')[2:5]  # pizza and pasta
doc = nlp('McDonalds sells burgers')
span.similarity(doc)



### Using word vectors

In [None]:
# intialize a doc
doc = nlp('I have a banana')

# access the vector via the token.vector attribute
print(doc[3].vector)

In [None]:
right_doc = nlp('I like cats')
wrong_doc = nlp('I hate cats')

right_doc.similarity(wrong_doc)

# similarity scores are subjective!  These opposite statements are rated as very similar because they express sentiment
#     about cats.  Keep in mind the objective of the program you're writing when it comes to similarity usefulness

In [None]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
print(span1)
span2 = doc[-4:-1]
print(span2)

similarity = span1.similarity(span2)
print('Similarity:', similarity)

In [32]:
dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le