In [19]:
import pandas as pd
import glob, re

import spacy
from spacy.matcher import Matcher
from spacy import displacy
import visualise_spacy_tree

from IPython.display import Image, display

nlp = spacy.load('en_core_web_sm')
# load the English language model
# nlp = spacy.load('en_core_web_sm', disable=['ner','textcat'])

In [4]:
# Folder Path
# folders = glob.glob('data/data-text')

# Dataframe
# df = pd.DataFrame(columns={'original','Sentence'})

# Read in all text files into DF
# df = pd.read_csv('../data/data-text/01 Antonet - Four-Canals-Carocci.txt',sep=" ",header=None)
# what is happening here is that each word is split into separate cell in df
# this is not what I want!

In [78]:
text = "An engineer had to plan the construction of an artificial lake to produce electric energy. To feed the lake he thought to build a unique wide canal collecting water coming from a near valley. However, a mason pointed out that during the flood periods the stream of water flowing along the canal might be too strong and might damage the surrounding areas; by contrast, during the drought periods a unique stream of water might be insufficient to feed the lake. In order to avoid these mishaps, the mason suggested to build, instead of a unique wide canal, four small canals whose total flow was the same as the unique wide canal previously planned. These small canals were placed around the lake so that they conveyed water coming from four different valleys. In this way only small amounts of water could flow in each canal and thus during flood periods dangerous overflowing might not occur. At the same time, the lake was fed by water from various belts, so that also during drought periods it was sufficiency that the fed."

In [69]:
# Run the spaCy pipeline over `text` and collect one list per token attribute.
doc = nlp(text)
# The two bare `type(...)` expressions below are not the last line of the cell,
# so nothing is displayed for them; the trailing comments record the types.
type(doc[0].text) # str
type(doc[0]) # spacy.tokens.token.Token
# Surface form of every token
word_in_text = [token.text for token in doc]
# Coarse-grained part-of-speech tags
# (the variable name is a typo of "coarse"; left as is because other cells may use it)
coares_grained_POS = [token.pos_ for token in doc]
# Fine-grained part-of-speech tags
fine_grained_POS = [token.tag_ for token in doc]
# Syntactic dependencies (predicted by statistical model)
# Dependency labels
dep_labels = [token.dep_ for token in doc]
# Syntactic head token (governor)
governor = [token.head.text for token in doc]


In [74]:
# Named entities of the document processed in the previous cell
# (predicted by statistical model)
ner = [(ent.text, ent.label_) for ent in doc.ents]

# Keep this example sentence under its own name. Re-assigning the notebook-wide
# `text` here would make every later cell that calls `nlp(text)` (sentence
# splitting, noun chunks, word vectors) silently run on this sentence instead
# of the lake passage when the notebook is executed top to bottom.
apple_text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(apple_text)

# One row per token: text, part-of-speech tag and dependency label,
# left-aligned in fixed-width columns.
for token in doc:
    print('{:<12}{:<10}{:<10}'.format(token.text, token.pos_, token.dep_))

print('\n')
# Iterate over the predicted entities and print each entity text with its label
for ent in doc.ents:
    print(ent.text, ent.label_)

It          PRON      nsubj     
’s          VERB      ccomp     
official    ADJ       dobj      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [75]:
# Dedicated name for this example sentence, so the notebook-wide `text`
# (the lake passage used by later `nlp(text)` cells) is not overwritten.
iphone_text = "New iPhone X release date leaked as Apple reveals pre-orders by mistake"

# Process the text
doc = nlp(iphone_text)

# Iterate over the entities and print the entity text and label
for ent in doc.ents:
    print(ent.text, ent.label_)

# Get the span for "iPhone X" (tokens 1 and 2), which the model did not tag
iphone_x = doc[1:3]

# Print the span text
print('Missing entity:', iphone_x.text)

Apple ORG
Missing entity: iPhone X


## Pre-processing 
### Clean text

In [6]:
# Function for cleaning text
# No lemmatization or change of caps, as it could change the POS tag of a word.
def clean(text):
    """Remove newlines, possessive "'s", hyphens and double quotes from ``text``.

    Parameters:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text.
    """
    text = str(text)
    # Removing new line characters. A separate "\n " rule is not needed:
    # once every "\n" is gone it could never match.
    text = text.replace('\n', '')
    # Removing possessive 's
    text = text.replace("'s", '')
    # Re-join words that were hyphenated across a break
    # ("construc- tion" -> "construction"). This has to run BEFORE the generic
    # hyphen rule below; in the opposite order no "- " is left to match.
    text = text.replace('- ', '')
    # Remaining hyphens become spaces ("well-known" -> "well known")
    text = text.replace('-', ' ')
    # Removing quotation marks
    text = text.replace('"', '')
    return text


The data I was given is already more or less clean, so the step above may not be necessary.

## Split text into different sentences!
Splitting the text into separate sentences will allow us to extract information from each sentence.

In [79]:
# Parse whatever string `text` currently holds
doc = nlp(text)
# doc.sents is a generator that yields sentence spans;
# the list is the cell's last expression, so it is displayed as the output
[sent.text for sent in doc.sents]

['An engineer had to plan the construction of an artificial lake to produce electric energy.',
 'To feed the lake he thought to build a unique wide canal collecting water coming from a near valley.',
 'However, a mason pointed out that during the flood periods the stream of water flowing along the canal might be too strong and might damage the surrounding areas; by contrast, during the drought periods a unique stream of water might be insufficient to feed the lake.',
 'In order to avoid these mishaps, the mason suggested to build, instead of a unique wide canal, four small canals whose total flow was the same as the unique wide canal previously planned.',
 'These small canals were placed around the lake so that they conveyed water coming from four different valleys.',
 'In this way only small amounts of water could flow in each canal and thus during flood periods dangerous overflowing might not occur.',
 'At the same time, the lake was fed by water from various belts, so that also duri

In [30]:
def sentence(text):
    """Split ``text`` into sentences, print each one and return them.

    Parameters:
        text (str): Text to segment with the notebook-level ``nlp`` pipeline.

    Returns:
        list[str]: The sentences, stripped of surrounding whitespace, in
        document order.
    """
    splitted = []
    for sent in nlp(text).sents:
        # Use `Span.text` rather than `Span.string`: `.string` is no longer
        # available in spaCy v3, and after strip() both give the same result.
        stripped = sent.text.strip()
        splitted.append(stripped)
        print(stripped)
    return splitted
splitted = sentence(text)

An engineer had to plan the construction of an artificial lake to produce electric energy.
To feed the lake he thought to build a unique wide canal collecting water coming from a near valley.
However, a mason pointed out that during the flood periods the stream of water flowing along the canal might be too strong and might damage the surrounding areas; by contrast, during the drought periods a unique stream of water might be insufficient to feed the lake.
In order to avoid these mishaps, the mason suggested to build, instead of a unique wide canal, four small canals whose total flow was the same as the unique wide canal previously planned.
These small canals were placed around the lake so that they conveyed water coming from four different valleys.
In this way only small amounts of water could flow in each canal and thus during flood periods dangerous overflowing might not occur.
At the same time, the lake was fed by water from various belts, so that also during drought periods it was 

## Base noun phrases (needs the tagger and parser)

In [80]:
# Smaller example to try instead:
# doc = nlp("I have a brown car")
doc = nlp(text)
# doc.noun_chunks is a generator that yields spans;
# collect the text of each base noun phrase for display
[chunk.text for chunk in doc.noun_chunks]

['An engineer',
 'the construction',
 'an artificial lake',
 'electric energy',
 'the lake',
 'he',
 'a unique wide canal collecting water',
 'a near valley',
 'a mason',
 'the flood periods',
 'the stream',
 'water',
 'the canal',
 'the surrounding areas',
 'contrast',
 'a unique stream',
 'water',
 'the lake',
 'order',
 'these mishaps',
 'the mason',
 'a unique wide canal',
 'four small canals',
 'whose total flow',
 'the unique wide canal',
 'These small canals',
 'the lake',
 'they',
 'water',
 'four different valleys',
 'this way',
 'only small amounts',
 'water',
 'each canal',
 'flood periods dangerous overflowing',
 'the same time',
 'the lake',
 'water',
 'various belts',
 'drought periods',
 'it']

## Label explanations

In [1]:
# A cell only displays the value of its last expression, so five bare
# `spacy.explain(...)` calls would show just the final lookup. Build one
# mapping instead so every label's description is visible.
# (A label that spaCy's glossary does not know maps to None.)
{label: spacy.explain(label) for label in ('NN', 'GPE', 'ORG', 'PP', 'PREP')}

NameError: name 'spacy' is not defined

## Information Extraction
Noun-Verb-Noun Phrases 

In [16]:
# function for rule 1: noun(subject), verb, noun(object)
def rule1(text):
    """Extract "<subject> <verb lemma> <object>" phrases from ``text``.

    For every token tagged ``VERB``, each noun / proper-noun / pronoun subject
    (``nsubj`` or ``nsubjpass``) among the verb's left children is combined
    with each noun / proper-noun direct object (``dobj``) among its right
    children.

    Parameters:
        text (str): Text to parse with the notebook-level ``nlp`` pipeline.

    Returns:
        list[str]: One phrase per (subject, object) pair, in document order.
    """
    doc = nlp(text)
    sent = []

    for token in doc:
        # only verbs can anchor a subject-verb-object phrase
        if token.pos_ != 'VERB':
            continue

        # noun or proper-noun direct objects to the right of the verb
        objects = [
            child.text
            for child in token.rights
            if child.dep_ == 'dobj' and child.pos_ in ('NOUN', 'PROPN')
        ]

        # noun, proper-noun or pronoun subjects to the left of the verb
        for child in token.lefts:
            if child.dep_ in ('nsubj', 'nsubjpass') and child.pos_ in ('NOUN', 'PROPN', 'PRON'):
                # Build a fresh phrase for every pair. A single accumulator
                # shared across subjects/objects would glue a second subject
                # onto the previous phrase and keep growing with each object.
                for object_text in objects:
                    sent.append(' '.join((child.text, token.lemma_, object_text)))
    return sent

In [54]:
# function for rule 1: noun(subject), verb, noun(object)
def rule1(text):
    """
    Collect the nouns and verbs of a sentence, in document order.

    NOTE: this definition replaces any `rule1` defined earlier in the notebook.
    It does not pair subjects with objects; it only gathers the tokens whose
    coarse part-of-speech tag is NOUN or VERB, printing each one as it goes
    and the full list at the end.

    Parameters:
        text (str): Single string with a sentence

    Returns:
        sent (list): spaCy tokens tagged NOUN or VERB
    """

    doc = nlp(text)
    sent = []

    for token in doc:
        if token.pos_ in ('NOUN', 'VERB'):
            print(token)
            sent.append(token)

    print(sent)
    # Hand the list back to the caller: the docstring promises a result, and
    # without this line the function would return None.
    return sent

rule1('An engineer had to plan the construction of an artificial lake to produce electric energy.')


engineer
plan
construction
lake
produce
energy
[engineer, plan, construction, lake, produce, energy]


## Visualizing

In [87]:
# displacy.render(doc, style="dep")

In [88]:
displacy.render(doc, style='ent')

## Word vectors and similarity
- To use word vectors, you need to install one of the larger models ending in `md` or `lg`, for example `en_core_web_md`.

## Comparing similarity

In [89]:
# Two short documents that differ only in their last word
doc1 = nlp("I like cats")
doc2 = nlp("I like dogs")
# Compare 2 documents
# NOTE(review): `nlp` is loaded from 'en_core_web_sm' earlier in this notebook,
# while the note above says word vectors need an md/lg model - confirm that
# the similarity scores shown here are meaningful.
doc1.similarity(doc2)

0.881463757291801

In [90]:
# Compare 2 tokens
doc1[2].similarity(doc2[2])

0.73113453

In [91]:
# Compare tokens and spans
doc1[0].similarity(doc2[1:3])

0.020106245

## Accessing word vectors

In [93]:
# Parse whatever string `text` currently holds
doc = nlp(text)
# Vector of the third token, as a numpy array
doc[2].vector

array([ 0.7485528 , -0.5302772 , -0.28065848,  1.6679082 , -2.5105572 ,
       -0.69197637,  2.3380795 ,  2.8330586 ,  0.24699098,  1.1241426 ,
       -0.36723453,  1.8007439 , -3.748637  ,  1.5548012 ,  1.6259949 ,
       -2.7272515 ,  0.8689956 , -4.114587  ,  0.99831045,  2.4935718 ,
       -0.7050187 ,  0.04261035, -1.9289498 , -0.6119052 , -0.86213654,
        2.5025008 , -1.6169056 , -2.8115811 ,  0.7515799 , -3.806582  ,
        2.930271  , -3.2458625 ,  0.35243738,  2.603886  , -4.441632  ,
        2.7368166 , -2.590376  ,  0.871832  , -1.7094355 ,  1.9728762 ,
        1.351961  ,  3.2549381 ,  2.276856  , -0.27092087, -0.03417099,
        0.3526801 ,  0.89299   ,  1.2564688 ,  0.16750032,  0.01128793,
        6.106284  , -0.4530599 , -0.0590274 ,  0.9944489 ,  3.5443711 ,
        1.2056826 ,  2.0902636 ,  1.393859  ,  1.638968  , -1.2946205 ,
       -0.14186192, -5.1092973 , -3.9598703 , -1.2180623 , -3.8225663 ,
        3.1138935 , -1.486633  , -1.4576025 ,  0.80167985, -2.52

In [94]:
# The L2 norm of the third token's vector
doc[2].vector_norm

21.839558

## Pipeline components
Functions that take a Doc object, modify it and return it.

### Pipeline information

In [96]:
# Load the model again; this rebinds the notebook-wide `nlp`
nlp = spacy.load("en_core_web_sm")
# Names of the pipeline components, in processing order
nlp.pipe_names

['tagger', 'parser', 'ner']

In [97]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1234c0370>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1234ccb80>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1234cce20>)]

### Custom component

In [99]:
# Function that modifies the doc and returns it
def custom_component(doc):
    """Placeholder pipeline component: print a notice and hand ``doc`` back unchanged."""
    notice = "Do something to the doc here!"
    print(notice)
    return doc

# Add the component first in the pipeline.
# Guarded so that re-running this cell does not try to register a second
# component under the same name, which add_pipe rejects. When no explicit
# name is passed, the function's own name is the component name.
if custom_component.__name__ not in nlp.pipe_names:
    nlp.add_pipe(custom_component, first=True)

Components can be added first, last (default), or before or after an existing component.

## Extension Attributes
Custom attributes that are registered on the global Doc, Token and Span classes and become available as ._.

In [100]:
from spacy.tokens import Doc, Token, Span
doc = nlp("The sky over Guwahati is blue")

Do something to the doc here!


## Attribute extensions (with default value)

In [102]:
# Register custom attribute on Token class.
# force=True makes the cell re-runnable: registering a name that already
# exists raises an error otherwise.
Token.set_extension("is_color", default=False, force=True)
# Override the default on a single token: index 5 of the current `doc`
doc[5]._.is_color = True

## Property Extension (with getter & setter)

In [104]:
def get_reversed(doc):
    """Return the document's text reversed character by character."""
    return doc.text[::-1]


# Register custom attribute on Doc class.
# force=True makes the cell re-runnable: registering a name that already
# exists raises an error otherwise.
Doc.set_extension("reversed", getter=get_reversed, force=True)
# Compute value of extension attribute with getter
doc._.reversed

'eulb si itahawuG revo yks ehT'

## Method extension (callable method)

In [106]:
def has_label(span, label):
    """Return True when the span's entity label equals ``label``."""
    return span.label_ == label


# Register custom attribute on Span class.
# force=True makes the cell re-runnable: registering a name that already
# exists raises an error otherwise.
Span.set_extension("has_label", method=has_label, force=True)
# Compute value of extension attribute with method
doc[3:5]._.has_label("GPE")

False

## Rule-based matching
### Using the Matcher

In [108]:
# Import the Matcher class
from spacy.matcher import Matcher
# Matcher is initialized with the shared vocab
matcher = Matcher(nlp.vocab)
# Each dict represents one token and its attributes
pattern = [{"LOWER": "new"}, {"LOWER": "york"}]
# Add with ID, optional callback and pattern(s)
matcher.add('CITIES', None, pattern)
# Match by calling the matcher on a Doc object
doc = nlp("I live in New York")
matches = matcher(doc)
# Matches are (match_id, start, end) tuples
for match_id, start, end in matches:
     # Get the matched span by slicing the Doc
     span = doc[start:end]
     print(span.text)

Do something to the doc here!
New York


In [110]:
doc = nlp("After making the iOS update you won't notice a radical system-wide redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of iOS 11's furniture remains the same as in iOS 10. But you will discover some tweaks once you delve a little deeper.")

# Start from a fresh matcher for this example. Re-using the matcher from the
# previous cell would keep its patterns registered, so their matches would be
# counted and printed below as well.
matcher = Matcher(nlp.vocab)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact token "iOS" followed by a token made of digits
pattern = [{'TEXT': 'iOS'}, {'IS_DIGIT': True}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('IOS_VERSION_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Do something to the doc here!
Total matches found: 3
Match found: iOS 7
Match found: iOS 11
Match found: iOS 10


In [111]:
doc = nlp("i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack it... do I also need to download Winzip?")

# Start from a fresh matcher for this example. Re-using the matcher from the
# previous cells would keep their patterns registered, so their matches would
# be counted and printed below as well.
matcher = Matcher(nlp.vocab)

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{'LEMMA': 'download'}, {'POS': 'PROPN'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('DOWNLOAD_THINGS_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Do something to the doc here!
Total matches found: 2
Match found: downloaded Fortnite
Match found: downloading Minecraft


In [112]:
doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

# Start from a fresh matcher for this example. Re-using the matcher from the
# previous cells would keep their patterns registered, so their matches would
# be counted and printed below as well.
matcher = Matcher(nlp.vocab)

# Write a pattern for adjective plus one or two nouns
# ('OP': '?' makes the second noun optional)
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP': '?'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('ADJ_NOUN_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Do something to the doc here!
Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses


### Token patterns

In [113]:
# A form of "love" followed by "cats" in any casing:
# "love cats", "loving cats", "loved cats"
pattern1 = [{"LEMMA": "love"}, {"LOWER": "cats"}]
# A number-like token followed by the exact text "people":
# "10 people", "twenty people"
pattern2 = [{"LIKE_NUM": True}, {"TEXT": "people"}]
# An optional determiner followed by a noun:
# "book", "a cat", "the sea"
pattern3 = [{"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]