In [None]:
import re
import spacy, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nlp = spacy.load('en_core_web_sm')
nltk.download(['stopwords', 'punkt', 'wordnet'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# 1) Regular Expression

In [None]:
# match
sent = 'Hello! Data Science Rock, what do you think about Future of Data?'

result = re.match(r'Hello', sent)
print(result.group(0))
print (result.start())
print (result.end())

result = re.match(r'Data', sent)
print(result)

result = re.search(r'Analytics', 'AV Analytics Vidhya AV')
print(result.group(0))

Hello
0
5
None
Analytics


In [None]:
# search
result = re.search(r'Data', sent)
print(result.group(0))

# findall
result = re.findall(r'Data', sent)
print(result)

Data
['Data', 'Data']


In [None]:
# split
result = re.split(r' ', sent)
print(result)

result = re.split(r' ', sent, maxsplit=2)
print(result)

['Hello!', 'Data', 'Science', 'Rock,', 'what', 'do', 'you', 'think', 'about', 'Future', 'of', 'Data?']
['Hello!', 'Data', 'Science Rock, what do you think about Future of Data?']


In [None]:
# sub, to replace
result = re.sub(r'Rock', 'is Shit!', sent)
print(result)

Hello! Data Science is Shit!, what do you think about Future of Data?


In [None]:
# regex
# https://www.analyticsvidhya.com/blog/2015/06/regular-expression-python/?utm_source=blog&utm_medium=learning-path-nlp-2020

# all alphabet
result = re.findall(r'\w+', sent)
print(result)

# consecutive two characters of each word
result = re.findall(r'\w\w', sent)
print (result)

# consecutive two characters of each word
result = re.findall(r'\b\w.', sent)
print (result)

['Hello', 'Data', 'Science', 'Rock', 'what', 'do', 'you', 'think', 'about', 'Future', 'of', 'Data']
['He', 'll', 'Da', 'ta', 'Sc', 'ie', 'nc', 'Ro', 'ck', 'wh', 'at', 'do', 'yo', 'th', 'in', 'ab', 'ou', 'Fu', 'tu', 're', 'of', 'Da', 'ta']
['He', 'Da', 'Sc', 'Ro', 'wh', 'do', 'yo', 'th', 'ab', 'Fu', 'of', 'Da']


In [None]:
# extract all characters after “@”
sent = 'abc.test@gmail.com, xyz@test.in, test.first@icloud.com'
result = re.findall(r'@\w+', sent) 
print(result)

# extract all characters after “@” with com
result = re.findall(r'@\w+.\w+', sent) 
print(result)

# extract domain name, use ()
result = re.findall(r'@\w+.(\w+)', sent) 
print(result)

['@gmail', '@test', '@icloud']
['@gmail.com', '@test.in', '@icloud.com']
['com', 'in', 'com']


In [None]:
# date from given string
sent = 'Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-01-2009'
result = re.findall(r'\d{2}-\d{2}-\d{4}', sent)
print(result)

['12-05-2007', '11-11-2011', '12-01-2009']


In [None]:
sent = 'Hello! Data Science Rock, what do you think about Future of Data?'

# word that start with vowel
result = re.findall(r'\b[aeiouAEIOU]\w+', sent)
print(result)

# not vowel
result = re.findall(r'\b[^aeiouAEIOU]\w+', sent)
print(result)

['about', 'of']
['Hello', 'Data', ' Science', ' Rock', 'what', ' do', ' you', ' think', ' about', ' Future', ' of', ' Data']


In [None]:
# return phone number
sent = '9999999999 999999-999 99999x9999 6599999999'
result = re.findall(r'[6]{1}[0-9]{9}', sent)
print(result)

['6599999999']


In [None]:
# Split a string with multiple delimiters
sent = 'asdf fjdk;afed,fjek,asdf,foo'
result = re.split(r'[;,\s]', sent)
print(result)

result= re.sub(r'[;,\s]', ' ', sent)
print(result)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']
asdf fjdk afed fjek asdf foo


In [None]:
# html
sent = '''
<tr align="center"><td>1</td> <td>Noah</td> <td>Emma</td></tr>
<tr align="center"><td>2</td> <td>Liam</td> <td>Olivia</td></tr>
<tr align="center"><td>3</td> <td>Mason</td> <td>Sophia</td></tr>
<tr align="center"><td>4</td> <td>Jacob</td> <td>Isabella</td></tr>
<tr align="center"><td>5</td> <td>William</td> <td>Ava</td></tr>
<tr align="center"><td>6</td> <td>Ethan</td> <td>Mia</td></tr>
<tr align="center"><td>7</td> <td HTML>Michael</td> <td>Emily</td></tr>
'''

result = re.findall(r'<td>\w+</td>\s<td>(\w+)</td>\s<td>(\w+)</td>', sent)
print(result)

[('Noah', 'Emma'), ('Liam', 'Olivia'), ('Mason', 'Sophia'), ('Jacob', 'Isabella'), ('William', 'Ava'), ('Ethan', 'Mia')]


# 2) Spacy Basic

**spaCy** (https://spacy.io/) is an open-source Python library that parses and "understands" large volumes of text. Separate models are available that cater to specific languages (English, French, German, etc.).

run this command to download englush module <br/>

<div class="alert alert-info"> 
    python -m spacy download en
</div>

## 2.1 Pipelines

When we run `nlp`, our text enters a *processing pipeline* that first breaks down the text and then performs a series of operations to tag, parse and describe the data.   Image source: https://spacy.io/usage/spacy-101#pipelines

In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

## 2.2 Tokenization

For a full list of POS Tags visit https://spacy.io/api/annotation#pos-tagging <br/>
    
For a full list of Syntactic Dependencies visit https://spacy.io/api/annotation#dependency-parsing
<br>A good explanation of typed dependencies can be found [here](https://nlp.stanford.edu/software/dependencies_manual.pdf)

|Tag|Description|doc2[0].tag|
|:------|:------:|:------|
|`.text`|The original word text<!-- .element: style="text-align:left;" -->|`Tesla`|
|`.lemma_`|The base form of the word|`tesla`|
|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|
|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|
|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|
|`.is_alpha`|Is the token an alpha character?|`True`|
|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|

In [None]:
doc = nlp('"We\'re moving to L.A.!"')
[token for token in doc]

[", We, 're, moving, to, L.A., !, "]

<img src="https://ashutoshtripathicom.files.wordpress.com/2020/04/tokenization.png" width="500" height='300'>

-  **Prefix**:	Character(s) at the beginning &#9656; `$ ( “ ¿`
-  **Suffix**:	Character(s) at the end &#9656; `km ) , . ! ”`
-  **Infix**:	Character(s) in between &#9656; `- -- / ...`
-  **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied &#9656; `St. U.S.`

In [None]:
paragraph = """I have three visions for India. In 3000 years of our history, people from all over 
               the world have come and invaded us, captured our lands, conquered our minds. 
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               the French, the Dutch, all of them came and looted us, took over what was ours. 
               Yet we have not done this to any other nation. We have not conquered anyone. 
               We have not grabbed their land, their culture, 
               their history and tried to enforce our way of life on them. 
               Why? Because we respect the freedom of others.That is why my 
               first vision is that of freedom. I believe that India got its first vision of 
               this in 1857, when we started the War of Independence. It is this freedom that
               we must protect and nurture and build on. If we are not free, no one will respect us.
               My second vision for India’s development. For fifty years we have been a developing nation.
               It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
               in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
               Our achievements are being globally recognised today. Yet we lack the self-confidence to
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""

In [None]:
# Tokenizing sentences
sentences = nltk.sent_tokenize(paragraph)
words = nltk.word_tokenize(paragraph)
print(sentences)
print("\n")
print(words)

['I have three visions for India.', 'In 3000 years of our history, people from all over \n               the world have come and invaded us, captured our lands, conquered our minds.', 'From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,\n               the French, the Dutch, all of them came and looted us, took over what was ours.', 'Yet we have not done this to any other nation.', 'We have not conquered anyone.', 'We have not grabbed their land, their culture, \n               their history and tried to enforce our way of life on them.', 'Why?', 'Because we respect the freedom of others.That is why my \n               first vision is that of freedom.', 'I believe that India got its first vision of \n               this in 1857, when we started the War of Independence.', 'It is this freedom that\n               we must protect and nurture and build on.', 'If we are not free, no one will respect us.', 'My second vision for India’s development.', 'For 

## 2.3 Stemming

"boat" would be the **stem** for [boat, boater, boating, boats]. <br>
spaCy doesn't include a stemmer, opting instead to rely entirely on lemmatization.

In [None]:
from nltk.stem.porter import *

### 2.3.1 Porter Stemmer

One of the most common - and effective - stemming tools is [*Porter's Algorithm*](https://tartarus.org/martin/PorterStemmer/) developed by Martin Porter in [1980](https://tartarus.org/martin/PorterStemmer/def.txt). The algorithm employs five phases of word reduction, each with its own set of mapping rules. In the first phase, simple suffix mapping rules are defined, such as:

From a given set of stemming rules only one rule is applied, based on the longest suffix S1. Thus, `caresses` reduces to `caress` but not `cares`.

More sophisticated phases consider the length/complexity of the word before applying a rule. For example:

![stemming1.png](https://miro.medium.com/max/1310/1*2cUR76NeWoPMiFsur0hQtw.png)

In [None]:
p_stemmer = PorterStemmer()
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']

for word in words:
    print(word + ' --> ' + p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


### 2.3.2 Snowball Stemmer
The algorithm offers a slight improvement over the original Porter stemmer, both in logic and speed.

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
s_stemmer = SnowballStemmer(language='english')

In [None]:
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly', 'fairness']
# words = ['generous','generation','generously','generate']
for word in words:
    print(word + ' --> ' + s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair
fairness --> fair


Lemmatization is better at doing this!

## 2.4 Lemmatization

In contrast to stemming, lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a morphological analysis to words. The lemma of 'meeting' might be 'meet' or 'meeting' depending on its use in a sentence (surrounding words).

In [None]:
# nltk
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('I am a runner running in a race because I love to run since I ran today')

'I am a runner running in a race because I love to run since I ran today'

In [None]:
# spacy
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc1:
    print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma_:{8}} {token.lemma:<{22}}')

I            PRON   -PRON-   561228191312463089    
am           AUX    be       10382539506755952630  
a            DET    a        11901859001352538922  
runner       NOUN   runner   12640964157389618806  
running      VERB   run      12767647472892411841  
in           ADP    in       3002984154512732771   
a            DET    a        11901859001352538922  
race         NOUN   race     8048469955494714898   
because      SCONJ  because  16950148841647037698  
I            PRON   -PRON-   561228191312463089    
love         VERB   love     3702023516439754181   
to           PART   to       3791531372978436496   
run          VERB   run      12767647472892411841  
since        SCONJ  since    10066841407251338481  
I            PRON   -PRON-   561228191312463089    
ran          VERB   run      12767647472892411841  
today        NOUN   today    11042482332948150395  


## 2.5 Stopwords

spaCy holds a built-in list of some 326 English stop words, while only 179 for nltk

In [None]:
# stopwords for nltk
print(len(stopwords.words('english')))

# stopwords for spacy
print(len(nlp.Defaults.stop_words))

179
326


In [None]:
print(len(nlp.Defaults.stop_words))
nlp.vocab['myself'].is_stop

326


True

In [None]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True
print(len(nlp.Defaults.stop_words))

327


In [None]:
# Remove the word from the set of stop words
nlp.Defaults.stop_words.remove('beyond')
nlp.vocab['beyond'].is_stop = False
nlp.vocab['beyond'].is_stop

False

## 2.6 Pattern Matching

In [None]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

### 2.6.1 Rule Based Matching

* `pattern1` looks for a single token whose lowercase text reads 'solarpower'
* `pattern2` looks for two adjacent tokens that read 'solar' and 'power' in that order
* `pattern3` looks for three adjacent tokens, with a middle token that can be any punctuation.<font color=green>*</font>

<font color=green>\* Remember that single spaces are not tokenized, so they don't count as punctuation.</font>
<br>Once we define our patterns, we pass them into `matcher` with the name 'SolarPower', and set *callbacks* to `None` (more on callbacks later).

In [None]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]
matcher.add('SolarPower', None, pattern1, pattern2)

The following quantifiers can be passed to the `'OP'` key:
<table><tr><th>OP</th><th>Description</th></tr>

<tr ><td><span >\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>
<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>
<tr ><td><span >\+</span></td><td>Require the pattern to match 1 or more times</td></tr>
<tr ><td><span >\*</span></td><td>Allow the pattern to match zero or more times</td></tr>
</table>

In [None]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

found_matches = matcher(doc)
print(found_matches)
[doc[start : end] for _id, start, end in found_matches]

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


[Solar Power, solarpower, Solar-power]

There are a variety of token attributes we can use to determine matching rules:
<table><tr><th>Attribute</th><th>Description</th></tr>

<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>
<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>
<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>
<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphanumeric characters, ASCII characters, digits</td></tr>
<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>
<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>
<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>
<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>
<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>

</table>

You can pass an empty dictionary `{}` as a wildcard to represent **any token**. For example, you might want to retrieve hashtags without knowing what might follow the `#` character:
>`[{'ORTH': '#'}, {}]`

In [None]:
doc = nlp("GDP in developing countries such as Vietnam will continue growing at a high rate.")
for tok in doc: 
    print(f"{tok.text:{15}} {tok.dep_:{10}} {tok.pos_} ")

GDP             nsubj      NOUN 
in              prep       ADP 
developing      amod       VERB 
countries       pobj       NOUN 
such            amod       ADJ 
as              prep       SCONJ 
Vietnam         pobj       PROPN 
will            aux        VERB 
continue        ROOT       VERB 
growing         xcomp      VERB 
at              prep       ADP 
a               det        DET 
high            amod       ADJ 
rate            pobj       NOUN 
.               punct      PUNCT 


In [None]:
# create pattern
pattern = [{'POS': 'NOUN'}, {'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}]
matcher.add("matching_1", None, pattern) 

matches = matcher(doc)
print(doc[matches[0][1] : matches[0][2]])

# define pattern
pattern = [{'DEP':'amod', 'OP':"?"}, {'POS':'NOUN'}, {'LOWER': 'such'}, {'LOWER': 'as'}, {'POS': 'PROPN'}]
matcher.add("matching_1", None, pattern) 

matches = matcher(doc)
print(doc[matches[0][1] : matches[0][2]])

countries such as Vietnam
developing countries such as Vietnam


In [None]:
doc = nlp("Here is how you can keep your car and other vehicles clean.") 
doc_2 = nlp("Here is how you can keep your car or other vehicles clean.") 

# define the pattern to match OR AND
pattern = [{'DEP':'amod', 'OP':"?"}, 
           {'POS':'NOUN'}, 
           {'LOWER': 'and', 'OP':"?"}, 
           {'LOWER': 'or', 'OP':"?"}, 
           {'LOWER': 'other'}, 
           {'POS': 'NOUN'}] 
matcher.add("matching_1", None, pattern) 

matches = matcher(doc)
print(doc[matches[0][1] : matches[0][2]])

matches = matcher(doc_2)
print(doc_2[matches[0][1] : matches[0][2]])

car and other vehicles
car or other vehicles


In [None]:
doc = nlp("A healthy eating pattern includes fruits, especially whole fruits.") 

# define the pattern to match OR AND
pattern = [{'DEP':'nummod', 'OP':"?"}, 
           {'DEP':'amod', 'OP':"?"}, 
           {'POS':'NOUN'}, 
           {'IS_PUNCT':True}, 
           {'LOWER':'especially'}, 
           {'DEP':'nummod', 'OP':"?"}, 
           {'DEP':'amod', 'OP':"?"}, 
           {'POS':'NOUN'}] 

matcher.add("matching_1", None, pattern) 

matches = matcher(doc)
print(doc[matches[0][1] : matches[0][2]])

fruits, especially whole fruits


### 2.6.2 PhraseMatcher
Alternatively, use PhraseMatcher to create a Doc object from a list of phrases, and pass that into `matcher` instead.

In [None]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

Source: https://en.wikipedia.org/wiki/Reaganomics

In [None]:
with open('data/reaganomics.txt', encoding='unicode_escape') as f:
    doc = nlp(f.read())

# print sentences
[senc for senc in doc.sents][1:3]

[https://en.wikipedia.org/wiki/Reaganomics
 ,
 Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s.]

In [None]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]
# Pass each Doc object into matcher:
matcher.add('VoodooEconomics', None, *phrase_patterns)
matches = matcher(doc)
[doc[start : end] for _id, start, end in matches]

[supply-side economics,
 trickle-down economics,
 voodoo economics,
 free-market economics,
 supply-side economics,
 trickle-down economics]

In [None]:
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2986, 2990)]

# 3) Part of Speech

* To view the coarse POS tag use `token.pos_`
* To view the fine-grained tag use `token.tag_`
* To view the description of either type of tag use `spacy.explain(tag)`

<div class="alert alert-success">Note that `token.pos` and `token.tag` return integer hash values; by adding the underscores we get the text equivalent that lives in **doc.vocab**.</div>

<table>
<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>
<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>
<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>
<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>
<tr><td>ADJ</td><td></td><td>PRP\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>
<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>
<tr><td>ADJ</td><td></td><td>WP\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>
<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>
<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>
<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>
<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>
<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>
<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>
<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>
<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>
<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>
<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>
<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>
<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>
<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>
<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>
<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>
<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>
<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>
<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>
<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>
<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>
<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>
<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>
<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>""</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>
<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>
<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>
<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>
<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>
<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>
<tr><td>SYM</td><td></td><td>\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>
<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>
<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary "be"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>HVS</td><td>forms of "have"</td><td></td></tr>
<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>
<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>
<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>
<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>
<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>
<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>
<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>
<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>
<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>
<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>
<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>
<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>
<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>
</table>

In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

The        DET      DT     determiner
quick      ADJ      JJ     adjective (English), other noun-modifier (Chinese)
brown      ADJ      JJ     adjective (English), other noun-modifier (Chinese)
fox        NOUN     NN     noun, singular or mass
jumped     VERB     VBD    verb, past tense
over       ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
lazy       ADJ      JJ     adjective (English), other noun-modifier (Chinese)
dog        NOUN     NN     noun, singular or mass
's         PART     POS    possessive ending
back       NOUN     NN     noun, singular or mass
.          PUNCT    .      punctuation mark, sentence closer


In [None]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# dependency
for token in doc:
    print(f'{token.text:{10}} {token.dep_ :{8}} {spacy.explain(token.dep_)}')

# Count the frequencies of different coarse-grained POS tags:
print('\n')
POS_counts = doc.count_by(spacy.attrs.POS)
print(POS_counts)
print(doc.vocab[90].text)

The        det      determiner
quick      amod     adjectival modifier
brown      amod     adjectival modifier
fox        nsubj    nominal subject
jumped     ROOT     root
over       prep     prepositional modifier
the        det      determiner
lazy       amod     adjectival modifier
dog        poss     possession modifier
's         case     case marking
back       pobj     object of preposition
.          punct    punctuation


{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}
DET


In [None]:
for k, v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')

84. ADJ  : 3
85. ADP  : 1
90. DET  : 2
92. NOUN : 3
94. PART : 1
97. PUNCT: 1
100. VERB : 1


In [None]:
# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k, v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

402. amod: 3
415. det : 2
429. nsubj: 1
439. pobj: 1
440. poss: 1
443. prep: 1
445. punct: 1
8110129090154140942. case: 1
8206900633647566924. ROOT: 1


# 4) Named Entity Recognition (NER)

## 4.1 NER label

In [None]:
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
from spacy import displacy

Tags are accessible through the `.label_` property of an entity.
<table>
<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>
<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>
<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>
<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>
<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>
<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>
<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>
<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>
<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>
<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>
<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>
<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>
<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>
<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>
<tr><td>`PERCENT`</td><td>Percentage, including "%".</td><td>*Eighty percent*</td></tr>
<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>
<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>
<tr><td>`ORDINAL`</td><td>"first", "second", etc.</td><td>*9th, Ninth*</td></tr>
<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>
</table>

In [None]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [None]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [None]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')
show_ents(doc)
len([ent for ent in doc.ents if ent.label_=='MONEY'])

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


2

## 4.2 Redefining Entity (useless)

In [None]:
# tesla supposed to be ORG instead of ordinal
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
# Create a Span for the new entity, Tesla
new_ent = Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])
doc.ents = list(doc.ents[1:]) + [new_ent]
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit


In [None]:
# add new entity, vacuum cleanner
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [None]:
# Create the desired phrase patterns for vacuum cleaner:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]
matcher.add('newproduct', None, *phrase_patterns)
matches = matcher(doc)

PROD = doc.vocab.strings[u'PRODUCT']
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in matches]
doc.ents = list(doc.ents) + new_ents
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


## 4.3 Creating New NER On top of current model

In [None]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [None]:
# Testing the model, Alto wasn't labelled
doc = nlp("I was driving a Alto")
show_ents(doc)

Alto - LOC - Non-GPE locations, mountain ranges, bodies of water


In [None]:
# create training data
ner = nlp.get_pipe("ner")
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

In [None]:
# Adding labels to the "ner" pipelines
for _, annotations in TRAIN_DATA:
    ner.add_label(annotations.get("entities")[0][2])

# Disable pipeline so the remaining components are not affected
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

    # 30 iterations
    for iteration in range(30):
        # shuufling examples  before every iteration
        random.shuffle(TRAIN_DATA)
        losses = {}
        
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            # update parameters
            nlp.update(texts, annotations, drop=0.5, losses=losses,)
            print("Losses", losses)

Losses {'ner': 4.7075730151264}
Losses {'ner': 11.291058545803935}
Losses {'ner': 11.410128636593981}
Losses {'ner': 15.288225471350415}
Losses {'ner': 17.025395761742402}
Losses {'ner': 7.68321802187711}
Losses {'ner': 7.951826624354567}
Losses {'ner': 11.695518680342161}
Losses {'ner': 14.532071038554221}
Losses {'ner': 16.25689758299015}
Losses {'ner': 2.812831834075041}
Losses {'ner': 6.58895729429787}
Losses {'ner': 9.937370880101298}
Losses {'ner': 13.274950128803539}
Losses {'ner': 14.139055101210033}
Losses {'ner': 3.937129848971381}
Losses {'ner': 5.350726642689551}
Losses {'ner': 7.185156038312925}
Losses {'ner': 10.40546959690073}
Losses {'ner': 11.822959143542903}
Losses {'ner': 0.03259142026581685}
Losses {'ner': 1.8550721893370792}
Losses {'ner': 6.219412686819851}
Losses {'ner': 7.896257873297145}
Losses {'ner': 7.896804773854001}
Losses {'ner': 3.5918119903644765}
Losses {'ner': 4.489453857058212}
Losses {'ner': 4.498348061453104}
Losses {'ner': 4.961513775043674}
Losse

In [None]:
# Testing the model, Alto wasn't in the trained data
doc = nlp("I was driving a Alto")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Alto', 'PRODUCT')]


In [None]:
# Save the  model to directory
output_dir = Path('./data/improved_ner')
nlp.to_disk(output_dir)

# Load the saved model and predict
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Fridge', 'PRODUCT'), ('FlipKart', 'ORG')]


## 4.4 New NER Model Training

In [None]:
nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.begin_training()
ner = nlp.get_pipe('ner')

In [None]:
# all empty
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
print(ner.move_names)
show_ents(doc)

['O']
No named entities found.


In [None]:
# Training examples in the required format
LABEL = "FOOD"

TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}), 
              ("I love to eat Salmon.", {"entities": [(14, 20, "FOOD")]}),
              ("Seafood is my love", {"entities": [(0, 7, "FOOD")]}),
              ("John and I were eating Prata yesterday", {"entities": [(23, 28, "FOOD")]}),
              ("My mom always fry chicken for me", {"entities": [(18, 25, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "FOOD")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [None]:
# Add the new label to ner
ner.add_label(LABEL)
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [None]:
with nlp.disable_pipes(*other_pipes) :
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations     
    for itn in range(30):
        
        # shuffle examples before training
        random.shuffle(TRAIN_DATA)

        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        losses = {}
        
        for batch in batches:
            texts, annotations = zip(*batch)
            # Calling update() over the iteration
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

Losses {'ner': 4.333333536982536}
Losses {'ner': 8.34936510026455}
Losses {'ner': 13.678405687212944}
Losses {'ner': 18.483883425593376}
Losses {'ner': 23.298324689269066}
Losses {'ner': 29.305401608347893}
Losses {'ner': 33.89787818491459}
Losses {'ner': 36.488719031214714}
Losses {'ner': 40.99931572377682}
Losses {'ner': 44.28863982856274}
Losses {'ner': 47.41815260052681}
Losses {'ner': 49.36406448483467}
Losses {'ner': 52.46723387390375}
Losses {'ner': 55.458259373903275}
Losses {'ner': 57.32397414371371}
Losses {'ner': 61.18454149737954}
Losses {'ner': 62.41640319302678}
Losses {'ner': 65.99534492357634}
Losses {'ner': 2.1427696174941957}
Losses {'ner': 4.965696997474879}
Losses {'ner': 7.617429203237407}
Losses {'ner': 9.54202292021364}
Losses {'ner': 12.064622287638485}
Losses {'ner': 13.400128441862762}
Losses {'ner': 17.26184952018957}
Losses {'ner': 18.372560377625632}
Losses {'ner': 19.7250577408995}
Losses {'ner': 22.545481929992093}
Losses {'ner': 24.095736923190998}
Losse

In [None]:
test_text = "I ate Sushi yesterday. Maggi is a common fast food "
doc = nlp(test_text)
show_ents(doc)

Sushi - FOOD - None
Maggi - FOOD - None


# 5) Noun Chunk

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
# noun chuncks
doc = nlp(u"Red cars do not carry higher insurance rates.")
[chunk for chunk in doc.noun_chunks]

[Red cars, higher insurance rates]

https://spacy.io/usage/visualizers

In [None]:
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 80})

In [None]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)

# 6) Sentence Segmentation

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [None]:
# not able to separate sentece based on ;
doc = nlp(u'"Management is doing things right; leadership is doing the right things;" -Peter Drucker')
list(doc.sents)

["Management is doing things right; leadership is doing the right things;" -Peter Drucker]

In [None]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [None]:
doc = nlp(u'"Management is doing things right; leadership is doing the right things;" -Peter Drucker')
[sent for sent in doc.sents]

["Management is doing things right;,
 leadership is doing the right things;,
 " -Peter Drucker]

In [None]:
# alternative to changing of rule
# reset to the original
nlp = spacy.load('en_core_web_sm')

# SPACY DEFAULT BEHAVIOR:
doc = nlp(u"This is a sentence. This is another.\n\nThis is a \nthird sentence.")

for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']


In [None]:
# CHANGING THE RULES
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    start = 0
    seen_newline = False
    for word in doc:
        if seen_newline:
            yield doc[start : word.i]
            start = word.i
            seen_newline = False
        elif word.text.startswith('\n'): # handles multiple occurrences
            seen_newline = True
    yield doc[start:]      # handles the last group of tokens

sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
nlp.add_pipe(sbd)
for sent in doc.sents:
    print([token.text for token in sent])

['This', 'is', 'a', 'sentence', '.']
['This', 'is', 'another', '.', '\n\n']
['This', 'is', 'a', '\n', 'third', 'sentence', '.']
