### NLP Pipeline

Data Acquisition
Text Extraction & Cleanup
Pre-Processing
Feature Engineering
Model Building
Evaluation
Deployment
Monitor & Updatey

### Spacy 

In [1]:
import spacy

nlp = spacy.blank("en")
doc = nlp("Dr. Strange loves mumbai and it cost only 2$ per plate")

for token in doc:
    print(token)

Dr.
Strange
loves
mumbai
and
it
cost
only
2
$
per
plate


In [2]:
doc = nlp("Tony gave two $ to Peter.")
token0 = doc[0]
token0

Tony

In [None]:
dir(token0)

In [8]:
with open("students.txt") as f:
    text = f.readlines()

text = ' '.join(text)
doc = nlp(text)
emails = []
for token in doc:
    if token.like_email:
        emails.append(token.text)
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

### Customizing Tokenizer

In [9]:
from spacy.symbols import ORTH

nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [10]:
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"},
])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

### Sentence Tokenization or Segmentation

In [11]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [21]:
nlp.add_pipe('sentencizer')

ValueError: [E007] 'sentencizer' already exists in pipeline. Existing names: ['sentencizer']

In [22]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


### Exercise

In [45]:
import spacy

text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

nlp = spacy.blank("en")
doc = nlp(text)

data_urls = [token.text for token in doc if token.like_url]
data_urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [54]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
for token in doc:
    # print(f'int: {token.i} | string: {token}')
    if token.like_num and doc[token.i+1].is_currency:
        print(token.text, doc[token.i+1].text)   

two $
500 €
