# spaCy Demo
spaCy is a library for advanced Natural Language Processing in Python and Cython.
It's built on the very latest research, and was designed from day one to be used in real products.

In [1]:
# initialization
import spacy

## 1. Tokenization of a document using spaCy

#### Basic Tokenization

In [2]:
# We would be separating a general document of string into various tokens
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

nlp = English()
#Creating a blank Tokenizer with just the English vocabulary
tokenizer = Tokenizer(nlp.vocab) 
tokens = tokenizer("This is a demo-string")
for token in tokens:
    print(token)

This
is
a
demo-string


#### Adding special cases while Tokenizing

In [6]:
from spacy.symbols import ORTH
print("Before adding special cases")
print([w.text for w in nlp("gimme that")])

# Defining new rules
special_case = [{ORTH: "gim"}, {ORTH: "me"}]

# adding new rules to the tokenizer
nlp.tokenizer.add_special_case("gimme", special_case)

print("\nAfter adding special case")
print([w.text for w in nlp("gimme that")])

Before adding special cases
['gimme', 'that']

After adding special case
['gim', 'me', 'that']


## 2. Getting Parts of Speech using spaCY
For this part we will be using **spaCY's pre-trained** model. We will subject a paragraph for tokenization for this purpose.
Following is the lise of all UPOS (Universal Parts of Speech) Symbols

* ADJ: adjective
* ADV: adverb
* AUX: auxiliary verb
* NOUN: noun
* NUM: numeral
* PART: particle
* PRON: pronoun
* PROPN: proper noun
* VERB: verb

In [9]:
# load model
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)

#define a dictionary of POS descriptions
pos_text = {"ADJ": "adjective", "ADV": "adverb","AUX": "auxiliary verb",
            "NOUN": "noun","NUM": "numeral","PART": "particle",
            "PRON": "pronoun","PROPN": "proper noun","VERB": "verb"}

for token in doc:
    if token.pos_ in pos_text:
        print(token.text, pos_text[token.pos_], sep=" : ")

Apple : proper noun
is : auxiliary verb
looking : verb
buying : verb
U.K. : proper noun
startup : noun
1 : numeral
billion : numeral


In [5]:
## 3. Named entity recognition
spaCy features an extremely fast statistical entity recognition system, that assigns labels to contiguous spans of tokens.
The default model identifies a variety of named and numeric entities, including companies, locations, organizations and products. You can add arbitrary classes to the entity recognition system, and update the model with new examples.

Google : ORG
Microsoft : ORG
Bing : ORG
$787.6 billion : MONEY


In [6]:
# Example 1
text = "Google surpassed Microsoft's Bing search engine and is now worth $787.6 billion"
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_, sep=" : ")

India:GPE
$12 billion:MONEY
US:GPE


In [9]:
# Example 2
text = "India currently has a GDP of $12 billion and it aims to overcome US some day"
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_, sep=" : ")

Jeff Bezos : PERSON
Amazon : ORG
Airports : ORG
the J.F.K. International Airport : FAC
Amazon : ORG
Amazon : ORG
15% : PERCENT
English : NORP
first : ORDINAL
22nd August 2019 : DATE
10 pm : TIME


In [None]:
# Example 3
text = "Jeff Bezos is trying to sponsor Amazon in Airports, such as the J.F.K. International Airport." \
    "Amazon is also trying to enter the food market by selling groceries, fruits etc." \
    "Currently Amazon holds 15% of the food market." \
    "This was a part of my report for the English essay." \
    "I got the first prize for submitting this report" \
    "The competition was held on 22nd August 2019 and I got the prize by 10 pm"
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_, sep=" : ")

In [3]:
## 4. Word Vectors and Cosine Similarity

dog True 19.702013 True
can True 19.43369 True
banana True 17.848612 True


In [4]:
nlp = spacy.load("en_core_web_sm")
tokens = nlp("dog can banana")
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog dog 1.0
dog can 0.086810715
dog banana 0.34545124
can dog 0.086810715
can can 1.0
can banana 0.019619917
banana dog 0.34545124
banana can 0.019619917
banana banana 1.0


  This is separate from the ipykernel package so we can avoid doing imports until


for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

## 5. Training a custom model on spaCy

In [None]:
import random
train_data = [
    ("Uber blew through $1 million a week", {"entities" : [(0, 4, 'ORG')]}),
    ("Android Pay expands to Canada", {"entities": [(0, 11, 'PRODUCT'), (23, 30, 'GPE')]}),
    ("Spotify steps up Asia expansion", {"entities":[(0, 8, "ORG"), (17, 21, "LOC")]}),
    ("Google Maps launches location sharing",{"entities": [(0, 11, "PRODUCT")]}),
    ("Google rebrands its business apps", {"entities":[(0, 6, "ORG")]}),
    ("look what i found on google!", {"entities":[(21, 27, "PRODUCT")]})
]

blank_model = spacy.blank("en")
optimizer = blank_model.begin_training()
for i in range(20):
    random.shuffle(train_data)
    for text, annotations in train_data:
        blank_model.update([text], [annotations], sgd=optimizer)

# Saving the trained model
blank_model.to_disk('//projects//nlp-hands-on//DigitalAssignment1//model')

## 6. Visualizers

In [2]:
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence")
displacy.render(doc, style="dep", jupyter=True)

In [16]:
text = "The quick brown fox jumps over the lazy dog"

# general case
displacy.render(nlp(text), style="dep", jupyter=True)

# using user-defined options
displacy.render(nlp(text), style="dep", jupyter=True, options={"distance": 100})

### Visualizing Named Entities in a sentence

In [None]:
text = "Jeff Bezos is trying to sponsor Amazon in Airports, such as the J.F.K. International Airport." \
    "Amazon is also trying to enter the food market by selling groceries, fruits etc." \
    "Currently Amazon holds 15% of the food market." \
    "This was a part of my report for the English essay." \
    "I got the first prize for submitting this report" \
    "The competition was held on 22nd August 2019 and I got the prize by 10 pm"
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)

In [4]:
text = "Jeff Bezos is trying to sponsor Amazon in Airports, such as the J.F.K. International Airport." \
    "Amazon is also trying to enter the food market by selling groceries, fruits etc." \
    "Currently Amazon holds 15% of the food market." \
    "This was a part of my report for the English essay." \
    "I got the first prize for submitting this report" \
    "The competition was held on 22nd August 2019 and I got the prize by 10 pm"
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)