# Spacy
spaCy is an open-source natural language processing library.
For some specific tasks spaCy is more efficient and faster, at the cost of the user not being able to choose the algorithmic implementation.

In [3]:
# Import the spaCy library.
import spacy

In [2]:
# Load the language library. 'en_core_web_sm' is the small version of the language library.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Create a Doc (document) object.
# The u prefix marks the literal as a unicode string.
# Using the loaded language library ('en_core_web_sm'), nlp parses the entire
# string into separate components.
# Each word becomes a token.
doc= nlp(u'Tesla is looking at buying U.S. Startup for $6 million')


In [7]:

# Print the text of every token in the Doc, one per line.
for token in doc:
    print(token.text)

Tesla
is
looking
at
buying
U.S.
Startup
for
$
6
million


In [8]:
# Parts of Speech (POS)
# token.pos (no trailing underscore) prints as an integer ID, as the output below shows.

for token in doc:
    print(token.text,token.pos)

Tesla 96
is 87
looking 100
at 85
buying 100
U.S. 96
Startup 96
for 85
$ 99
6 93
million 93


In [9]:
# Parts of Speech (POS)
# pos_ --> the part of speech as a readable label, e.g. proper noun, verb, number.
for token in doc:
    print(token.text,token.pos_)

Tesla PROPN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
Startup PROPN
for ADP
$ SYM
6 NUM
million NUM


In [11]:
# Syntactic dependencies
# dep_ --> the syntactic dependency label of each token.
for token in doc:
    print(token.text,token.dep_)

Tesla nsubj
is aux
looking ROOT
at prep
buying pcomp
U.S. compound
Startup dobj
for prep
$ quantmod
6 compound
million pobj


# PipeLine Object
1. tagger
2. parser
3. ner (named entity recognition)

In [12]:
# Show the pipeline components as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x20e2657b288>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x20e26459ca8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x20e26459c48>)]

In [13]:
# Show only the names of the pipeline components.
nlp.pipe_names

['tagger', 'parser', 'ner']

# Tokenization
The first step in processing any text is to split it up into its component parts, i.e. words and punctuation, which are called tokens.

In [15]:
# A sentence containing a contraction ("isn't") and a trailing period.
doc2= nlp(u"Tesla isn't looking startups anymore.")

In [16]:
# Part of speech (pos_) and syntactic dependency (dep_) for each token.
# In the output, "isn't" is split into two tokens ("is" and "n't") and the
# final period is a token of its own.

for token in doc2:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
startups NOUN dobj
anymore ADV advmod
. PUNCT punct


In [17]:
# Putting a lot of extra spaces into the text.
doc3 = nlp(u"Tesla isn't       looking startups anymore.")

In [19]:
# The run of extra spaces appears in the output as a token of its own, tagged SPACE.
# The dependency labels also differ from the doc2 output.
for token in doc3:
    print(token.text,token.pos_,token.dep_)

Tesla PROPN nsubj
is AUX ROOT
n't PART neg
       SPACE 
looking VERB amod
startups NOUN attr
anymore ADV advmod
. PUNCT punct


# Using Indexing to Grab Tokens Individually

In [20]:
# Index into the Doc to grab a single token.
doc2[0]

Tesla

In [21]:
doc[0].pos_ # --> part of speech

'PROPN'

In [22]:
doc[0].dep_ # --> syntactic dependency label

'nsubj'

In [23]:
doc[0].tag_ # --> the detailed part-of-speech tag

'NNP'

In [24]:
doc[0].shape_ # --> the word shape: capitalization, punctuation, digits

'Xxxxx'

In [25]:
doc[0].is_stop # --> is the token part of a stop list (stop words)?

False

# SPAN
Sometimes Doc objects can be very large, so we can work with a Span instead.

A Span is a slice of a Doc object, with a start and a stop.

Form: doc[start:stop]

In [42]:
# A longer text, used by the following cells to demonstrate Span slicing.
# NOTE(review): the opening quote before Beautiful Boy looks mistyped as ':' and
# there is no space after the comma where the first line continues. The text is
# left unchanged: correcting it would change tokenization and shift the token
# indices used by doc4[16:30].
doc4 = nlp(u'Although commonly attributed to John Lennon from his song :Beautiful Boy",\
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')

In [43]:
# Span covering tokens 16 up to (but not including) 30.
# NOTE(review): the printed span starts at "what" and runs past the closing
# quote, so it does not match the quoted phrase exactly. Presumably the whole
# quote was intended; confirm the indices.
life_quote = doc4[16:30]

In [44]:
# Printing a Span shows the text it covers.
print(life_quote)

what happens to us while we are making other plans" was written by


In [46]:
# Three sentences. Note that there is no space between the second and the third.
doc5 = nlp(u"This is the first Sentence. This is the second sentence.This is one more sentence.")

In [47]:
# doc5.sents yields the sentences spaCy detected in the Doc.
# spaCy treats a period (.) followed by a space as a change of sentence. In the
# output the third sentence is also split off, even though no space follows the
# preceding period.
for sentence in doc5.sents:
    print(sentence)

This is the first Sentence.
This is the second sentence.
This is one more sentence.


In [48]:
# Token at index 6: the first word of the second sentence.
doc5[6]

This

In [49]:
# is_sent_start --> does this token start a sentence?
# The output is True when it does. For other tokens the notebook shows no output
# (presumably the value is None; confirm against the spaCy version in use).
doc5[6].is_sent_start

True

In [52]:
doc5[8].is_sent_start