# Intro to spaCy

In [1]:
import pandas as pd    


# Load spacy resources
import spacy
# Load the small English pipeline by its full package name. The 'en' shortcut
# link only exists in spaCy 2.x; shortcut links were removed in spaCy 3.
en_nlp = spacy.load('en_core_web_sm')


In [9]:
# Run the English pipeline over a sample sentence
doc = en_nlp(u'Dive into NLTK: Part-of-speech tagging and POS Tagger')

# Iterating over the processed document yields its tokens in order
for tok in doc:
    print(tok)

Dive
into
NLTK
:
Part
-
of
-
speech
tagging
and
POS
Tagger


In [10]:
# Walk over the sentence spans exposed through `doc.sents`
for sentence in doc.sents:
    print(sentence)

Dive into NLTK: Part-of-speech tagging and POS Tagger


In [11]:
# Show every token next to its part-of-speech tag
for tok in doc:
    print(tok, ' --> ', tok.pos_)

Dive  -->  VERB
into  -->  ADP
NLTK  -->  PROPN
:  -->  PUNCT
Part  -->  NOUN
-  -->  PUNCT
of  -->  ADP
-  -->  PUNCT
speech  -->  NOUN
tagging  -->  NOUN
and  -->  CCONJ
POS  -->  PROPN
Tagger  -->  PROPN


In [12]:
# Tabulate several of the annotations available on each token

en_doc = en_nlp(u'They told us to 10 duck.')

# One tuple of annotations per token, in document order
token_list = [
    (
        tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
        tok.shape_, tok.is_alpha, tok.is_stop,
    )
    for tok in en_doc
]

# The DataFrame is the cell's last expression, so the notebook displays it
pd.DataFrame(
    token_list,
    columns=['text', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha', 'is_stop'],
)

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,They,-PRON-,PRON,PRP,nsubj,Xxxx,True,False
1,told,tell,VERB,VBD,ROOT,xxxx,True,False
2,us,-PRON-,PRON,PRP,dobj,xx,True,True
3,to,to,ADP,IN,prep,xx,True,True
4,10,10,NUM,CD,nummod,dd,False,False
5,duck,duck,NOUN,NN,pobj,xxxx,True,False
6,.,.,PUNCT,.,punct,.,False,False


In [None]:
# List the public attributes and methods available on the current `doc`.
# (The previous content of this cell, a bare `doc.`, was a SyntaxError.)
[name for name in dir(doc) if not name.startswith('_')]

## Parsing tree visualization

spaCy performs a parsing analysis to obtain the tree structure of the phrases.

In [13]:
from spacy import displacy

# Parse a short sentence and draw its dependency tree inline in the notebook
doc = en_nlp(u'This is a sentence.')
displacy.render(doc, jupyter=True, style='dep')

## Chunks


In [14]:
# noun_chunks: a noun plus the words describing that noun
doc = en_nlp(u'Autonomous cars shift insurance liability toward manufacturers')

print('Chunk - root - root dep - root head')
for noun_phrase in doc.noun_chunks:
    # Look up the chunk's root token once and reuse it for the three columns
    root_token = noun_phrase.root
    print(noun_phrase.text, ' -', root_token.text, ' -', root_token.dep_, ' -', root_token.head.text)

Chunk - root - root dep - root head
Autonomous cars  - cars  - nsubj  - shift
insurance liability  - liability  - dobj  - shift
manufacturers  - manufacturers  - pobj  - toward


In [8]:
# A second noun-chunk example: materialise all chunks into a list and print it
doc_2 = en_nlp(u"The boy saw the yellow dog")
print(list(doc_2.noun_chunks))

[The boy, the yellow dog]


## Named entity recognition

In [9]:
# Named entities
doc = en_nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Print each entity's text, its start/end character offsets and its label
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [10]:
# Render the recognised entities inline in the notebook
doc = en_nlp(u"Rami Eid is studying at Stony Brook University in New York")
displacy.render(doc, jupyter=True, style='ent')

In [11]:
# A second entity-rendering example, using a different sentence
doc = en_nlp(u"I went to Paris in 2017 where I met my old friend Jack from uni.")
displacy.render(doc, jupyter=True, style='ent')

## 

# Other languages

## Available models in: 
    English (en)
    German (de)
    Spanish (es)
    Portuguese (pt)
    French (fr)
    Italian (it)
    Dutch (nl)

## Languages in alpha version
    Swedish (sv)
    Finnish (fi)
    Norwegian Bokmål (nb)
    Danish (da)
    Hungarian (hu)
    Polish (pl)
    Russian (ru)
    Romanian (ro)
    Croatian (hr)
    Turkish (tr)
    Hebrew (he)
    Persian (fa)
    Irish (ga)
    Bengali (bn)
    Hindi (hi)
    Indonesian (id)
    Thai (th)
    Chinese (zh)
    Japanese (ja)

In [15]:
# Basic NLP in Spanish
# Load the small Spanish pipeline by its full package name. The 'es' shortcut
# link only exists in spaCy 2.x; shortcut links were removed in spaCy 3.
es_nlp = spacy.load('es_core_news_sm')

es_doc = es_nlp(u'Hola Mundo. Aqui tenemos dos sentencias.')

# One tuple of annotations per token, in document order
token_list = [
    (tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop)
    for tok in es_doc
]

# The DataFrame is the cell's last expression, so the notebook displays it
pd.DataFrame(
    token_list,
    columns=['text', 'lemma', 'pos', 'dep', 'shape', 'is_alpha', 'is_stop'],
)

Unnamed: 0,text,lemma,pos,dep,shape,is_alpha,is_stop
0,Hola,Hola,PROPN,ROOT,Xxxx,True,False
1,Mundo,Mundo,PROPN,flat,Xxxxx,True,False
2,.,.,PUNCT,punct,.,False,False
3,Aqui,Aqui,PROPN,nsubj,Xxxx,True,False
4,tenemos,tener,VERB,ROOT,xxxx,True,True
5,dos,do,NUM,nummod,xxx,True,True
6,sentencias,sentenciar,NOUN,obj,xxxx,True,False
7,.,.,PUNCT,punct,.,False,False


In [18]:
# Lemma of every token in the Spanish document, in order
[tok.lemma_ for tok in es_doc]

['Hola', 'Mundo', '.', 'Aqui', 'tener', 'do', 'sentenciar', '.']