# Intro to SPACY

In [None]:
import pandas as pd    


# Load the English pipeline once; the cells below reuse `en_nlp`.
# The model is loaded by its full package name because the 'en' shortcut
# link only exists on spaCy 2.x installs (spaCy 3 removed shortcut links).
# Install it with: python -m spacy download en_core_web_sm
import spacy
en_nlp = spacy.load('en_core_web_sm')


In [None]:
# Run the English pipeline over a sample sentence
sample_text = u'Dive into NLTK: Part-of-speech tagging and POS Tagger'
doc = en_nlp(sample_text)

# Iterating over the processed document yields its tokens one by one
for tok in doc:
    print(tok)

In [None]:
# Walk through the sentences of the document and print each one
for sentence in doc.sents:
    print(sentence)

In [None]:
# Show every token next to its part-of-speech tag
for word in doc:
    print(word, ' --> ', word.pos_)

In [None]:
# A token carries several annotations; gather a few of them into a table

en_doc = en_nlp(u'They told us to 10 duck.')

# One row per token, in document order
token_list = [
    (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    for token in en_doc
]

# Left as the last expression so the notebook displays the table
column_names = ['text', 'lemma', 'pos', 'tag', 'dep',
                'shape', 'is_alpha', 'is_stop']
pd.DataFrame(token_list, columns=column_names)

## Parsing tree visualization

spaCy performs a parsing analysis to obtain the tree structure of the phrases.

In [None]:
from spacy import displacy

# Process a short sentence and render it with the 'dep' style in the notebook
sentence_text = u'This is a sentence.'
doc = en_nlp(sentence_text)
displacy.render(doc, style='dep', jupyter=True)

## Chunks


In [None]:
# Noun chunks: a noun together with the words that describe it
doc = en_nlp(u'Autonomous cars shift insurance liability toward manufacturers')
print('Chunk - root - root dep - root head')
for noun_phrase in doc.noun_chunks:
    # Every column after the chunk text is read from the chunk's root token
    root = noun_phrase.root
    print(noun_phrase.text, ' -', root.text, ' -', root.dep_, ' -', root.head.text)

In [None]:
# A second noun-chunk example, printed as a single list
doc_2 = en_nlp(u"The boy saw the yellow dog")
print(list(doc_2.noun_chunks))

## Named entity recognition

In [None]:
# Named entities: print each entity's text, character offsets and label
doc = en_nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

In [None]:
# Render the entities of a sentence inside the notebook ('ent' style)
entity_text = u"Rami Eid is studying at Stony Brook University in New York"
doc = en_nlp(entity_text)
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# One more entity rendering, this time on a longer sentence
entity_text = u"I went to Paris in 2017 where I met my old friend Jack from uni."
doc = en_nlp(entity_text)
displacy.render(doc, style='ent', jupyter=True)

## 

# Other languages

## Available models in: 
    English (en)
    German (de)
    Spanish (es)
    Portuguese (pt)
    French (fr)
    Italian (it)
    Dutch (nl)

## Languages in alpha version
    Swedish (sv)
    Finnish (fi)
    Norwegian Bokmål (nb)
    Danish (da)
    Hungarian (hu)
    Polish (pl)
    Russian (ru)
    Romanian (ro)
    Croatian (hr)
    Turkish (tr)
    Hebrew (he)
    Persian (fa)
    Irish (ga)
    Bengali (bn)
    Hindi (hi)
    Indonesian (id)
    Thai (th)
    Chinese (zh)
    Japanese (ja)

In [None]:
# Basic NLP in Spanish
# The model is loaded by its full package name because the 'es' shortcut
# link only exists on spaCy 2.x installs (spaCy 3 removed shortcut links).
# Install it with: python -m spacy download es_core_news_sm
es_nlp = spacy.load('es_core_news_sm')

es_doc = es_nlp(u'Hola Mundo. Aqui tenemos 2 frases.')

# One row of annotations per token, in document order
token_list = [
    (
        token.text, token.lemma_, token.pos_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    for token in es_doc
]

# Left as the last expression so the notebook displays the table
pd.DataFrame(token_list, columns=['text', 'lemma', 'pos', 'dep',
                                  'shape', 'is_alpha', 'is_stop'])

In [None]:
# Lemmas of the Spanish tokens that are not flagged as stop words
[tok.lemma_ for tok in es_doc if not tok.is_stop]