# NLP Foundations 2021
**Authors:** Adam Tucker, Lois Rink and Eva den Uijl

## spaCy default parser

In [4]:
import spacy
from spacy import displacy
# Load the English spaCy pipeline registered under the 'en' shortcut name.
nlp = spacy.load('en')

# Sample news paragraph to analyse.
# NOTE(review): the Reuters quotation opens with '"' but is never closed -- the
# terminating triple quote directly follows 'safe.'. Confirm whether a closing
# quote was intended (adding it would change the token counts printed below).
doc = nlp("""A plane has crashed in a Taliban-controlled area of Afghanistan amid conflicting reports over who was the owner of the aircraft. Senior Afghan government officials initially said an aircraft from the state-owned carrier Ariana Afghan Airlines had crashed in the central Ghazni province around 1:10 pm local time on Monday, Al Jazeera reported. The plane came down in the Sado Khel area of Deh Yak district. Ariana, which has a small fleet of passenger jets, denied it was one of their planes. Its acting CEO Mirwais Mirzakwal told Reuters: "The two flights managed by Ariana today from Herat to Kabul and Herat to Delhi are safe.""")

# Debug aid (disabled): print each token together with its coarse POS tag.
# for token in doc:
#     token_pos = token.text,token.pos_
#     spacey_tokens_with_pos_tags = tuple(token_pos)
#     print(spacey_tokens_with_pos_tags)

# Tally tokens by the POS attribute; the keys of the result are integer IDs.
num_pos = doc.count_by(spacy.attrs.POS)
print(num_pos)

# Look each integer ID up in the vocab to get its tag name, in ascending ID order.
for pos_id, count in sorted(num_pos.items()):
    print(f'{pos_id}. {doc.vocab[pos_id].text:8}: {count}')

# jupyter=False asks displaCy for the markup as a string instead of displaying it.
tree_structure = displacy.render(doc, jupyter=False, style='dep')

# Write the SVG as UTF-8 explicitly so the file does not depend on the
# platform's default text encoding.
output_path = 'spacy_tree_structure.svg'
with open(output_path, 'w', encoding='utf-8') as outfile:
    outfile.write(tree_structure)

# Render the same dependency visualisation inline in the notebook.
displacy.render(doc, jupyter=True, style='dep')
    

{90: 14, 92: 22, 87: 6, 100: 11, 85: 17, 96: 23, 97: 12, 84: 7, 95: 2, 86: 2, 93: 3, 89: 1}
84. ADJ     : 7
85. ADP     : 17
86. ADV     : 2
87. AUX     : 6
89. CCONJ   : 1
90. DET     : 14
92. NOUN    : 22
93. NUM     : 3
95. PRON    : 2
96. PROPN   : 23
97. PUNCT   : 12
100. VERB    : 11


## Stanza Parser 

 https://nlp.stanford.edu/software/lex-parser.html
 
 https://stanfordnlp.github.io/stanza/#citing-stanza-in-papers



In [5]:
import stanza
from spacy_stanza import StanzaLanguage

# One-time setup (disabled): fetch the English Stanza models before first use.
# stanza.download('en')
# stanza_nlp = stanza.Pipeline('en')

# Build the Stanza English pipeline and wrap it with spacy_stanza's
# StanzaLanguage so it can be called like a spaCy `nlp` object below.
# This cell relies on `spacy` and `displacy` imported in the first code cell.
snlp = stanza.Pipeline(lang="en")
nlp = StanzaLanguage(snlp)

# Same sample paragraph as in the spaCy cell, so the results are comparable.
# NOTE(review): the Reuters quotation opens with '"' but is never closed -- the
# terminating triple quote directly follows 'safe.'. Confirm whether a closing
# quote was intended (adding it would change the token counts printed below).
doc = nlp("""A plane has crashed in a Taliban-controlled area of Afghanistan amid conflicting reports over who was the owner of the aircraft. Senior Afghan government officials initially said an aircraft from the state-owned carrier Ariana Afghan Airlines had crashed in the central Ghazni province around 1:10 pm local time on Monday, Al Jazeera reported. The plane came down in the Sado Khel area of Deh Yak district. Ariana, which has a small fleet of passenger jets, denied it was one of their planes. Its acting CEO Mirwais Mirzakwal told Reuters: "The two flights managed by Ariana today from Herat to Kabul and Herat to Delhi are safe.""")


# Alternative input (disabled): the same paragraph hard-wrapped over several lines.
# doc = nlp("""A plane has crashed in a Taliban-controlled area of Afghanistan amid conflicting 
# reports over who was the owner of the aircraft. Senior Afghan government officials initially said 
# an aircraft from the state-owned carrier Ariana Afghan Airlines had crashed in the central Ghazni 
# province around 1:10 pm local time on Monday, Al Jazeera reported. The plane came down in the Sado Khel 
# area of Deh Yak district. Ariana, which has a small fleet of passenger jets, denied it was one of 
# their planes. Its acting CEO Mirwais Mirzakwal told Reuters: "The two flights managed by Ariana today 
# from Herat to Kabul and Herat to Delhi are safe.""")

# Debug aids (disabled): per-token attributes, entities, and (text, POS) pairs.
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)
# print(doc.ents)
# for token in doc:
#     token_pos = token.text,token.pos_
#     spacey_tokens_with_pos_tags = tuple(token_pos)
#     print(spacey_tokens_with_pos_tags)

# Tally tokens by the POS attribute; the keys of the result are integer IDs.
num_pos = doc.count_by(spacy.attrs.POS)
print(num_pos)

# Look each integer ID up in the vocab to get its tag name, in ascending ID order.
for pos_id, count in sorted(num_pos.items()):
    print(f'{pos_id}. {doc.vocab[pos_id].text:8}: {count}')

# jupyter=False asks displaCy for the markup as a string instead of displaying it.
tree_structure = displacy.render(doc, jupyter=False, style='dep')

# Write the SVG as UTF-8 explicitly so the file does not depend on the
# platform's default text encoding.
output_path = 'stanza_spacy_tree_structure.svg'
with open(output_path, 'w', encoding='utf-8') as outfile:
    outfile.write(tree_structure)

# Render the same dependency visualisation inline in the notebook.
displacy.render(doc, jupyter=True, style='dep')

2021-02-12 15:05:18 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-02-12 15:05:18 INFO: Use device: cpu
2021-02-12 15:05:18 INFO: Loading: tokenize
2021-02-12 15:05:18 INFO: Loading: pos
2021-02-12 15:05:19 INFO: Loading: lemma
2021-02-12 15:05:19 INFO: Loading: depparse
2021-02-12 15:05:21 INFO: Loading: sentiment
2021-02-12 15:05:23 INFO: Loading: ner
2021-02-12 15:05:24 INFO: Done loading processors!


{90: 11, 92: 23, 87: 6, 100: 13, 85: 16, 96: 22, 97: 12, 98: 1, 95: 5, 84: 6, 86: 2, 93: 3, 89: 1}
84. ADJ     : 6
85. ADP     : 16
86. ADV     : 2
87. AUX     : 6
89. CCONJ   : 1
90. DET     : 11
92. NOUN    : 23
93. NUM     : 3
95. PRON    : 5
96. PROPN   : 22
97. PUNCT   : 12
98. SCONJ   : 1
100. VERB    : 13


## Berkeley Neural Parser with the pretrained model benepar_en2

https://github.com/nikitakit/self-attentive-parser/releases

https://pypi.org/project/benepar/ 

In [3]:
import tensorflow
from benepar.spacy_plugin import BeneparComponent

# Start from the English spaCy pipeline and append the Berkeley Neural Parser
# component backed by the 'benepar_en2' model.
# This cell relies on `spacy` and `displacy` imported in the first code cell.
nlp = spacy.load('en')
nlp.add_pipe(BeneparComponent('benepar_en2'))

# Same sample paragraph as in the other parser cells, so results are comparable.
# NOTE(review): the Reuters quotation opens with '"' but is never closed -- the
# terminating triple quote directly follows 'safe.'. Confirm whether a closing
# quote was intended (adding it would change the token counts printed below).
doc = nlp("""A plane has crashed in a Taliban-controlled area of Afghanistan amid conflicting reports over who was the owner of the aircraft. Senior Afghan government officials initially said an aircraft from the state-owned carrier Ariana Afghan Airlines had crashed in the central Ghazni province around 1:10 pm local time on Monday, Al Jazeera reported. The plane came down in the Sado Khel area of Deh Yak district. Ariana, which has a small fleet of passenger jets, denied it was one of their planes. Its acting CEO Mirwais Mirzakwal told Reuters: "The two flights managed by Ariana today from Herat to Kabul and Herat to Delhi are safe.""")


# Debug aids (disabled): per-token attributes, entities, and (text, POS) pairs.
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)
# print(doc.ents)
# for token in doc:
#     token_pos = token.text,token.pos_
#     spacey_tokens_with_pos_tags = tuple(token_pos)
#     print(spacey_tokens_with_pos_tags)

# NOTE(review): the statistics and the render below read spaCy token attributes
# (POS, dependency arcs); nothing here inspects output specific to the benepar
# component -- confirm that this is the intended comparison.

# Tally tokens by the POS attribute; the keys of the result are integer IDs.
num_pos = doc.count_by(spacy.attrs.POS)
print(num_pos)

# Look each integer ID up in the vocab to get its tag name, in ascending ID order.
for pos_id, count in sorted(num_pos.items()):
    print(f'{pos_id}. {doc.vocab[pos_id].text:8}: {count}')

# jupyter=False asks displaCy for the markup as a string instead of displaying it.
tree_structure = displacy.render(doc, jupyter=False, style='dep')

# Write the SVG as UTF-8 explicitly so the file does not depend on the
# platform's default text encoding.
output_path = 'Berkeley_Neural_Parser_spacy_tree_structure.svg'
with open(output_path, 'w', encoding='utf-8') as outfile:
    outfile.write(tree_structure)

# Render the same dependency visualisation inline in the notebook.
displacy.render(doc, jupyter=True, style='dep')

{90: 14, 92: 21, 87: 6, 100: 9, 85: 15, 96: 24, 97: 11, 84: 9, 95: 2, 86: 3, 93: 3, 94: 2, 89: 1}
84. ADJ     : 9
85. ADP     : 15
86. ADV     : 3
87. AUX     : 6
89. CCONJ   : 1
90. DET     : 14
92. NOUN    : 21
93. NUM     : 3
94. PART    : 2
95. PRON    : 2
96. PROPN   : 24
97. PUNCT   : 11
100. VERB    : 9
