# Stanza

In [2]:
import stanza

# Fetch the default English model package (a no-op download if the archive is already cached)
stanza.download(lang="en")

# Build the default English pipeline; the log output lists the processors it loads
nlp = stanza.Pipeline("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2023-01-30 19:54:45 INFO: Downloading default packages for language: en (English)...
2023-01-30 19:54:56 INFO: File exists: /home/claire/stanza_resources/en/default.zip.
2023-01-30 19:55:04 INFO: Finished downloading models and saved to /home/claire/stanza_resources.
2023-01-30 19:55:04 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2023-01-30 19:55:04 INFO: Use device: gpu
2023-01-30 19:55:04 INFO: Loading: tokenize
2023-01-30 19:56:32 INFO: Loading: pos
2023-01-30 19:56:32 INFO: Loading: lemma
2023-01-30 19:56:33 INFO: Loading: depparse
2023-01-30 19:56:33 INFO: Loading: sentiment
2023-01-30 19:56:33 INFO: Loading: constituency
2023-01-30 19:56:33 INFO: Loading: ner
2023-01-30 19:56:34 INFO: Done loading processors!


### Sentence segmentation

In [3]:
# Two short sentences; the second deliberately has no final period
text = "This is a sentence. This is another sentence"
# Run the full pipeline; doc carries the text plus all Stanza annotations
doc = nlp(text)

In [17]:
# doc.sentences is the result of sentence segmentation on the text above;
# print each sentence's raw text under a numbered header
for sent_no, sentence in enumerate(doc.sentences, start=1):
    print(f'====== Sentence {sent_no}  =======')
    print(sentence.text)

This is a sentence.
This is another sentence


### Tokenization

In [6]:
# doc is the annotated text from the Sentence segmentation section.
# start_char is the character offset of each word in the input text,
# so the numbers printed below index directly into `text`.
for sent_no, sentence in enumerate(doc.sentences, start=1):
    print(f'====== Sentence {sent_no} tokens =======')
    print('\n'.join(str(word.start_char) for word in sentence.words))

0
5
8
10
18
20
25
28
36


In [7]:
# Run the pipeline on a two-sentence test text
sz_doc = nlp('This is a test sentence for stanza. This is another sentence.')
# sz_doc.sentences holds the result of sentence segmentation
for sent_no, sentence in enumerate(sz_doc.sentences, start=1):
    print(f'====== Sentence {sent_no} tokens =======')
    # sentence.words holds the result of tokenization; print one word per line
    print('\n'.join(str(word.text) for word in sentence.words))

This
is
a
test
sentence
for
stanza
.
This
is
another
sentence
.


### Lemmatizing

In [4]:
# Iterate over sz_doc (the test text annotated in the Tokenization section),
# not doc: when the notebook is run top to bottom, doc still holds the shorter
# text from the Sentence segmentation section.
for i, sentence in enumerate(sz_doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    # One line per word: surface form, then the lemma assigned by the pipeline
    print(*[f'{word.text}\t\t {word.lemma}' for word in sentence.words], sep='\n')

This		 this
is		 be
a		 a
test		 test
sentence		 sentence
for		 for
stanza		 stanza
.		 .
This		 this
is		 be
another		 another
sentence		 sentence
.		 .


### Morphological Analysis

In [5]:
# Iterate over sz_doc (the test text annotated in the Tokenization section),
# not doc: when the notebook is run top to bottom, doc still holds the shorter
# text from the Sentence segmentation section.
for i, sentence in enumerate(sz_doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    # One line per word: surface form, then its morphological features
    # (word.feats is None when a word carries no features)
    print(*[f'{word.text}\t\t {word.feats}' for word in sentence.words], sep='\n')

This		 Number=Sing|PronType=Dem
is		 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
a		 Definite=Ind|PronType=Art
test		 Number=Sing
sentence		 Number=Sing
for		 None
stanza		 Number=Sing
.		 None
This		 Number=Sing|PronType=Dem
is		 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
another		 None
sentence		 Number=Sing
.		 None


### Dependency Parsing

In [6]:
# Iterate over sz_doc (the test text annotated in the Tokenization section),
# not doc: when the notebook is run top to bottom, doc still holds the shorter
# text from the Sentence segmentation section.
for i, sentence in enumerate(sz_doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    # One line per word: surface form, then its dependency relation label
    print(*[f'{word.text}\t\t {word.deprel}' for word in sentence.words], sep='\n')

This		 nsubj
is		 cop
a		 det
test		 compound
sentence		 root
for		 case
stanza		 nmod
.		 punct
This		 nsubj
is		 cop
another		 det
sentence		 root
.		 punct


### Constituency Parsing

In [7]:
# Iterate over sz_doc (the test text annotated in the Tokenization section),
# not doc: when the notebook is run top to bottom, doc still holds the shorter
# text from the Sentence segmentation section.
for i, sentence in enumerate(sz_doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    # Bracketed constituency tree for the whole sentence
    print(sentence.constituency)

(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN sentence))) (. .)))
(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT a) (NN test) (NN sentence)) (PP (IN for) (NP (NN stanza))))) (. .)))


### Named Entity Recognition (NER)

In [8]:
# NOTE: this cell rebinds doc to a new text containing named entities,
# replacing the document used in the earlier sections
doc = nlp("Simone de Beauvoir was born in Paris. She lived in France.")
# doc.ents collects the entity spans of the whole document; show text and type of each
entity_lines = [f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents]
print('\n'.join(entity_lines))

entity: Simone de Beauvoir	type: PERSON
entity: Paris	type: GPE
entity: France	type: GPE


### Processing multiple documents

In [None]:
# Raw texts to annotate, one string per document
documents = [
    "This is a test document. John sleeps.",
    "I wrote another document for fun. Mary dreams.",
]
# Wrap each raw text in a stanza.Document with no annotations yet
in_docs = [stanza.Document([], text=raw_text) for raw_text in documents]
# Run the pipeline once over the whole list of documents
docs = nlp(in_docs)

# Show the annotations Stanza produced for the first document
print(docs[0])