# Stanza

In [1]:
import stanza

# Fetch the default English model package for Stanza
stanza.download(lang='en')

# Build the default English pipeline; `nlp` is the callable the cells below use to annotate text
nlp = stanza.Pipeline("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2024-03-15 08:59:22 INFO: Downloading default packages for language: en (English)...
2024-03-15 08:59:27 INFO: File exists: /home/claire/stanza_resources/en/default.zip.
2024-03-15 08:59:35 INFO: Finished downloading models and saved to /home/claire/stanza_resources.
2024-03-15 08:59:35 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2024-03-15 08:59:35 INFO: Use device: gpu
2024-03-15 08:59:35 INFO: Loading: tokenize
2024-03-15 08:59:51 INFO: Loading: pos
2024-03-15 08:59:51 INFO: Loading: lemma
2024-03-15 08:59:51 INFO: Loading: depparse
2024-03-15 08:59:51 INFO: Loading: sentiment
2024-03-15 08:59:51 INFO: Loading: constituency
2024-03-15 08:59:52 INFO: Loading: ner
2024-03-15 08:59:52 INFO: Done loading processors!


### Sentence segmentation

In [2]:
# Example input: two short sentences (the second one has no final period)
text = 'This is a sentence. This is another sentence'
# Run the pipeline on the text; the following cells inspect the annotations stored in `doc`
doc = nlp(text)

In [3]:
# `doc` holds the text from the previous cell together with the annotations Stanza created for it.
# Sentence segmentation: for every sentence, show a numbered banner followed by its raw text.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1}  =======', sentence.text, sep='\n')

This is a sentence.
This is another sentence


### Tokenization

In [4]:
# `doc` holds the text from the earlier cell together with the annotations Stanza created for it.
# For every sentence, list each word's start_char (its character offset in the input document),
# one offset per line.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print('\n'.join(str(word.start_char) for word in sentence.words))

0
5
8
10
18
20
25
28
36


In [5]:
# Run the pipeline on a second example text
sz_doc = nlp('This is a test sentence for stanza. This is another sentence.')
# Sentence segmentation: walk over the sentences Stanza found
for i, sentence in enumerate(sz_doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    # Tokenization: read the surface form of each entry in the sentence's "words" attribute
    print(*(word.text for word in sentence.words), sep='\n')

This
is
a
test
sentence
for
stanza
.
This
is
another
sentence
.


### Lemmatizing

In [6]:
# `doc` holds the text from the earlier cell together with the annotations Stanza created for it.
# For every sentence, print one line per word: the surface form followed by its lemma.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print('\n'.join(f'{word.text}\t\t {word.lemma}' for word in sentence.words))

This		 this
is		 be
a		 a
sentence		 sentence
.		 .
This		 this
is		 be
another		 another
sentence		 sentence


### Morphological Analysis

In [7]:
# `doc` holds the text from the earlier cell together with the annotations Stanza created for it.
# For every sentence, print one line per word: the surface form followed by its morphological
# features (word.feats); a word without features shows up as None.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*(f'{word.text}\t\t {word.feats}' for word in sentence.words), sep='\n')

This		 Number=Sing|PronType=Dem
is		 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
a		 Definite=Ind|PronType=Art
sentence		 Number=Sing
.		 None
This		 Number=Sing|PronType=Dem
is		 Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
another		 None
sentence		 Number=Sing


### Dependency Parsing

In [8]:
# `doc` holds the text from the earlier cell together with the annotations Stanza created for it.
# For every sentence, print one line per word: the surface form followed by its dependency
# relation label (word.deprel).
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print('\n'.join(f'{word.text}\t\t {word.deprel}' for word in sentence.words))

This		 nsubj
is		 cop
a		 det
sentence		 root
.		 punct
This		 nsubj
is		 cop
another		 det
sentence		 root


### Constituency Parsing

In [9]:
# `doc` holds the text from the earlier cell together with the annotations Stanza created for it.
# For every sentence, show a numbered banner followed by its constituency parse tree.
for i, sentence in enumerate(doc.sentences):
    print(f'====== Sentence {i+1} tokens =======', sentence.constituency, sep='\n')

(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN sentence))) (. .)))
(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT another) (NN sentence)))))


### Named Entity Recognition (NER)

In [10]:
# Annotate a new text that contains named entities.
# NOTE: this rebinds `doc`, so it no longer refers to the two-sentence example used in the cells above.
doc = nlp("Simone de Beauvoir was born in Paris. She lived in France.")
# doc.ents lists the entities found in the whole document; print one line per entity
# with its text span and its entity type.
print('\n'.join(f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents))

entity: Simone de Beauvoir	type: PERSON
entity: Paris	type: GPE
entity: France	type: GPE


### Processing multiple documents

In [11]:
# Raw texts to annotate in one batch
documents = ["This is a test document. John sleeps.", "I wrote another document for fun. Mary dreams."]
# Wrap every raw text in a stanza.Document (no sentences yet, only the text)
in_docs = [stanza.Document([], text=raw_text) for raw_text in documents]
# Hand the whole list to the neural pipeline in a single call
docs = nlp(in_docs)

# Show the annotations Stanza produced for the first document (docs[0])
print(docs[0])

[
  [
    {
      "id": 1,
      "text": "This",
      "lemma": "this",
      "upos": "PRON",
      "xpos": "DT",
      "feats": "Number=Sing|PronType=Dem",
      "head": 5,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 4,
      "ner": "O"
    },
    {
      "id": 2,
      "text": "is",
      "lemma": "be",
      "upos": "AUX",
      "xpos": "VBZ",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "head": 5,
      "deprel": "cop",
      "start_char": 5,
      "end_char": 7,
      "ner": "O"
    },
    {
      "id": 3,
      "text": "a",
      "lemma": "a",
      "upos": "DET",
      "xpos": "DT",
      "feats": "Definite=Ind|PronType=Art",
      "head": 5,
      "deprel": "det",
      "start_char": 8,
      "end_char": 9,
      "ner": "O"
    },
    {
      "id": 4,
      "text": "test",
      "lemma": "test",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "head": 5,
      "deprel": "compound",
      "start_