In [1]:
import io

import e2edutch.conll
import e2edutch.download
import stanza
from e2edutch.predict import Predictor
from stanza.utils.conll import CoNLL

Instructions for updating:
non-resource variables are not supported in the long term


Download data, if needed, for stanza and e2edutch

In [2]:
# Fetch the default stanza model package for Dutch ('nl').
stanza.download('nl')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 24.0MB/s]                    
2021-01-14 14:40:58 INFO: Downloading default packages for language: nl (Dutch)...
2021-01-14 14:40:59 INFO: File exists: /home/dafne/stanza_resources/nl/default.zip.
2021-01-14 14:41:10 INFO: Finished downloading models and saved to /home/dafne/stanza_resources.


In [3]:
# Fetch the data files that e2edutch needs.
e2edutch.download.download_data()

Here's the piece of text we want to analyze:

In [4]:
# Two short Dutch sentences, roughly:
# "This is a piece of text. This text serves as an example."
txt = 'Dit is een stukje tekst. Deze tekst dient als voorbeeld.'

First, run stanza:

In [5]:
# Build the Dutch pipeline. Only the tokenization is passed on to e2edutch;
# lemma, pos and depparse supply the extra columns in the combined output
# printed at the end of the notebook.
nlp = stanza.Pipeline('nl', processors='tokenize,lemma,pos,depparse')

2021-01-14 14:41:10 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |

2021-01-14 14:41:10 INFO: Use device: cpu
2021-01-14 14:41:10 INFO: Loading: tokenize
2021-01-14 14:41:10 INFO: Loading: pos
2021-01-14 14:41:12 INFO: Loading: lemma
2021-01-14 14:41:12 INFO: Loading: depparse
2021-01-14 14:41:14 INFO: Done loading processors!


In [6]:
# Run the stanza pipeline on the example text.
stanza_doc = nlp(txt)

In [7]:
# Show the parsed document: one list per sentence, one dict of annotations per token.
stanza_doc

[
  [
    {
      "id": 1,
      "text": "Dit",
      "lemma": "dit",
      "upos": "PRON",
      "xpos": "VNW|aanw|pron|stan|vol|3o|ev",
      "feats": "Person=3|PronType=Dem",
      "head": 4,
      "deprel": "nsubj",
      "misc": "start_char=0|end_char=3"
    },
    {
      "id": 2,
      "text": "is",
      "lemma": "zijn",
      "upos": "AUX",
      "xpos": "WW|pv|tgw|ev",
      "feats": "Number=Sing|Tense=Pres|VerbForm=Fin",
      "head": 4,
      "deprel": "cop",
      "misc": "start_char=4|end_char=6"
    },
    {
      "id": 3,
      "text": "een",
      "lemma": "een",
      "upos": "DET",
      "xpos": "LID|onbep|stan|agr",
      "feats": "Definite=Ind",
      "head": 4,
      "deprel": "det",
      "misc": "start_char=7|end_char=10"
    },
    {
      "id": 4,
      "text": "stukje",
      "lemma": "stuk",
      "upos": "NOUN",
      "xpos": "N|soort|ev|dim|onz|stan",
      "feats": "Gender=Neut|Number=Sing",
      "head": 0,
      "deprel": "root",
      "misc": "start_ch

To use the output of stanza for e2edutch, we only need the output of the tokenization:

In [8]:
# Build the e2edutch input: a document key plus the token strings grouped
# per sentence, taken from stanza's tokenization.
e2e_doc = dict(
    doc_key='example',
    sentences=[[tok['text'] for tok in sent]
               for sent in stanza_doc.to_dict()],
)

In [9]:
# Create the e2edutch coreference predictor with its default settings.
predictor = Predictor()

Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use `tf.cast` instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /home/dafne/filter-bubble/e2e-Dutch/e2edutch/data/final/model.max.ckpt


In [10]:
# Run coreference prediction on the tokenized document and keep the result
# on the same dict under 'predicted_clusters'.
e2e_doc['predicted_clusters'] = predictor.predict(e2e_doc)

In [11]:
# Inspect the predicted clusters. Judging by the CoNLL rendering further down,
# each cluster appears to be a tuple of (start, end) token spans whose indices
# are inclusive and count tokens across the whole document — TODO confirm
# against the e2edutch documentation.
e2e_doc['predicted_clusters']

[((0, 0),), ((6, 7), (10, 10))]

Let's print out the tokens together with the clusters, in CoNLL-2012 format:

In [12]:
# Render tokens and clusters as CoNLL text into an in-memory buffer, then
# keep the text as a plain string. Both dicts are keyed by the document key.
with io.StringIO() as sout:
    e2edutch.conll.output_conll(
        sout,
        {'example': e2e_doc['sentences']},
        {'example': e2e_doc['predicted_clusters']},
    )
    # Read the text out before the buffer is closed at the end of the block.
    conll_e2e = sout.getvalue()

In [13]:
# Use print() so the tabs and newlines are rendered; a bare expression would
# display the escaped string repr instead.
print(conll_e2e)

#begin document (example);

example	0	Dit	(0)
example	1	is	-
example	2	een	-
example	3	stukje	-
example	4	tekst	-
example	5	.	-

example	0	Deze	(1
example	1	tekst	1)
example	2	dient	-
example	3	als	-
example	4	voorbeeld	(1)
example	5	.	-

#end document



Combine the stanza tokens with the clusters:

In [14]:
# Turn the clusters into one bracket label per token, grouped per sentence
# ('-' marks a token that is not part of any mention).
clusters_brackets = e2edutch.conll.clusters_to_brackets(
    e2e_doc['sentences'],
    e2e_doc['predicted_clusters'],
)
clusters_brackets

[['(0)', '-', '-', '-', '-', '-'], ['(1', '1)', '-', '-', '(1)', '-']]

In [15]:
# Merge stanza's CoNLL rows with the coreference brackets: each printed line is
# the stanza columns for one token followed by that token's bracket label.
#
# CoNLL is imported explicitly in the imports cell instead of being reached via
# `stanza.utils.conll`, which only resolves when `import stanza` happens to have
# loaded that submodule as a side effect.
conll_list_stanza = CoNLL.convert_dict(stanza_doc.to_dict())

# zip() silently drops trailing items on a length mismatch, so verify that both
# sides have the same number of sentences and tokens before printing anything.
assert [len(sent) for sent in conll_list_stanza] == \
    [len(sent) for sent in clusters_brackets], \
    'stanza tokens and coreference brackets are not aligned'

for sent_stanza, sent_e2e in zip(conll_list_stanza, clusters_brackets):
    for tok_stanza, tok_e2e in zip(sent_stanza, sent_e2e):
        print('\t'.join(tok_stanza)+'\t'+tok_e2e)
    # Separate sentences visually (prints two blank lines).
    print('\n')

1	Dit	dit	PRON	VNW|aanw|pron|stan|vol|3o|ev	Person=3|PronType=Dem	4	nsubj	_	start_char=0|end_char=3	(0)
2	is	zijn	AUX	WW|pv|tgw|ev	Number=Sing|Tense=Pres|VerbForm=Fin	4	cop	_	start_char=4|end_char=6	-
3	een	een	DET	LID|onbep|stan|agr	Definite=Ind	4	det	_	start_char=7|end_char=10	-
4	stukje	stuk	NOUN	N|soort|ev|dim|onz|stan	Gender=Neut|Number=Sing	0	root	_	start_char=11|end_char=17	-
5	tekst	tekst	NOUN	N|soort|ev|basis|zijd|stan	Gender=Com|Number=Sing	4	nmod	_	start_char=18|end_char=23	-
6	.	.	PUNCT	LET	_	4	punct	_	start_char=23|end_char=24	-


1	Deze	deze	DET	VNW|aanw|det|stan|prenom|met-e|rest	_	2	det	_	start_char=25|end_char=29	(1
2	tekst	tekst	NOUN	N|soort|ev|basis|zijd|stan	Gender=Com|Number=Sing	3	nsubj	_	start_char=30|end_char=35	1)
3	dient	dienen	VERB	WW|pv|tgw|met-t	Number=Sing|Tense=Pres|VerbForm=Fin	0	root	_	start_char=36|end_char=41	-
4	als	als	ADP	VZ|init	_	5	mark	_	start_char=42|end_char=45	-
5	voorbeeld	voorbeeld	NOUN	N|soort|ev|basis|onz|stan	Gender=Neut|Number=Sing	3	xc