# Data

In [1]:
import amcatclient
import json
import pandas as pd
import re

## 1. Download data (skip)

In [None]:
def download_data(host="https://vu.amcat.nl", project=69, articleset=3521,
                  start_date="2021-01-01", columns=None):
    """Download articles from an AmCAT server and return them as a list.

    Called without arguments this fetches exactly what the notebook always
    fetched; the keyword parameters only exist so that another server,
    project, article set, start date or column selection can be requested
    without editing the function.

    Parameters
    ----------
    host : str
        Base URL passed to ``amcatclient.AmcatAPI``.
    project, articleset : int
        Identifiers forwarded to ``get_articles``.
    start_date : str
        Value forwarded to ``get_articles`` as ``start_date``.
    columns : list of str, optional
        Columns to request. ``None`` selects the notebook's default set
        (id, title, date, url, publisher, text).

    Returns
    -------
    list
        Every item yielded by ``get_articles``, in the order received.
    """
    if columns is None:
        # Built per call so that no mutable default is shared between calls.
        columns = ["id", "title", "date", "url", "publisher", "text"]
    conn = amcatclient.AmcatAPI(host)
    docs = conn.get_articles(project=project, articleset=articleset,
                             start_date=start_date, columns=columns)
    # Materialise the result so that it can be iterated more than once.
    return list(docs)

In [None]:
def store_data_csv(data, file_name):
    """Write `data` to `file_name` as CSV, leaving out the DataFrame index column."""
    frame = pd.DataFrame(data)
    frame.to_csv(file_name, index=False)

In [None]:
def store_data_doccano(data, file_name):
    """Write `data` to `file_name` as JSON Lines (one ``json.dumps`` record per line).

    Parameters
    ----------
    data : iterable
        JSON-serialisable records.
    file_name : str
        Path of the file to create or overwrite.

    Raises
    ------
    TypeError
        Propagated from ``json.dumps`` when a record is not serialisable.
        Records written before the failing one stay in the file.
    """
    # The context manager closes the file even when json.dumps raises.
    with open(file_name, "w") as out_file:
        for d in data:
            out_file.write(json.dumps(d) + "\n")

In [None]:
# Fetch the articles from the AmCAT server; download_data() connects to the remote host.
data = download_data()

In [None]:
# Save the downloaded articles as CSV; section 2 reads this same path back with read_data().
store_data_csv(data, "../data/teletekst.csv")

In [None]:
# Save the same articles once more, as one JSON object per line.
store_data_doccano(data, "../data/teletekst.json")

## 2. Extract entities from data

Documentation for VU linguistic processing docker: https://vu-rm-pip3.readthedocs.io/en/latest/docker.html

In [2]:
import os
import subprocess
import tempfile
import timeit

from bs4 import BeautifulSoup
from bs4 import Comment

In [3]:
def read_data(file_name):
    """Load a CSV file and return its rows as a list of dicts keyed by column name."""
    table = pd.read_csv(file_name)
    return table.to_dict(orient="records")

In [4]:
def store_text_in_file(text):
    """Write `text` plus a trailing newline to a new temporary file under ``tmp/``.

    The ``tmp`` directory is relative to the current working directory and is
    created when it does not exist yet.

    Parameters
    ----------
    text : object
        Content to store; written with ``print``, so non-string values are
        converted with ``str()``.

    Returns
    -------
    str
        Path of the file that was created, as reported by ``tempfile.mkstemp``.
    """
    os.makedirs("tmp", exist_ok=True)
    # mkstemp returns an already-open OS-level descriptor. Wrapping that
    # descriptor (instead of discarding it and opening the path a second
    # time) ensures it is closed, so repeated calls do not leak descriptors.
    fd, tmp_file_name_with_dir = tempfile.mkstemp(dir="tmp")
    with os.fdopen(fd, "w") as tmp_file:
        print(text, file=tmp_file)
    return tmp_file_name_with_dir

In [5]:
def annotate_process(tmp_file_name_with_dir):
    """Run the ``vucltl/vu-rm-pip3`` Docker image with ``-m entities`` on one file.

    The ``tmp`` directory below the current working directory is mounted in
    the container as ``/wrk/``; the input file is addressed there by its base
    name. The container's standard output is written to
    ``<tmp_file_name_with_dir>.out`` and its standard error to
    ``<tmp_file_name_with_dir>.log``.

    The command is passed as an argument list rather than a shell string, so
    spaces or shell metacharacters in the working directory or the file name
    cannot change what is executed.

    Parameters
    ----------
    tmp_file_name_with_dir : str
        Path of the input file; it must live in ``./tmp`` to be visible
        inside the container.

    Returns
    -------
    int
        Exit status of the ``docker`` command (0 on success).

    Raises
    ------
    FileNotFoundError
        If the ``docker`` executable cannot be found.
    """
    tmp_file_name_without_dir = os.path.basename(tmp_file_name_with_dir)
    command = [
        "docker", "run",
        "-v", f"{os.getcwd()}/tmp/:/wrk/",
        "vucltl/vu-rm-pip3",
        "-m", "entities",
        f"/wrk/{tmp_file_name_without_dir}",
    ]
    with open(f"{tmp_file_name_with_dir}.out", "wb") as out_file, \
            open(f"{tmp_file_name_with_dir}.log", "wb") as log_file:
        completed = subprocess.run(command, stdout=out_file, stderr=log_file)
    return completed.returncode

In [6]:
def read_annotations(tmp_file_name_with_dir):
    """Parse the annotation output that belongs to a temporary input file.

    Parameters
    ----------
    tmp_file_name_with_dir : str
        Path of the input file; the output is read from that path with
        ``.out`` appended.

    Returns
    -------
    BeautifulSoup
        Parsed content of the ``.out`` file.
    """
    # Read the file in one go (no line-by-line string concatenation); the
    # context manager closes the handle even when reading fails.
    with open(f"{tmp_file_name_with_dir}.out", "r") as processed_file:
        processed_data = processed_file.read()
    # NOTE(review): no parser is named, so bs4 chooses one from what is
    # installed and the result may differ between machines. The choice is
    # deliberately left unchanged here; pin a parser once it is confirmed
    # which one the downstream tag lookups were developed against.
    return BeautifulSoup(processed_data)

In [7]:
def cleanup_files(tmp_file_name_with_dir):
    """Delete the temporary input file and its ".out" and ".log" companions, in that order."""
    for suffix in ("", ".out", ".log"):
        os.unlink(f"{tmp_file_name_with_dir}{suffix}")

In [8]:
def annotate_text(text):
    """Annotate `text` with the Docker pipeline and return the parsed result.

    The text is written to a temporary file, processed by
    ``annotate_process`` and read back with ``read_annotations``. The
    temporary files are removed afterwards, also when processing or parsing
    raises, so failed runs do not pile up files in the temporary directory.

    Parameters
    ----------
    text : str
        Text to annotate.

    Returns
    -------
    BeautifulSoup
        Parsed annotation output, as returned by ``read_annotations``.
    """
    tmp_file_name_with_dir = store_text_in_file(text)
    try:
        annotate_process(tmp_file_name_with_dir)
        return read_annotations(tmp_file_name_with_dir)
    finally:
        # Runs on success and on failure alike.
        cleanup_files(tmp_file_name_with_dir)

In [9]:
def get_words(soup):
    """Map every "wf" element id to the list of texts found under that id.

    Ids that occur more than once collect all their texts in document order.
    """
    words = {}
    for wf_tag in soup.find_all("wf"):
        words.setdefault(wf_tag["id"], []).append(wf_tag.text)
    return words

In [10]:
def get_terms(soup):
    """Map every "term" element id to the ids of the "target" elements inside it.

    A term without any target does not appear in the result.
    """
    terms = {}
    for term_tag in soup.find_all("term"):
        for target_tag in term_tag.find_all("target"):
            terms.setdefault(term_tag["id"], []).append(target_tag["id"])
    return terms

In [11]:
def get_entities(soup):
    """Collect the type and the target ids of every "entity" element.

    The result maps an entity id to ``{"type": ..., "targets": [...]}``. The
    type is taken from the first element seen with that id; an entity without
    any target does not appear in the result.
    """
    entities = {}
    for entity_tag in soup.find_all("entity"):
        target_ids = [target_tag["id"] for target_tag in entity_tag.find_all("target")]
        if not target_ids:
            continue
        entity_id = entity_tag["id"]
        record = entities.get(entity_id)
        if record is None:
            record = {"type": entity_tag["type"], "targets": []}
            entities[entity_id] = record
        record["targets"].extend(target_ids)
    return entities

In [12]:
# Load the CSV written in section 1; read_data() returns one dict per row.
data = read_data("../data/teletekst.csv")

In [13]:
# Annotate the text of the row at index 2 with the Docker pipeline ...
soup = annotate_text(data[2]["text"])
# ... and build the three lookup tables that the printing cell below combines.
words = get_words(soup)
terms = get_terms(soup)
entities = get_entities(soup)

In [14]:
# Display the raw text that was annotated above, for comparison with the entities found.
data[2]["text"]

'Het dodental van het ongeluk met een kabelbaan in Noord-Italië is opgelopen naar veertien. Vanavond overleed in een ziekenhuis in Turijn een van de twee zwaargewonde kinderen die uit de neergestorte gondel waren gehaald.\n\nEen cabine van de kabelbaan van Stresa aan het Lago Maggiore naar de top van de Monte Mottarone stortte in de diepte toen een kabel het begaf. In de cabine werden daarna negen lichamen gevonden, en in het bos op de berg nog eens vier.\n\nDe kabelbaan was pas weer open, na een versoepeling van de coronamaatregelen. Het laatste grote onderhoud was tussen 2014 en 2016 uitgevoerd.'

In [15]:
# Print one line per entity: its type followed by the words it covers.
# Entity targets are looked up in `terms`, and term targets in `words`.
for entity in entities:
    mention = entities[entity]
    tokens = [
        word
        for target in mention["targets"]
        for term in terms[target]
        for word in words[term]
    ]
    # Every item is followed by a single space, including the last one.
    print(mention["type"], *tokens, end=" \n")

LOC Noord-Italië 
LOC Turijn 
LOC Stresa 
ORG Lago Maggiore 
ORG Monte Mottarone 


## 3. e2e

Instructions: https://github.com/Filter-Bubble/e2e-Dutch

First download the Dutch Stanza models with: `stanza.download('nl')`

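The call to `stanza.Pipeline` fails because a model is missing (see the `OSError` in the output below). The model can be downloaded with `git clone https://huggingface.co/GroNLP/bert-base-dutch-cased`, but it is unclear how to proceed after that. **TODO:** resolve this before relying on this section.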

In [24]:
import stanza
import e2edutch.stanza

# Build a Dutch pipeline with the "tokenize" and "coref" processors
# (the latter is presumably registered by `import e2edutch.stanza` - confirm).
# In the recorded run this cell stopped with the OSError shown in its output.
nlp = stanza.Pipeline(lang='nl', processors='tokenize,coref')

# Run the pipeline on a two-sentence example and print the span texts of every cluster.
doc = nlp('Dit is een test document. Dit document bevat coreferenties.')
print ([[span.text for span in cluster] for cluster in doc.clusters])

2021-07-06 17:32:41 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| coref     | default |

2021-07-06 17:32:41 INFO: Use device: cpu
2021-07-06 17:32:41 INFO: Loading: tokenize
2021-07-06 17:32:41 INFO: Loading: coref
Setting CUDA_VISIBLE_DEVICES to: 
Running model: final
Loading context embeddings..
Loading head embeddings..
Loading BERT model...


OSError: Unable to load weights from pytorch checkpoint file for 'wietsedv/bert-base-dutch-cased' at '/home/erikt/.cache/torch/transformers/7a7191f5270caad7138d7e61b6e7a8e9d0eaad0d058a9faabb3896b520de8e9a.ead4c92543de4acacd907c329709df627a019151e95d79ced000d1199473fd29'If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. 