# Experiments

In [None]:
import sys

# Make the parent directory importable (the `wsee` package is imported from
# later cells). Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if "../" not in sys.path:
    sys.path.append("../")

## spaCy, stanfordnlp, SoMaJo comparison

In [None]:
import spacy
# Load the spaCy pipeline 'de_core_news_md'; it is applied to the test documents further down.
nlp_spacy = spacy.load('de_core_news_md')

In [None]:
import stanfordnlp
# One-time setup: uncomment the next line to run the 'de' download
# (presumably needed once if the models are not installed yet; TODO confirm).
# stanfordnlp.download('de')
# Build the stanfordnlp pipeline for lang='de'.
nlp_stanford = stanfordnlp.Pipeline(lang='de')

In [None]:
from somajo import SoMaJo
# SoMaJo tokenizer configured with the "de_CMC" model name;
# split_camel_case=True is passed to request splitting of camelCase tokens.
tokenizer = SoMaJo("de_CMC", split_camel_case=True)

### Test documents

In [None]:
# Sample German texts used to compare the three tools. They contain hashtags,
# abbreviations, dates/times, special symbols and embedded newlines.
doc1 = "#S1 Nach der Weichenstörung in Hohen Neuendorf verkehren die S-Bahnen wieder durchgehend, erster Zug ab #Frohnau 21:58 Uhr und erster Zug ab #Hohen_Neuendorf 22:03 Uhr."
doc2 = "Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab: 09.02.2016 20:06\ngesperrt, Unfall\n"
doc3 = "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n"
doc4 = "Wegen einer techn. Störung an der Strecke besteht für die Linien S41, S42 u. S46 zw. Halensee <> Westkreuz <> Messe Nord <> Westend S-Bahn-Pendelverkehr im 20-Minuten-Takt. Die Linien S41 u. S42 fahren nur im 10-Minuten-Takt, die Linie S46 fährt nur Königs Wusterhausen <> Tempelhof."
doc5 = "#S3, #S5, #S7, #S9: Nach einer ärztliche Versorgung eines Fahrgastes im Zug in Bellevue kommt es noch zu Verspätungen und vereinzelten Ausfällen."

In [None]:
# Collect the samples in one list so every tool processes the same inputs in the same order.
test_docs = [doc1, doc2, doc3, doc4, doc5]

### Process documents with spaCy, stanfordnlp, somajo

In [None]:
# Run the spaCy pipeline over every test document, keeping the input order.
spacy_docs = list(map(nlp_spacy, test_docs))

In [None]:
# Run the stanfordnlp pipeline over every test document, keeping the input order.
stanford_docs = list(map(nlp_stanford, test_docs))

In [None]:
# Each document is handed to SoMaJo as a one-element list (tokenize_text takes
# an iterable of paragraphs); the returned sentences are materialised as a list.
somajo_docs = [list(tokenizer.tokenize_text([text])) for text in test_docs]

### Tokenization comparison
How to access tokens:

#### spaCy
`Doc` is a sequence of `Token`s. We can get the token text with `Token.text`.

#### stanfordnlp
Here we have to go through the sentences of a `Doc` and access each sentence's tokens via its `tokens` property. We can get the token text with `Token.text`.

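#### SoMaJo
Similar to stanfordnlp: the tokenizer output is a sequence of sentences, each of which is a sequence of tokens. We can get the token text with `Token.text`.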

In [None]:
def get_spacy_doc_tokens(doc):
    """Return the text of every token in a spaCy document, in document order."""
    texts = []
    for tok in doc:
        texts.append(tok.text)
    return texts

def get_stanford_doc_tokens(doc):
    """Return the token texts of a stanfordnlp document as one flat list.

    Tokens are reached through ``doc.sentences``; the per-sentence token
    texts are concatenated in sentence order.
    """
    texts = []
    for sent in doc.sentences:
        texts.extend(tok.text for tok in sent.tokens)
    return texts

def get_somajo_doc_tokens(doc):
    """Return the token texts of a SoMaJo result as one flat list.

    ``doc`` is iterated as a sequence of sentences, each of which is itself
    iterated as a sequence of tokens exposing a ``text`` attribute.
    """
    texts = []
    for sent in doc:
        texts.extend(tok.text for tok in sent)
    return texts

In [None]:
# Print the three tokenizations of each test document one below the other so
# they can be compared by eye; zip pairs up the results belonging to the same input.
for spacy_doc, stanford_doc, somajo_doc in zip(spacy_docs, stanford_docs, somajo_docs):
    spacy_tokens = get_spacy_doc_tokens(spacy_doc)
    print("spaCy:", spacy_tokens)
    stanford_tokens = get_stanford_doc_tokens(stanford_doc)
    print("stanfordnlp:", stanford_tokens)
    somajo_tokens = get_somajo_doc_tokens(somajo_doc)
    print("somajo:", somajo_tokens)
    # Blank lines to visually separate one document from the next.
    print("\n")

The spaCy tokenizer treats hashtags as separate tokens and keeps whitespace characters.
stanfordnlp more often than not treats hashtags as separate tokens and often does not handle abbreviations well, i.e. the tokenizer treats the dot as a separate token.
It also tends to split words containing punctuation marks more aggressively than the other tokenizers.
SoMaJo does not treat hashtags as separate tokens and handles abbreviations better. It does, however, split dates into multiple tokens.

### Sentence splitting comparison

In [None]:
def get_spacy_doc_sentences(doc):
    """Return the text of each sentence in ``doc.sents``, in order."""
    sentences = []
    for span in doc.sents:
        sentences.append(span.text)
    return sentences

def get_stanford_doc_sentences(doc):
    """Return one string per sentence of a stanfordnlp document.

    Each string is rebuilt by joining the sentence's token texts with single
    spaces, so it can contain whitespace that the original text did not have.
    For recovering the original sentence text see
    https://github.com/stanfordnlp/stanfordnlp/blob/dev/stanfordnlp/models/common/doc.py
    """
    sentences = []
    for sent in doc.sentences:
        words = [tok.text for tok in sent.tokens]
        sentences.append(" ".join(words))
    return sentences

def get_somajo_doc_sentences(doc):
    """Return one string per sentence of a SoMaJo result.

    The token texts of each sentence are joined with single spaces, so the
    strings can contain whitespace that the original text did not have.
    """
    sentences = []
    for sent in doc:
        words = [tok.text for tok in sent]
        sentences.append(" ".join(words))
    return sentences

In [None]:
# For each test document print, per tool, the number of detected sentences
# followed by the sentences themselves, so the splits can be compared by eye.
for spacy_doc, stanford_doc, somajo_doc in zip(spacy_docs, stanford_docs, somajo_docs):
    spacy_sentences = get_spacy_doc_sentences(spacy_doc)
    print("spaCy:", len(spacy_sentences), "\n", spacy_sentences)
    stanford_sentences = get_stanford_doc_sentences(stanford_doc)
    print("stanfordnlp:", len(stanford_sentences), "\n", stanford_sentences)
    somajo_sentences = get_somajo_doc_sentences(somajo_doc)
    print("somajo:", len(somajo_sentences), "\n", somajo_sentences)
    # Blank lines to visually separate one document from the next.
    print("\n")

In the small sample of sentences we can observe that spaCy tends to split the document text very aggressively. It does not seem to handle hashtags, punctuation marks and abbreviations well.
stanfordnlp tends to do a little better, but seems rather ill-equipped to handle text data from social media containing a lot of abbreviations and use of special punctuation marks.
SoMaJo does considerably better. In our testing we found that it only made mistakes on very few occasions where it encountered unknown abbreviations.
Therefore we chose to do event extraction on a document level and use SoMaJo sentence splitting information for our negative labeling functions.

In [None]:
import pandas as pd
from wsee.utils import corpus_statistics
# Load the SD4M training split; the file is JSON Lines (one record per line).
sd4m_train = pd.read_json(
    "../data/daystream_corpus/train/train_with_events_and_defaults.jsonl",
    lines=True,
    encoding='utf8',
)
# Keep only the rows (documents) for which `has_triggers` holds.
filtered_sd4m_train = sd4m_train[sd4m_train.apply(corpus_statistics.has_triggers, axis=1)]
# Last expression of the cell: the event statistics of the filtered corpus are displayed.
corpus_statistics.get_snorkel_event_stats(filtered_sd4m_train)

In [None]:
from wsee.data import pipeline

# Build the event-role examples from the filtered documents. The call returns a
# pair: `df_sd_train` is fed to the LF applier below, `Y_sd_train` is used as the
# label vector in the LF summary and as the `label` column for error analysis.
df_sd_train, Y_sd_train = pipeline.build_event_role_examples(filtered_sd4m_train)

In [None]:
from wsee.labeling import event_argument_role_lfs as role_lfs
from snorkel.labeling import PandasLFApplier

# The two "separate sentence" labeling functions under comparison: one named
# after SoMaJo, one after stanfordnlp. Their order matters, because later cells
# refer to them by position (lf_index=0 and lf_index=1).
lfs = [
    role_lfs.lf_somajo_separate_sentence,
    role_lfs.lf_stanford_separate_sentence
]
# Applier that runs the labeling functions over a pandas DataFrame.
applier = PandasLFApplier(lfs)

In [None]:
# Apply both labeling functions to every role example.
# NOTE(review): the result presumably has one column per LF, in `lfs` order; confirm.
L_sd_train = applier.apply(df_sd_train)

In [None]:
from snorkel.labeling import LFAnalysis

# Summarise the labeling functions, comparing their votes against the labels in Y_sd_train.
LFAnalysis(L_sd_train, lfs).lf_summary(Y_sd_train)

The SD4M train set contains 2001 positive event roles, but `lf_stanford_separate_sentence`, which uses the sentence splitting information from stanfordnlp, incorrectly labels 94 of these as `no_arg`.
While `lf_somajo_separate_sentence` identifies fewer correct `no_arg` instances, it only rarely labels event roles incorrectly.

In [None]:
from wsee.labeling import error_analysis
# Do not truncate cell contents when displaying the sampled rows.
pd.set_option('display.max_colwidth', None)
# New frame with the labels from Y_sd_train attached as a `label` column;
# `assign` returns a copy, so `df_sd_train` itself is left untouched.
labeled_sd4m_roles = df_sd_train.assign(label=Y_sd_train)

In [None]:
# Sample one row via `sample_fp` for the labeling function at index 0
# (lfs[0] is lf_somajo_separate_sentence) and show the columns needed to judge it.
# NOTE(review): label_of_interest=10 is presumably the id of `no_arg`; confirm.
error_analysis.sample_fp(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_sd_train,
    lf_index=0,
    label_of_interest=10,
    sample_size=1,
)[['text', 'trigger', 'argument', 'somajo_doc', 'label']]

In [None]:
# Sample one row via `sample_fp` for the labeling function at index 1
# (lfs[1] is lf_stanford_separate_sentence) and show the columns needed to judge it.
# NOTE(review): label_of_interest=10 is presumably the id of `no_arg`; confirm.
error_analysis.sample_fp(
    labeled_df=labeled_sd4m_roles,
    lf_outputs=L_sd_train,
    lf_index=1,
    label_of_interest=10,
    sample_size=1,
)[['text', 'trigger', 'argument', 'sentence_spans', 'label']]