# Experiments

In [None]:
import sys
# Add the parent directory to the module search path so that code living one
# level above this notebook can be imported.
sys.path.append("../")

## spaCy and stanfordnlp comparison

In [None]:
import spacy
# Load the spaCy pipeline named 'de_core_news_md'; it is applied to every test
# document further down.
nlp_spacy = spacy.load('de_core_news_md')

In [None]:
import stanfordnlp
# NOTE(review): presumably this has to be run once to fetch the 'de' models
# before the pipeline below can be built; it is left disabled here - confirm.
# stanfordnlp.download('de')
# Build the stanfordnlp pipeline for lang='de'.
nlp_stanford = stanfordnlp.Pipeline(lang='de')

### Test documents

In [None]:
# Sample German traffic / public-transport messages used as input for both
# pipelines. They contain hashtags, abbreviations, "<>" separators, dates and
# embedded newlines. The strings are test data and are kept verbatim.
doc1 = "#S1 Nach der Weichenstörung in Hohen Neuendorf verkehren die S-Bahnen wieder durchgehend, erster Zug ab #Frohnau 21:58 Uhr und erster Zug ab #Hohen_Neuendorf 22:03 Uhr."
doc2 = "Unfall\nAbschnitt: Marzahn (Berlin)\nGültig ab: 09.02.2016 20:06\ngesperrt, Unfall\n"
doc3 = "■ #A1 #Bremen Richtung #Hamburg zwischen Horster Dreieck und #Stillhorn 9 km #Stau.  Dort ist wegen #Bauarbeiten nur eine Spur frei.\n"
doc4 = "Wegen einer techn. Störung an der Strecke besteht für die Linien S41, S42 u. S46 zw. Halensee <> Westkreuz <> Messe Nord <> Westend S-Bahn-Pendelverkehr im 20-Minuten-Takt. Die Linien S41 u. S42 fahren nur im 10-Minuten-Takt, die Linie S46 fährt nur Königs Wusterhausen <> Tempelhof."
doc5 = "#S3, #S5, #S7, #S9: Nach einer ärztliche Versorgung eines Fahrgastes im Zug in Bellevue kommt es noch zu Verspätungen und vereinzelten Ausfällen."

In [None]:
# All sample documents, in a fixed order; both pipelines process this list so
# their results can be paired up position by position.
test_docs = [doc1, doc2, doc3, doc4, doc5]

### Process documents with spaCy and stanfordnlp

In [None]:
# Run every test document through the spaCy pipeline, preserving order.
spacy_docs = list(map(nlp_spacy, test_docs))

In [None]:
# Run every test document through the stanfordnlp pipeline, preserving order.
stanford_docs = list(map(nlp_stanford, test_docs))

### Tokenization comparison
How to access tokens:

#### spaCy
`Doc` is a sequence of `Token`s. We can get the token text with `Token.text`.

#### stanfordnlp
In stanfordnlp the tokens are not exposed on the `Doc` directly: we iterate over the document's `sentences` and read each sentence's `tokens` property. The token text is again available as `Token.text`.

In [None]:
import difflib
from pprint import pprint


def get_spacy_doc_tokens(doc):
    """Return the ``text`` of every token in *doc*, in iteration order.

    *doc* is iterated directly, and each item must expose a ``text``
    attribute.
    """
    texts = []
    for tok in doc:
        texts.append(tok.text)
    return texts

def get_stanford_doc_tokens(doc):
    """Return the ``text`` of every token in *doc* as one flat list.

    Tokens are collected sentence by sentence: ``doc.sentences`` is walked in
    order, and within each sentence its ``tokens`` are walked in order.
    """
    texts = []
    for sent in doc.sentences:
        for tok in sent.tokens:
            texts.append(tok.text)
    return texts

def print_list_differences(list_a, list_b):
    """Pretty-print a ``difflib.Differ`` comparison of two string sequences.

    The delta lines produced by ``Differ.compare(list_a, list_b)`` are
    collected into a list and written to stdout with ``pprint``. Nothing is
    returned.
    """
    delta = difflib.Differ().compare(list_a, list_b)
    pprint(list(delta))

In [None]:
# For each test document, print the spaCy token list, the stanfordnlp token
# list, and a diff between the two.
# The two result lists are paired by position, so both must have been built
# from the same documents in the same order.
for spacy_doc, stanford_doc in zip(spacy_docs, stanford_docs):
    spacy_tokens = get_spacy_doc_tokens(spacy_doc)
    print("spaCy:", spacy_tokens)
    stanford_tokens = get_stanford_doc_tokens(stanford_doc)
    print("stanfordnlp:", stanford_tokens)
    print("\nDifferences:")
    print_list_differences(spacy_tokens, stanford_tokens)
    # Blank lines to separate the output of consecutive documents.
    print("\n")

### Sentence splitting comparison

In [None]:
def get_spacy_doc_sentences(doc):
    """Return the ``text`` of every sentence span in ``doc.sents``, in order."""
    sentence_texts = []
    for span in doc.sents:
        sentence_texts.append(span.text)
    return sentence_texts

def get_stanford_doc_sentences(doc):
    """Return one string per sentence of a stanfordnlp document.

    Each sentence is rebuilt by joining the ``text`` of its tokens with a
    single space. This introduces whitespace that may not be present in the
    original text (for example before punctuation). To recover the original
    sentence text, see
    https://github.com/stanfordnlp/stanfordnlp/blob/dev/stanfordnlp/models/common/doc.py

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``tokens``
            whose items have a ``text`` attribute.

    Returns:
        A list of space-joined sentence strings, in document order.
    """
    # Read from the `doc` argument rather than a notebook-level variable, so
    # the result always describes the document that was passed in.
    return [" ".join(token.text for token in sentence.tokens)
            for sentence in doc.sentences]

In [None]:
# For each test document, print the number of sentences found by each library
# followed by the list of sentence strings.
for spacy_doc, stanford_doc in zip(spacy_docs, stanford_docs):
    spacy_sentences = get_spacy_doc_sentences(spacy_doc)
    print("spaCy:", len(spacy_sentences), "\n", spacy_sentences)
    stanford_sentences = get_stanford_doc_sentences(stanford_doc)
    print("stanfordnlp:", len(stanford_sentences), "\n", stanford_sentences)
    # Blank lines to separate the output of consecutive documents.
    print("\n")

### PoS comparison

In [None]:
def get_spacy_doc_pos(doc):
    """Return the ``tag_`` value of every token in *doc*, in order.

    NOTE(review): spaCy documents ``tag_`` as the fine-grained tag and
    ``pos_`` as the coarse-grained one - confirm that ``tag_`` is the
    granularity intended for this comparison.
    """
    tags = []
    for tok in doc:
        tags.append(tok.tag_)
    return tags

def get_stanford_doc_pos(doc):
    """Return the ``pos`` value of every word in *doc* as one flat list.

    Words are read from ``sentence.words`` (not ``sentence.tokens``) for each
    sentence in ``doc.sentences``, in order.
    """
    tags = []
    for sent in doc.sentences:
        for wrd in sent.words:
            tags.append(wrd.pos)
    return tags

In [None]:
# For each test document, print the document text followed by the tag
# sequence produced by each library.
for spacy_doc, stanford_doc in zip(spacy_docs, stanford_docs):
    spacy_pos = get_spacy_doc_pos(spacy_doc)
    print(spacy_doc.text)
    print("spaCy:", spacy_pos)
    stanford_pos = get_stanford_doc_pos(stanford_doc)
    print("stanfordnlp:", stanford_pos)
    # Blank lines to separate the output of consecutive documents.
    print("\n")

### Dependency Parsing Comparison

In [None]:
def get_spacy_doc_dep(doc):
    """Return the ``dep_`` label of every token in *doc*, in order."""
    labels = []
    for tok in doc:
        labels.append(tok.dep_)
    return labels

def get_stanford_doc_dep(doc):
    """Return the ``dependency_relation`` of every word in *doc* as a flat list.

    Words are read from ``sentence.words`` for each sentence in
    ``doc.sentences``, in order.
    """
    relations = []
    for sent in doc.sentences:
        for wrd in sent.words:
            relations.append(wrd.dependency_relation)
    return relations

In [None]:
# For each test document, print the document text followed by the sequence of
# dependency labels produced by each library.
for spacy_doc, stanford_doc in zip(spacy_docs, stanford_docs):
    spacy_dep = get_spacy_doc_dep(spacy_doc)
    print(spacy_doc.text)
    print("spaCy:", spacy_dep)
    stanford_dep = get_stanford_doc_dep(stanford_doc)
    print("stanfordnlp:", stanford_dep)
    # Blank lines to separate the output of consecutive documents.
    print("\n")