# Text2Scene
Dies ist die Lösung der Fingerübung für das Praktikum Text2Scene des Sommersemesters 2021.

Autor: Xuan Anh Nguyen <br>
Email: xuananh6077@stud.uni-frankfurt.de

In [None]:
import numpy
import matplotlib.pyplot as plt
import torch
import tensorflow as tf
import spacy
import networkx as nx
import xml.etree.ElementTree as ET
import itertools
import os
import pathlib
import random
from tqdm import tqdm
from spacy.training import Example


In [None]:
def count_pos(ls):
    """Count how often each distinct item occurs in ``ls``.

    Args:
        ls: Iterable of hashable items (e.g. PoS tags, XML tag names or
            sentence lengths).

    Returns:
        A plain ``dict`` mapping every distinct item to its number of
        occurrences; an empty dict if ``ls`` is empty.
    """
    # Local import keeps this definition self-contained.
    from collections import Counter

    # One pass over the data instead of one full ``ls.count`` scan per
    # distinct item.
    return dict(Counter(ls))

## Aufgabe 2.2 Vorverarbeitung
Einlesen der Trainingsdaten sowie Training des Modells mit dem NLP-Paket **spaCy**


In [None]:
# read all valid files used for training

train_data = []  # (text, {'entities': [(start, end, tag), ...]}) per file, used for training
full_data = []   # parsed XML root element of every file
poss_tags = []   # distinct tag names found under root[1], in first-seen order

# Directory to scan. Kept separate from `root`, which holds the XML root
# element of the file that is currently being processed.
search_root = pathlib.Path().absolute()
for subdir, dirs, files in os.walk(search_root):
    # Prune hidden directories in place so os.walk never descends into them.
    dirs[:] = [d for d in dirs if not d.startswith('.')]

    for filename in files:
        # skip hidden files and everything that is not an .xml file
        if filename.startswith('.') or not filename.endswith(".xml"):
            continue
        filepath = os.path.join(subdir, filename)

        # read and parse the xml file
        tree = ET.parse(filepath)
        root = tree.getroot()
        full_data.append(root)

        # Label the text with entities: root[0] is read as the text element,
        # root[1] as the container of the annotation elements.
        entities = []
        for elem in root[1]:
            start, end = elem.get('start'), elem.get('end')
            # keep only annotations that carry a real character span
            if start is not None and end is not None and start != '-1' and end != '-1':
                entities.append((int(start), int(end), elem.tag))
            if elem.tag not in poss_tags:
                poss_tags.append(elem.tag)

        # save the 2 special xml files extra so we can access them easier later
        if filepath.endswith("Bicycles.xml"):
            print("confirmed")
            special1 = root
        elif filepath.endswith("Highlights_of_the_Prado_Museum.xml"):
            print("confirmed")
            special2 = root

        train_data.append((root[0].text, {'entities': entities}))

print(poss_tags)

In [None]:
# train the model with our training data
# NOTE: the whole training script below is wrapped in a triple-quoted string,
# so this cell is only a string expression and none of it is executed.
# NOTE(review): before re-enabling it, check the disabled code itself: the
# `doc = nlp.make_doc(root[0].text)` assignment is never used, and the bare
# `except: pass` around `nlp.update` hides every training error.
"""
model = None
n_iter = 1000

if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

#set up the pipeline

if 'ner' not in nlp.pipe_names:
    nlp.add_pipe('ner', last=True)
ner = nlp.get_pipe('ner')

for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])
     
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        doc = nlp.make_doc(root[0].text)
        for text, annotations in train_data:
            try:
                nlp.update(
                    [Example.from_dict(nlp.make_doc(text), annotations)],  
                    drop=0.4,  
                    sgd=optimizer,
                    losses=losses)
            except:
                pass
        print('\rEpisode {}\tLosses: {:.2f}'.format(itn, losses['ner']), end="")
"""

In [None]:
# Alternatively load pretrained English tokenizer, tagger, parser and NER
# Binds the "en_core_web_sm" pipeline to `nlp`; the evaluation cells below
# call `nlp(...)` on each document text.
nlp = spacy.load("en_core_web_sm")

## 2.3 Auswertung

Wie oft kommen welche PoS-Tags vor?

In [None]:
# Collect the coarse PoS tag of every token in every training text,
# then tally how often each tag occurs.
pos_tags = []
for text, _ in train_data:
    doc = nlp(text)
    pos_tags.extend(token.pos_ for token in doc)

pos = count_pos(pos_tags)
print("Aufgabe 1:")
print(*pos.items(), sep="\n")
    

Wie viele [SpatialEntities, Places, Motions, Locations, Signals, QsLinks, OLinks] gibt es?

In [None]:
# Gather the tag name of every annotation element (children of data[1])
# across all parsed files, then tally them per tag.
tag_names = []
for data in full_data:
    tag_names.extend(elem.tag for elem in data[1])

ents = count_pos(tag_names)
print("Aufgabe 2:")
print(*ents.items(), sep="\n")

Wie oft kommen welche QsLink-Typen vor (DC, EC, ...)?

In [None]:
# Collect the 'relType' attribute of every QSLINK element across all files
# (None if an element has no such attribute), then tally per relation type.
rel_types = []
for data in full_data:
    rel_types.extend(
        elem.get('relType') for elem in data[1] if elem.tag == "QSLINK"
    )

links = count_pos(rel_types)
print("Aufgabe 3:")
print(*links.items(), sep="\n")

Verteilung der Satzlänge graphisch darstellen (x: Satzlänge, y: Wie häufig)?

In [None]:
# One entry per sentence: its length measured as (number of blanks + 1).
sents = []
for text, _ in train_data:
    doc = nlp(text)
    for sentence in doc.sents:
        # only the ' ' character is counted; other whitespace is ignored
        sents.append(sentence.text.count(' ') + 1)

# map each length to the number of times it appeared
distribution = count_pos(sents)
lengths = list(distribution.keys())
frequencies = list(distribution.values())

plt.bar(lengths, frequencies)
plt.title("Satzlängen")
plt.xlabel("Satzlängen")
plt.ylabel("Häufigkeiten")
plt.show()

Welche Links (QSLinks, OLinks) werden von welchen Präpositionen (markiert durch SPATIAL_SIGNAL) getriggert (z.B. wie oft werden QSLinks durch die Präposition „on“ getriggert)?

In [None]:
qs_trigger = []  # SPATIAL_SIGNAL texts that trigger a QSLINK
o_trigger = []   # SPATIAL_SIGNAL texts that trigger an OLINK

# collect all triggers, file by file (signal ids are resolved per file)
for data in full_data:
    qs_ids = []       # trigger ids referenced by QSLINK elements
    o_ids = []        # trigger ids referenced by OLINK elements
    signal_text = {}  # SPATIAL_SIGNAL id -> its text

    for elem in data[1]:
        if elem.tag == "QSLINK":
            qs_ids.append(elem.attrib['trigger'])
        elif elem.tag == "OLINK":
            o_ids.append(elem.attrib['trigger'])
        elif elem.tag == "SPATIAL_SIGNAL":
            signal_text[elem.get('id')] = elem.get('text')

    # Replace each trigger id with its signal text. Ids without a matching
    # SPATIAL_SIGNAL in this file (e.g. empty triggers) are skipped on purpose;
    # the explicit membership test replaces the former bare `except: pass`.
    qs_trigger.extend(signal_text[tid] for tid in qs_ids if tid in signal_text)
    o_trigger.extend(signal_text[tid] for tid in o_ids if tid in signal_text)

qs_trigger = count_pos(qs_trigger)
o_trigger = count_pos(o_trigger)

print("Aufgabe 5:")
print("\nQslink trigger:", *qs_trigger.items(), sep="\n")
print("\nOlink trigger:", *o_trigger.items(), sep="\n")

Welches sind die fünf häufigsten „MOTION“ Verben (und wie oft kommen diese vor)?

In [None]:
lemma_verbs = []   # lemmas of all tokens whose text matches a MOTION annotation
for data in full_data:
    # Distinct surface forms annotated as MOTION in this file. A set gives
    # constant-time membership tests in the token loop below.
    verbs = {elem.attrib['text'] for elem in data[1] if elem.tag == "MOTION"}

    # Only the lemma of each matching token goes into the result list.
    # NOTE(review): tokens are matched by text, not by character offsets, so
    # every occurrence of a matching word in the document is counted, and
    # multi-word MOTION texts presumably never match a single token.
    doc = nlp(data[0].text)
    lemma_verbs.extend(token.lemma_ for token in doc if token.text in verbs)

lemma_verbs = count_pos(lemma_verbs)
# (lemma, count) pairs, most frequent first
lemma_verbs = sorted(lemma_verbs.items(), key=lambda x: x[1], reverse=True)
print("Aufgabe 6:", *lemma_verbs[0:5], sep="\n")

## 2.4 Visualisierung

Graphische Darstellung von Verbindungen zwischen Entitäten

**Bicycles.xml**

In [None]:
print(special1)

# Node colour per annotation tag, in the order the tags are added to the graph.
tag_colors = {
    "SPATIAL_ENTITY": 'r',
    "PLACE": 'b',
    "LOCATION": 'g',
    "PATH": 'w',
    "NONMOTIONEVENT": 'y',
}

G = nx.Graph()
counter = 0  # not used in this cell; kept in case a later cell reads it
node_colors = {}  # node text -> colour of the first tag it appeared under

for tag, color in tag_colors.items():
    for elem in special1[1]:
        if elem.tag == tag:
            text = elem.attrib['text']
            G.add_node(text)
            # The same text can occur under several tags; the first tag wins.
            node_colors.setdefault(text, color)

# One colour per node, aligned with the graph's node order. The graph is drawn
# once; drawing after every tag group repainted all nodes in a single colour
# on the same axes each time.
color_map = [node_colors[node] for node in G.nodes]
nx.draw(G, node_color=color_map, with_labels=True)

**Highlights_of_the_Prado_Museum.xml**