In [None]:
# Copyright (C) 2024  Jose Ángel Pérez Garrido
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Dependency parsing

**Goal**: implement a neural parser that outputs the dependency syntactic structure (in CoNLL-U format) of a given input sentence.

**USAGE GUIDELINES**

This program supports the following functionalities. To use one of them, go to the corresponding section and follow the instructions:
1. (Section 1) Train a model and save it as a pickle file.
2. (Section 2) Predict the outputs for the test set and report the standard metrics for parsing: labelled (LAS) and unlabeled attachment scores (UAS).

**NOTE: First, make sure you configure and execute the following cells in order to setup the working environment.**


In [None]:
# SET VARIABLES
# @title Setup working environment
# @markdown ### General configuration
workspace_folder = "/content/drive/MyDrive/src" # @param {type:"string"}
# @markdown workspace_folder = Directory which contains the application .py files (fileparser.py, preprocess.py, dp_model.py...)
# @markdown - Note: You should set the absolute path where you have saved the required modules, taking into account that, in *Google Colaboratory*, you have direct access to the folder `/content/`
# NOTE(review): later cells append this folder to `sys.path` and then import `src.<module>`,
# and the evaluation cell runs `{workspace_folder}/src/conll18_ud_eval.py`, so this folder
# presumably has to contain a `src/` sub-directory holding the modules — confirm the default.

In [None]:
# Prepare the environment

# Mount drive folder
from google.colab import drive
drive.mount('/content/drive')

# Uninstall the current version of TensorFlow.
#!pip uninstall tensorflow -2.13.0

# Install TensorFlow 2.12.0.
#!pip install tensorflow==2.12.0

# Install required libraries
!pip install conllu==4.5.3
!pip install pydot==1.4.2
!pip install pyparsing==3.1.1
!pip install tqdm==4.66.1

import os, sys, pickle, subprocess
import matplotlib.pyplot as plt

# Verify the TensorFlow version.
import tensorflow as tf
from tensorflow.python import keras
print(tf.__version__)

# Set the model output location folder
modelfolder = "/content/Model_output"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2.15.0


In [None]:
# Import developed Python classes
# The project modules are imported as the `src` package, so the directory added to
# `sys.path` presumably has to be the parent of `src/` — TODO confirm the folder layout.
sys.path.append(workspace_folder)

# NOTE(review): wildcard imports hide which module provides the names used by the
# following cells (e.g. `Conllu_parser`, `DPModel`, `Arc_Eager_Oracle`), and a module
# imported later silently shadows same-named symbols of an earlier one, so the order
# of these lines must be kept.
from src.preprocess import *
from src.postprocess import *
from src.dependency_tree import *
from src.dp_model import *
from src.fileparser import *
from src.oracle import *

**Data preparation**

Download the dataset (`conllu` files) from the *GitHub* of Universal Dependencies (UD). We use the utility function [`tensorflow.keras.utils.get_file()`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file) to download the files.

In [None]:
# Load the datasets

# Set the datasets location folder
datafolder = "/content/Datasources"

# Root of the Universal Dependencies repositories (raw file access on GitHub)
ud_base_url = "https://raw.githubusercontent.com/UniversalDependencies"

# Treebank (repository) name -> file-name prefix used inside that repository
treebank_prefixes = {
    "UD_English-ParTUT": "en_partut",
    "UD_English-EWT": "en_ewt",
    "UD_English-GUM": "en_gum",
    "UD_Russian-SynTagRus": "ru_syntagrus",
    "UD_Russian-Taiga": "ru_taiga",
    "UD_Spanish-AnCora": "es_ancora",
    "UD_Spanish-GSD": "es_gsd",
}

# Remote split names that differ from the local file name: the local
# `train.conllu` of SynTagRus is taken from the remote `...-ud-train-a.conllu`.
# NOTE(review): only this one training file is downloaded for SynTagRus —
# confirm that using just the `train-a` file is intended.
remote_split_overrides = {
    ("UD_Russian-SynTagRus", "train"): "train-a",
}

# Treebanks to download (same list and order as the mapping above)
languages = list(treebank_prefixes)

# For every treebank: create its folder and download the dev/test/train files.
# `get_file` stores each file at the absolute path given in `fname`.
for treebank in languages:
    os.makedirs(f"{datafolder}/{treebank}", exist_ok=True)

    for split in ("dev", "test", "train"):
        remote_split = remote_split_overrides.get((treebank, split), split)
        tf.keras.utils.get_file(
            fname=f"{datafolder}/{treebank}/{split}.conllu",
            origin=f"{ud_base_url}/{treebank}/master/{treebank_prefixes[treebank]}-ud-{remote_split}.conllu",
            extract=False
        )

'/content/Datasources/UD_Spanish-GSD/train.conllu'

////////////////////////////////////////////////////////////////////////////////////////////////////////

# 1. TRAIN A MODEL

**(Requirements) Setup the training module:**

First of all, you must configure the variables in the following cell to execute the program. Then, you can start the following cells to train the model. An explanation of the different variables is provided below:

---
### Dataset configuration
*   language = The treebank dataset to be selected
---
### Model topology
*   sigma_size = Elements taken from the stack each training step
*   beta_size = Elements taken from the buffer each training step
*   num_dense_layers = Number of additional hidden Dense layers
*   dropout_rate = Rate for the Dropout layer
*   dense_units = Units for every hidden TimeDistributed Dense layers
---
### Model hyperparameters
*   optimizer = Algorithm that will adjust the model's parameters during training to minimize the loss function
* epochs = Number of times the training data passes through the algorithm
* batch_size = Number of samples that will be propagated through the network

We will use the loss function "sparse_categorical_crossentropy" and the metric "accuracy".





In [None]:
# SET VARIABLES
# @title Training module configuration. Set the following variables
# @markdown ### Dataset configuration
language = "UD_English-ParTUT" # @param ["UD_English-ParTUT","UD_English-EWT","UD_Russian-SynTagRus","UD_Spanish-AnCora"]

# @markdown ---
# @markdown ### Model topology
sigma_size = 2 # @param {type:"number", min:1, max:20, step:1}
beta_size = 2 # @param {type:"number", min:1, max:20, step:1}
num_dense_layers = 0 # @param {type:"number", min:0, max:20, step:1}
dropout_rate = 0.0 # @param {type:"number", min:0, max:1, step:0.1}

# @markdown ---
# @markdown ### Model hyperparameters
optimizer = "adam" # @param ["adam", "sgd", "rmsprop", "adadelta", "adagrad", "adamax", "adafactor", "nadam", "ftrl"]
epochs = 15 # @param {type:"slider", min:1, max:100, step:1}
batch_size = 32 # @param {type:"number", min:1, max:512, step:1}

# Compilation/training settings handed to `model.train(...)`.
# The loss and the metric are fixed; the rest comes from the form above.
hyperparameters = {
    "loss" : "sparse_categorical_crossentropy",
    "optimizer" : optimizer,
    "metrics" : ["accuracy"],
    "epochs" : epochs,
    "batch_size" : batch_size
}

# Set topology (handed to `model.build_model(...)`).
# The sizes and the layer count are integers, but the dropout rate is a fraction
# in [0, 1] and must stay a float: casting it to int would turn every rate
# below 1 (e.g. 0.3) into 0 and silently disable dropout.
topology ={
    "sigma_size" : int(sigma_size),
    "beta_size" : int(beta_size),
    "num_dense_layers" : int(num_dense_layers),
    "dropout_rate" : float(dropout_rate)
}

**2. Process the treebank**

Secondly, we preprocess the treebank dataset. This whole process will return the sentences structured in a TokenList format (dictionary composed of the different CoNLL-U fields).

We make use of the class `Conllu_parser()` available in the file `preprocess.py` to extract the data (tokens ID, FORM, UPOS, DEPREL and HEAD) from the different .conllu files for training and validation. In addition, this class adds the special ROOT item as ID 0 and makes sure to remove empty tokens and multi-word units.

In [None]:
# PREPROCESS INPUT SAMPLES
print("Loading",language,"dataset...")

# One parser instance reads both splits of the selected treebank
parser = Conllu_parser()
treebank_folder = f"{datafolder}/{language}"

# Training split
input_str = f"{treebank_folder}/train.conllu"
train_sentences = parser(input_str)

# Validation (dev) split
input_str = f"{treebank_folder}/dev.conllu"
val_sentences = parser(input_str)

Loading UD_English-ParTUT dataset...


Then we create a label dictionary which will allow us to make an ad-hoc mapping for values in columns FORM, UPOS and DEPREL, as well as Arc-Eager Oracle transitions. We only use the train targets to guarantee we are testing correctly the generalization capabilities of our model.

In [None]:
# Build the label dictionaries for the FORM, UPOS and DEPREL columns
print("Generating dictionaries for labeling columns FORM, UPOS and DEPREL...")

def _column_values(sentences, column):
    """Return, for every sentence, a generator over `column` of its tokens."""
    return [(token[column] for token in sentence) for sentence in sentences]

# FORM and UPOS labels start at 2; id 1 is given to the special "None" token
# (padding used when the stack or the buffer does not hold enough elements).
# NOTE(review): a treebank token whose form is literally "None" would have its
# id overwritten by the padding id here — verify whether that can happen.
form_dict = generate_dict(_column_values(train_sentences, "form"), 2)
form_dict["None"] = 1
upos_dict = generate_dict(_column_values(train_sentences, "upos"), 2)
upos_dict["None"] = 1
deprel_dict = generate_dict(_column_values(train_sentences, "deprel"))

# Fixed numbering of the four Arc-Eager transitions
print("Generating dictionary for labeling transitions...")
transition_dict = {
    name: index
    for index, name in enumerate(("LEFT_ARC", "RIGHT_ARC", "REDUCE", "SHIFT"))
}

Generating dictionaries for labeling columns FORM, UPOS and DEPREL...
Generating dictionary for labeling transitions...


Next we transform all the sentences into Dependency Trees.

In [None]:
# Build the reference dependency trees of the training and validation sentences
print("Creating Reference trees...")
reference_train_trees, reference_val_trees = (
    create_dependency_trees(sentences)
    for sentences in (train_sentences, val_sentences)
)

Creating Reference trees...


After that, we use the Arc-Eager oracle to create dependency transitions. In dependency parsing, this artifact refers to a method that provides the correct sequence of transitions needed to build the gold-standard (or manually annotated) dependency tree for a given sentence. The oracle guides the parsing algorithm, ensuring that it follows the correct steps to construct the accurate dependency tree according to the linguistic relations in the sentence.

The transition set T for the Arc-Eager algorithm contains four types of transitions:
- Transitions LEFT-ARC (for any dependency label l) add a dependency arc (j, l, i) to A, where i is the node on top of the stack σ and j is the first node in the buffer β. In addition, they pop the stack σ. They have as a precondition that the token i is not the artificial root node 0 and does not already have a head.
-  Transitions RIGHT-ARC (for any dependency label l) add a dependency arc (i, l, j) to A, where i is the node on top of the stack σ and j is the first node in the buffer β. In addition, they remove the first node j in the buffer β and push it on top of the stack σ. They have as a precondition that the token j does not already have a head.
- Transitions REDUCE pop the stack σ and are subject to the precondition that the top token has a head.
- The transition SHIFT removes the first node i in the buffer β and pushes it on top of the stack σ.

We will obtain, for each dependency tree, a set of pairs composed of a state and the corresponding transition to apply.

In [None]:
# Instantiate the Arc-Eager oracle
print("Creating Arc-Eager Oracle...")
oracle = Arc_Eager_Oracle()

# Run the oracle over the reference trees of each split. It returns two
# collections, states and transitions; each (state, transition) pair is a sample.
print("Preprocessing sentences with Arc-Eager Oracle...")
train_oracle_output = oracle(reference_train_trees)
train_states, train_transitions = train_oracle_output
val_oracle_output = oracle(reference_val_trees)
val_states, val_transitions = val_oracle_output

Creating Arc-Eager Oracle...
Preprocessing sentences with Arc-Eager Oracle...


To end with, we apply a preprocessing function to obtain input and target features required for training.
- Input features are composed of the word forms and POS tags of the previously specified number of elements in both the stack and the buffer. If there are not enough elements, the missing positions are padded with a 'None' token.
- Target features are composed of tuples of two elements: the Arc-Eager transition (LEFT-ARC, RIGHT-ARC, REDUCE or SHIFT) and its corresponding dependency relation, if any.

In [None]:
# Create samples
print("Creating input and target samples...")

def _encode_split(states, transitions):
    """Encode one split and return its (inputs, targets) pair.

    Inputs are built from the oracle states with the configured stack (sigma)
    and buffer (beta) window sizes; targets are built from the oracle
    transitions (transition_targets, relation_targets).
    """
    inputs = preprocess_inputs(states, form_dict, upos_dict, topology["sigma_size"], topology["beta_size"])
    targets = preprocess_targets(transitions, deprel_dict, transition_dict)
    return inputs, targets

x_train, y_train = _encode_split(train_states, train_transitions)
x_val, y_val = _encode_split(val_states, val_transitions)


Creating input and target samples...


**3. Train a DP model**

Now we create the architecture, compile and train the model.

In [None]:
# Instantiate the dependency-parsing model with the label dictionaries and
# build its network from the configured topology
print("Creating model...")
model = DPModel(form_dict,upos_dict,deprel_dict, transition_dict)
model.build_model(topology)

# Fit the model on the training samples, validating on the dev samples
print("TRAINING...")
training_data = (x_train, y_train)
validation_data = (x_val, y_val)
history = model.train(training_data, validation_data, hyperparameters)

Creating model...
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 8)]                  0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 8, 2)                 13920     ['input_2[0][0]']             
                                                                                                  
 flatten_1 (Flatten)         (None, 16)                   0         ['embedding_1[0][0]']         
                                                                                                  
 dense_2 (Dense)             (None, 8)                    136       ['flatten_1[0][0]']           
                                                                          

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 10: early stopping


Finally we use the library `pickle` to serialize and save the model class into `/content/Model_output/` to be able to use it in future predictions. Furthermore, we generate plots to visualize the evolution of training and validation loss and accuracy for both outputs.

In [None]:
# Serialize the trained model object to `<modelfolder>/<language>.pickle`
print("Saving model as a pickle file...")
model_path = f"{modelfolder}/{language}.pickle"

# Make sure the output folder exists before writing into it
if not os.path.exists(modelfolder):
    os.makedirs(modelfolder)

with open(model_path, "wb") as data_file:
    pickle.dump(model,data_file)

Saving model as a pickle file...


In [None]:
# Generate training plots
print("Generating training plots...")

# Folder where the figures of the selected treebank are stored
plots_folder = f"{datafolder}/Plots/{language}"

def _save_history_plot(metric, title, filename):
    """Plot `metric` of both model outputs (train and validation) and save it.

    Four curves are drawn from `history.history`, in this order: relation
    (train, validation) and transition (train, validation). The figure is
    written to `plots_folder/filename` and then closed.
    """
    for output in ("relation", "transition"):
        plt.plot(history.history[f"{output}_output_{metric}"])
        plt.plot(history.history[f"val_{output}_output_{metric}"])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train_relation', 'val_relation','train_transition', 'val_transition'], loc='upper left')

    # Save plot (the folder is created the first time it is needed)
    if not os.path.exists(plots_folder):
        os.makedirs(plots_folder)
    plt.savefig(f"{plots_folder}/{filename}")
    plt.close()

# summarize history for accuracy
_save_history_plot("accuracy", 'model accuracy for relation/transition outputs', "Plot_accuracy.png")

# summarize history for loss
_save_history_plot("loss", 'model loss', "Plot_loss.png")

Generating training plots...


# 2. EVALUATE A MODEL

**(Requirements) Set up the evaluation module:**

First of all, you must configure the variables in the following cell to execute the program. Then, you can start the following cells to evaluate the model. An explanation of the different variables is provided below:

*   dp_model = Dependency Parsing model to be evaluated, previously trained and saved as a pickle file in the `Model_output` folder
*   language_samples = The treebank dataset to be selected





In [None]:
# SET VARIABLES
# @title Evaluation module configuration. Set the following variables
# @markdown ### Model configuration
# Name of the trained model to load; it is read from `{modelfolder}/{dp_model}.pickle`.
dp_model = "UD_English-ParTUT" # @param ["UD_English-ParTUT","UD_English-EWT","UD_Russian-SynTagRus","UD_Spanish-AnCora"]
# @markdown - Note: The trained models must be saved into `/content/Model_output/{model_name}.pickle`

# @markdown ### Dataset configuration
# Treebank whose `test.conllu` file is parsed and evaluated; it may differ from `dp_model`.
language_samples = "UD_English-ParTUT" # @param ["UD_English-ParTUT","UD_English-EWT","UD_Russian-SynTagRus","UD_Spanish-AnCora"]

First, we load the previously trained model using `pickle` library.

In [None]:
# Restore the trained model object selected in the evaluation configuration
print("Loading pickle model...")
model_path = f"{modelfolder}/{dp_model}.pickle"

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this notebook or come from a trusted source.
with open(model_path, "rb") as data_file:
    model = pickle.load(data_file)

Loading pickle model...


As explained in the previous section, we preprocess the treebank dataset. This whole process will return the data structured into pairs with:
- States
- The corresponding Arc-Eager transition



In [None]:
# Parse the test split of the treebank selected for evaluation
print("Preprocessing test samples...")
parser = Conllu_parser()
test_folder = f"{datafolder}/{language_samples}"

# Raw test file, plus the path of its preprocessed copy; the latter is the gold
# file later handed to the evaluation script.
# NOTE(review): the second argument presumably makes the parser write the
# preprocessed sentences to that path — confirm in the parser module.
input_str = f"{test_folder}/test.conllu"
input_preprocessed_str = f"{test_folder}/test-prep.conllu"
test_sentences = parser(input_str, input_preprocessed_str)

Preprocessing test samples...


After that, we predict the outputs for the test set.

In [None]:
# Get output Dependency tree
# Run the loaded model over the parsed test sentences. The result is kept in
# `output_trees` and passed to the heuristics step before being written out
# (presumably one predicted tree per sentence — TODO confirm).
print("Making prediction...")
output_trees = model.predict(test_sentences)

Making prediction...


After that, we apply a set of heuristics to clean the output trees since it is possible that when making predictions we may obtain a corrupted tree.

In [None]:
# Post-process the predicted trees: `apply_heuristics` receives the test sentences
# and the raw predictions, and its result is what gets written to the output file.
# NOTE(review): which repairs are applied is defined in the project module, not
# here — confirm there.
print("Applying heuristics...")
output_trees_corrected = apply_heuristics(test_sentences,output_trees)

Applying heuristics...


Finally, we execute the official evaluation script conll18_ud_eval.py, which
is available at: https://universaldependencies.org/conll18/evaluation.html.

Note the script only accepts valid output in the CoNLL-U format for all test sets.

In this way, we will obtain several performance metrics. For our use case we are interested in LAS (Labeled Attachment Score) and UAS (Unlabeled Attachment Score).


In [None]:
# Write the corrected predictions as a CoNLL-U file next to the test data
print("Writing output CoNLL-U file...")
writer = Conllu_writer()
output_str = str(f"{datafolder}/{language_samples}/test-results.conllu")
writer(output_str,test_sentences,output_trees_corrected)

# Run the official CoNLL 2018 evaluation script as a separate process:
# gold file = preprocessed test set, system file = the predictions just written.
print("Evaluating results...")
# The script path is kept in `eval_script` (not `eval`) so the Python builtin
# `eval` is not shadowed for the rest of the kernel session.
eval_script = f"{workspace_folder}/src/conll18_ud_eval.py"
# `sys.executable` runs the script with the interpreter of this kernel instead of
# whatever `python` happens to be first on PATH.
comand = [sys.executable, eval_script, input_preprocessed_str, output_str, "-v"]
result = subprocess.run(comand, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

# Make a failed evaluation visible instead of leaving only an empty report
if result.returncode != 0:
    print(f"Evaluation script exited with code {result.returncode}")

Writing output CoNLL-U file...
Evaluating results...
Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |    100.00 |    100.00 |    100.00 |    100.00
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |    100.00 |    100.00 |    100.00 |    100.00
AllTags    |    100.00 |    100.00 |    100.00 |    100.00
Lemmas     |    100.00 |    100.00 |    100.00 |    100.00
UAS        |     69.63 |     69.63 |     69.63 |     69.63
LAS        |     57.60 |     57.60 |     57.60 |     57.60
CLAS       |     45.99 |     41.27 |     43.50 |     41.27
MLAS       |     43.55 |     39.08 |     41.20 |     39.08
BLEX       |     45.99 |     41.27 |     43.50 |     41.27


