# Imports


In [1]:
import tensorflow as tf
import keras

from process_data import ProcessData
from dependency_parser import DependencyParser

2023-12-20 10:16:36.671847: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-20 10:16:37.199571: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-20 10:16:37.202957: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# TensorFlow and Keras versions


In [2]:
# Report the installed library versions, one "<label> <version>" line per library.
print("\n".join(
    "{} {}".format(label, module.__version__)
    for label, module in (("Tensorflow ", tf), ("Keras ", keras))
))

Tensorflow  2.12.0
Keras  2.12.0


# Execution pipeline


In [3]:
def obtain_data_url():
    """Return the URLs of the UD English-ParTUT CoNLL-U files.

    The files are addressed on the ``master`` branch of the
    ``UniversalDependencies/UD_English-ParTUT`` GitHub repository.

    Returns:
        A ``(train, test, dev)`` tuple of URL strings, in that order.
    """
    urls_by_split = {
        "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu",
        "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-test.conllu",
        "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-dev.conllu",
    }

    # Callers unpack positionally, so the order train -> test -> dev is part of the contract.
    return urls_by_split["train"], urls_by_split["test"], urls_by_split["dev"]

In [6]:
def select_model(n_features=2, new_model=False, new_samples=False):
    """Return a ``DependencyParser``, loading a saved model or training a new one.

    Unless ``new_model`` is set, the function first tries to load the model
    stored at ``models/<n_features>_features_parser.h5`` together with the
    tokenizer at ``models/tokenizer.pickle`` and evaluates it on the test
    split. If that attempt raises an ordinary exception, or if ``new_model``
    is set, a new model is trained, evaluated on the test split and saved to
    those same two paths.

    Args:
        n_features: Forwarded as ``n_features`` to the parser's data
            preparation, training, prediction and evaluation calls; it is
            also part of the model file name. Must be greater than 0.
        new_model: When truthy, skip the load attempt and always train.
        new_samples: Forwarded to ``ProcessData.create_samples``.

    Returns:
        The ``DependencyParser`` instance, or ``None`` when the dataset URLs
        could not be obtained.

    Raises:
        ValueError: If ``n_features`` is not greater than 0.
    """
    if n_features <= 0:
        raise ValueError("Number of features must be over 0")

    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still stop the cell.
    try:
        train_file_url, test_file_url, dev_file_url = obtain_data_url()
    except Exception as exc:
        print("Data not found, please check the url")
        print("Reason: {}: {}".format(type(exc).__name__, exc))
        return None
    data_urls = (train_file_url, test_file_url, dev_file_url)

    model_path = "models/" + str(n_features) + "_features_parser.h5"
    # NOTE(review): the tokenizer path does not depend on n_features, so saving
    # here overwrites the tokenizer file used for every other n_features value.
    tokenizer_path = "models/tokenizer.pickle"

    if new_model:
        print("Creating a new model, the old one will be deleted")
    else:
        try:
            return _load_and_evaluate_parser(
                data_urls, n_features, new_samples, model_path, tokenizer_path
            )
        except Exception as exc:
            # Any ordinary failure while loading or evaluating falls back to
            # training; the cause is printed instead of being swallowed.
            print("No model found, creating a new one")
            print("Reason: {}: {}".format(type(exc).__name__, exc))

    return _train_and_save_parser(
        data_urls, n_features, new_samples, model_path, tokenizer_path
    )


def _load_and_evaluate_parser(data_urls, n_features, new_samples, model_path, tokenizer_path):
    """Load the saved model and tokenizer, then evaluate them on the test split."""
    train_file_url, test_file_url, dev_file_url = data_urls

    # Only the test split is read: nothing is trained on this path.
    process_data = ProcessData(train_file_url, test_file_url, dev_file_url)
    process_data.read_conllu_file(type_file="test")
    process_data.create_samples("test", new_samples = new_samples)

    parser = DependencyParser(process_data)
    parser.load_model(model_path)
    parser.load_tokenizer(tokenizer_path)
    print("Model found, loading it")

    parser.prepare_test_data(n_features=n_features)
    parser.evaluate_model()

    # CoNLL-U evaluation of the predictions made for the test-split dataframes.
    predictions = parser.predict(process_data.test_data["dataframes"], n_features=n_features)
    parser.conllu_evaluation(predictions, n_features=n_features)
    return parser


def _train_and_save_parser(data_urls, n_features, new_samples, model_path, tokenizer_path):
    """Train a new model on the train split, evaluate it on the test split and save it."""
    train_file_url, test_file_url, dev_file_url = data_urls
    splits = ("train", "test", "dev")

    process_data = ProcessData(train_file_url, test_file_url, dev_file_url)

    # Read every file first, then build the samples, in train -> test -> dev order.
    for split in splits:
        process_data.read_conllu_file(type_file=split)
    for split in splits:
        process_data.create_samples(split, new_samples = new_samples)

    parser = DependencyParser(process_data)

    # The tokenizer is created from the training words.
    parser.create_tokenizer(process_data.train_data["words"])
    parser.prepare_data(n_features=n_features)
    parser.create_and_fit_model(n_features = n_features)

    # CoNLL-U evaluation of the predictions made for the test-split dataframes.
    predictions = parser.predict(process_data.test_data["dataframes"], n_features=n_features)
    parser.conllu_evaluation(predictions, n_features=n_features)

    # Persist the model and tokenizer only after evaluation has completed.
    parser.save_model(model_path)
    parser.save_tokenizer(tokenizer_path)
    return parser

In [7]:
# Reuse the saved 2-feature model when it can be loaded; otherwise select_model trains and saves a new one.
dependencyParser = select_model(n_features=2, new_model=False, new_samples=False)

Model found, loading it
test loss, test acc: [1.8780925273895264, 0.7728299498558044, 1.1052625179290771, 0.8154198527336121, 0.799083948135376]


Evaluate on dev data
Predicting...
LAS F1 Score: 53.51
MLAS Score: 43.27
BLEX Score: 45.12
Metric     | Precision |    Recall |  F1 Score | AligndAcc
-----------+-----------+-----------+-----------+-----------
Tokens     |    100.00 |    100.00 |    100.00 |
Sentences  |    100.00 |    100.00 |    100.00 |
Words      |    100.00 |    100.00 |    100.00 |
UPOS       |    100.00 |    100.00 |    100.00 |    100.00
XPOS       |    100.00 |    100.00 |    100.00 |    100.00
UFeats     |    100.00 |    100.00 |    100.00 |    100.00
AllTags    |    100.00 |    100.00 |    100.00 |    100.00
Lemmas     |    100.00 |    100.00 |    100.00 |    100.00
UAS        |     65.09 |     65.09 |     65.09 |     65.09
LAS        |     53.51 |     53.51 |     53.51 |     53.51
CLAS       |     51.19 |     40.33 |     45.12 |     40.33
MLAS       |     49.09 |