In [2]:
import pandas as pd
from pathlib import Path
import json
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.utils import set_random_seed
from typing import List
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Dense, Dropout, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1_l2

# Fix the random seed up front (keras.utils.set_random_seed) so repeated runs are comparable.
set_random_seed(42)

2025-05-29 19:24:29.967729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748539469.982589 3277251 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748539469.986952 3277251 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748539469.998467 3277251 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748539469.998495 3277251 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748539469.998496 3277251 computation_placer.cc:177] computation placer alr

In [3]:
# Read the cached node-level tables and store "language" as a categorical column.
train_data = pd.read_csv("../data/cache/training_unwound.csv").astype({"language": "category"})
validation_data = pd.read_csv("../data/cache/validation_unwound.csv").astype({"language": "category"})
train_data.head()

Unnamed: 0,row_index,node,is_root,language,tree_diameter,tree_size,tree_edges,number_of_centroids,average_degree,number_of_leaves,...,harmonic_centrality,betweenness_centrality,pagerank,katz_centrality,current_flow_closeness,current_flow_betweenness,load_centrality,percolation_centrality,second_order_centrality,laplacian_centrality
0,0,6,False,Japanese,14,23,22,1,1.913043,6,...,5.823846,0.090909,0.048565,0.209086,0.007246,0.090909,0.090909,0.090909,98.762341,0.101449
1,0,4,False,Japanese,14,23,22,1,1.913043,6,...,4.561122,0.0,0.027162,0.188298,0.006289,0.0,0.0,0.0,112.48111,0.043478
2,0,2,False,Japanese,14,23,22,1,1.913043,6,...,6.991703,0.255411,0.066901,0.22866,0.008403,0.255411,0.255411,0.255411,84.451169,0.15942
3,0,23,False,Japanese,14,23,22,1,1.913043,6,...,5.157179,0.0,0.025477,0.190256,0.007143,0.0,0.0,0.0,100.149888,0.057971
4,0,20,False,Japanese,14,23,22,1,1.913043,6,...,7.146825,0.311688,0.042552,0.213357,0.009615,0.311688,0.311688,0.311688,71.147734,0.130435


In this notebook, we'll fit an MLP model to the dataset to predict whether or not a node is the root.

In [4]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the 'language' column with one indicator column per language.

    The indicator columns are named after the language values themselves
    (empty prefix and separator) and no level is dropped. All other columns
    are passed through unchanged.
    """
    encoded = pd.get_dummies(
        data=df,
        columns=["language"],
        prefix="",
        prefix_sep="",
        drop_first=False,
    )
    return encoded

We want to find the best hyperparameters for the MLP model. To do this, we will use a grid search with 5-fold cross-validation.

The search space, however, is quite large, so the search has to run in the background. Take a look at the `run_mlp.py` script to see the code, which was written so that it can be launched with `nohup` in the background.

Let's look at the results from the grid search.

In [5]:
# Gather the averaged cross-validation metrics of every grid-search configuration.
grid_search_data = Path("../data/models/mlp/gridsearch-1/")
configurations = [f for f in grid_search_data.iterdir() if f.is_dir()]

config_results = []

for config in configurations:
    metrics_file = config / "metrics.json"
    # A folder without metrics.json has no results to report (presumably the
    # run is still in progress or was interrupted), so skip it instead of crashing.
    if not metrics_file.is_file():
        continue
    with open(metrics_file, "r") as f:
        metrics = json.load(f)
    config_results.append(
        {
            "configuration": config,
            "average_loss": metrics["average_loss"],
            "average_accuracy": metrics["average_accuracy"],
        }
    )

if not config_results:
    # Fail with a clear message rather than a KeyError from sorting an empty frame.
    raise FileNotFoundError(f"No metrics.json found in any configuration under {grid_search_data}")

# Best configuration first. Build the sorted frame in one expression instead of
# mutating it in place, so re-running the cell always yields the same `df`.
df = pd.DataFrame(config_results).sort_values(by="average_accuracy", ascending=False)
df

Unnamed: 0,configuration,average_loss,average_accuracy
3798,../data/models/mlp/gridsearch-1/configuration-...,0.528808,0.871206
1391,../data/models/mlp/gridsearch-1/configuration-...,0.550175,0.846828
28,../data/models/mlp/gridsearch-1/configuration-...,0.553840,0.845469
712,../data/models/mlp/gridsearch-1/configuration-...,0.521032,0.842782
1427,../data/models/mlp/gridsearch-1/configuration-...,0.558003,0.838728
...,...,...,...
1531,../data/models/mlp/gridsearch-1/configuration-...,0.666606,0.659566
1300,../data/models/mlp/gridsearch-1/configuration-...,0.683775,0.657823
2561,../data/models/mlp/gridsearch-1/configuration-...,0.682818,0.656814
3445,../data/models/mlp/gridsearch-1/configuration-...,0.718864,0.651862


In [6]:
# `df` is sorted by average accuracy in descending order, so its first row is the winner.
best_config_folder = df["configuration"].iloc[0]
print(f"Best configuration: {best_config_folder}")

# Read back the hyperparameters stored next to that configuration's metrics.
best_configuration = json.loads((best_config_folder / "configuration.json").read_text())
pretty_configuration = json.dumps(best_configuration, indent=2)
print(f"Best configuration data: {pretty_configuration}")

Best configuration: ../data/models/mlp/gridsearch-1/configuration-2484
Best configuration data: {
  "hidden_layer_sizes": [
    64,
    64,
    32
  ],
  "first_layer_l1": 0.1,
  "first_layer_l2": 0.01,
  "hidden_layer_l1": 0.0,
  "hidden_layer_l2": 0.0,
  "first_layer_dropout": 0.2,
  "hidden_dropout": 0.2,
  "initial_learning_rate": 0.001,
  "beta_1": 0.9,
  "beta_2": 0.999,
  "epsilon": 1e-08
}


Now we'll train an MLP model with the best hyperparameters found in the grid search.

In [None]:
def create_model(
    input_shape,
    hidden_layer_sizes: List[int],
    first_layer_l1: float = 0.0,
    first_layer_l2: float = 0.0,
    hidden_layer_l1: float = 0.0,
    hidden_layer_l2: float = 0.0,
    first_layer_dropout: float = 0.0,
    hidden_dropout: float = 0.0,
    initial_learning_rate: float = 0.001,
    beta_1: float = 0.9,
    beta_2: float = 0.999,
    epsilon: float = 1e-08,
) -> Model:
    """
    Build and compile a feed-forward binary classifier.

    The network is a stack of ReLU ``Dense`` layers, one per entry of
    ``hidden_layer_sizes``, followed by a single sigmoid output unit. The
    first layer has its own L1/L2 kernel regularisation and dropout rate;
    all remaining hidden layers share the ``hidden_*`` settings. A dropout
    layer is only inserted when its rate is greater than zero.

    Args:
        input_shape: Number of input features (used as ``shape=(input_shape,)``).
        hidden_layer_sizes: Units of each hidden layer, in order.
        first_layer_l1: L1 factor for the first layer's kernel.
        first_layer_l2: L2 factor for the first layer's kernel.
        hidden_layer_l1: L1 factor for the kernels of the other hidden layers.
        hidden_layer_l2: L2 factor for the kernels of the other hidden layers.
        first_layer_dropout: Dropout rate applied after the first layer.
        hidden_dropout: Dropout rate applied after each other hidden layer.
        initial_learning_rate: Learning rate passed to Adam.
        beta_1: Adam ``beta_1``.
        beta_2: Adam ``beta_2``.
        epsilon: Adam ``epsilon``.

    Returns:
        The model, compiled with Adam, binary cross-entropy loss and the
        accuracy metric.

    Raises:
        ValueError: If ``hidden_layer_sizes`` is empty.
    """
    if len(hidden_layer_sizes) < 1:
        raise ValueError("hidden_layer_sizes must contain at least one layer size.")

    inputs = Input(shape=(input_shape,))

    x = inputs
    for position, units in enumerate(hidden_layer_sizes):
        # Only the very first layer uses the "first_layer_*" settings.
        if position == 0:
            regularizer = l1_l2(first_layer_l1, first_layer_l2)
            dropout_rate = first_layer_dropout
        else:
            regularizer = l1_l2(hidden_layer_l1, hidden_layer_l2)
            dropout_rate = hidden_dropout

        x = Dense(units, activation="relu", kernel_regularizer=regularizer)(x)
        if dropout_rate > 0:
            x = Dropout(dropout_rate)(x)

    outputs = Dense(1, activation="sigmoid")(x)

    optimizer = Adam(
        learning_rate=initial_learning_rate,
        beta_1=beta_1,
        beta_2=beta_2,
        epsilon=epsilon,
    )
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

    return model


# One-hot encode the training and validation data
non_feature_columns = ["row_index", "node", "is_root"]
X_train = one_hot_encode(train_data.drop(columns=non_feature_columns))
y_train = train_data["is_root"]
# The two frames are encoded independently, so their language columns are not
# guaranteed to match in set or order. Align the validation features to the
# training layout: a language missing from validation becomes an all-zero
# column, and a language unseen in training is dropped.
X_val = one_hot_encode(validation_data.drop(columns=non_feature_columns)).reindex(
    columns=X_train.columns, fill_value=0
)
y_val = validation_data["is_root"]

model = create_model(X_train.shape[1], **best_configuration)
model.summary()

# Weight each class by the inverse of its relative frequency in the training
# labels. Keys are cast to int so the mapping uses explicit class indices
# (0 / 1) rather than booleans.
class_frequencies = y_train.value_counts(normalize=True)
class_weights = {int(label): 1.0 / frequency for label, frequency in class_frequencies.items()}

models_path = Path("../data/models/mlp/final-train")
models_path.mkdir(parents=True, exist_ok=True)

# Keep a copy of the hyperparameters next to the trained model.
with open(models_path / "configuration.json", "w") as f:
    json.dump(best_configuration, f, indent=4)

callbacks = [
    EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
    ModelCheckpoint(filepath=str(models_path / "best_model.keras"), monitor="val_loss", save_best_only=True),
    TensorBoard(log_dir=str(models_path / "logs"), histogram_freq=1, write_graph=True),
]

model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=256,
    class_weight=class_weights,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    shuffle=True,
    verbose=2,
)
model.save(models_path / "final_model.keras")

# NOTE: these metrics are computed on the same validation split that drives
# early stopping and checkpoint selection above.
loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
with open(models_path / "metrics.json", "w") as f:
    json.dump({"loss": loss, "accuracy": accuracy}, f, indent=4)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

I0000 00:00:1748539474.492587 3277251 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9779 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/100


I0000 00:00:1748539476.897836 3277469 service.cc:152] XLA service 0x7f517400a530 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748539476.897849 3277469 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-05-29 19:24:36.928219: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1748539477.151998 3277469 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1748539478.386031 3277469 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


616/616 - 10s - 16ms/step - accuracy: 0.6780 - loss: 8.3764 - val_accuracy: 0.8100 - val_loss: 0.9239
Epoch 2/100
616/616 - 2s - 3ms/step - accuracy: 0.7228 - loss: 1.3852 - val_accuracy: 0.7441 - val_loss: 0.7354
Epoch 3/100
616/616 - 2s - 3ms/step - accuracy: 0.7306 - loss: 1.2930 - val_accuracy: 0.6930 - val_loss: 0.7780
Epoch 4/100
616/616 - 2s - 3ms/step - accuracy: 0.7314 - loss: 1.2729 - val_accuracy: 0.7003 - val_loss: 0.7840
Epoch 5/100
616/616 - 2s - 3ms/step - accuracy: 0.7342 - loss: 1.2584 - val_accuracy: 0.7855 - val_loss: 0.6427
Epoch 6/100
616/616 - 2s - 3ms/step - accuracy: 0.7351 - loss: 1.2503 - val_accuracy: 0.7399 - val_loss: 0.6970
Epoch 7/100
616/616 - 2s - 3ms/step - accuracy: 0.7371 - loss: 1.2443 - val_accuracy: 0.7920 - val_loss: 0.6587
Epoch 8/100
616/616 - 2s - 3ms/step - accuracy: 0.7328 - loss: 1.2437 - val_accuracy: 0.6942 - val_loss: 0.7362
Epoch 9/100
616/616 - 2s - 4ms/step - accuracy: 0.7252 - loss: 1.2379 - val_accuracy: 0.7921 - val_loss: 0.6379
Ep

In [8]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [9]:
# Sigmoid output for every validation node, thresholded at 0.5 to obtain labels.
predictions = model.predict(X_val)
predicted_labels = predictions.reshape(-1) > 0.5
node_accuracy = accuracy_score(y_val, predicted_labels)
print(f"Node-based accuracy: {node_accuracy:.2f}")

[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Node-based accuracy: 0.85


In [10]:
# Sentence-level evaluation: within each sentence (identified by "row_index"),
# the node with the highest predicted score is taken as the predicted root.

# zip() below would silently truncate on a length mismatch (e.g. predictions
# left over from a different frame), so check it explicitly.
if len(predictions) != len(validation_data):
    raise ValueError("Number of predictions does not match the number of validation rows.")

sentence_predictions = defaultdict(dict)
sentence_real_root = {}
# Iterate over just the needed columns instead of iterrows(), which builds a
# full Series for every row.
node_rows = zip(
    validation_data["row_index"],
    validation_data["node"],
    validation_data["is_root"],
    predictions[:, 0],
)
for sentence_id, node, is_root, prob in node_rows:
    sentence_predictions[sentence_id][int(node)] = prob
    if is_root:
        sentence_real_root[sentence_id] = int(node)

# Every sentence with predictions must also have a node flagged as its root.
if set(sentence_predictions.keys()) != set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> int:
    """
    Get the predicted root node for a sentence.

    The sentence id is taken from the row's index label (``row.name``) and
    looked up in the module-level ``sentence_predictions`` mapping; the node
    with the highest score is returned. On ties, the node that was inserted
    first wins.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs, key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
sentence_accuracy = accuracy_score(validation_prediction["root"], validation_prediction["predicted_root"])
print(f"Sentence-based accuracy: {sentence_accuracy:.2f}")

Sentence-based accuracy: 0.26
