In [2]:
import pandas as pd

In [3]:
# Load the data
def _drop_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``df`` without the columns whose name starts with "Unnamed".

    The training CSV carries an "Unnamed: 0" column that simply counts rows
    (0, 1, 2, ...), i.e. what pandas produces when a saved index is read back
    as data. Every column except row_index/node/is_root is later used as a
    model input, so leaving it in would feed the file's row order to the model.
    """
    unnamed = [column for column in df.columns if str(column).startswith("Unnamed")]
    return df.drop(columns=unnamed)


# NOTE(review): the validation file is assumed to follow the same layout as the
# training file; if it has no unnamed column this is a no-op for it.
train_data = _drop_unnamed_columns(pd.read_csv("../data/cache/training_unwound.csv"))
validation_data = _drop_unnamed_columns(pd.read_csv("../data/cache/validation_unwound.csv"))
# Categorical dtype makes the later one-hot encoding emit one column per known
# language, even for a subset of rows in which some language does not occur.
train_data["language"] = train_data["language"].astype("category")
validation_data["language"] = validation_data["language"].astype("category")
train_data.head()

Unnamed: 0,row_index,node,is_root,language,tree_diameter,tree_size,tree_edges,number_of_centroids,average_degree,number_of_leaves,...,harmonic_centrality,betweenness_centrality,pagerank,katz_centrality,current_flow_closeness,current_flow_betweenness,load_centrality,percolation_centrality,second_order_centrality,laplacian_centrality
0,0,6,False,Japanese,14,23,22,1,1.913043,6,...,5.823846,0.090909,0.048565,0.209086,0.007246,0.090909,0.090909,0.090909,98.762341,0.101449
1,0,4,False,Japanese,14,23,22,1,1.913043,6,...,4.561122,0.0,0.027162,0.188298,0.006289,0.0,0.0,0.0,112.48111,0.043478
2,0,2,False,Japanese,14,23,22,1,1.913043,6,...,6.991703,0.255411,0.066901,0.22866,0.008403,0.255411,0.255411,0.255411,84.451169,0.15942
3,0,23,False,Japanese,14,23,22,1,1.913043,6,...,5.157179,0.0,0.025477,0.190256,0.007143,0.0,0.0,0.0,100.149888,0.057971
4,0,20,False,Japanese,14,23,22,1,1.913043,6,...,7.146825,0.311688,0.042552,0.213357,0.009615,0.311688,0.311688,0.311688,71.147734,0.130435


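In this notebook, we'll fit a feedforward neural network (multilayer perceptron) to the dataset to predict whether each node is the root of its sentence. Given the large number of features, we'll use L1 and L2 regularization (with separate strengths for the first layer, where L1 encourages feature selection) together with dropout, and grid-search over these hyperparameters using cross-validation.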

In [4]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the 'language' column of ``df`` into one indicator column per language.

    The indicator columns are named after the bare language values (empty
    prefix and separator) and no level is dropped. All other columns are
    passed through unchanged, and ``df`` itself is not modified.
    """
    encoding_options = dict(columns=["language"], prefix="", prefix_sep="", drop_first=False)
    encoded = pd.get_dummies(df, **encoding_options)
    return encoded

In [5]:
import itertools as it
from typing import Any, Dict, Generic, Iterator, Optional, TypeVar

import numpy as np

T = TypeVar("T")


class Dimension(Generic[T]):
    """
    One axis of a grid search: a fixed collection of candidate values.

    The candidates are supplied as positional arguments and are kept in the
    order in which they were given.
    """

    def __init__(self, *values: T):
        self._values = values

    def __len__(self):
        """Number of candidate values along this dimension."""
        return len(self.options)

    @property
    def options(self):
        """The candidate values, as the tuple of constructor arguments."""
        return self._values


class GridSearch:
    """
    Enumerate every combination of values across a set of named dimensions.

    Iterating yields one dict per combination, mapping each dimension key to
    the value chosen for it. Combinations come from ``itertools.product`` over
    the dimensions in insertion order, so the most recently added dimension
    varies fastest and the first added one varies slowest.
    """

    def __init__(self, dimensions: Optional[Dict[str, "Dimension"]] = None):
        """
        Args:
            dimensions: Optional initial mapping of key -> dimension. It is
                copied, so later ``add_dimension`` calls do not modify the
                caller's dict.
        """
        # A fresh dict per instance: a mutable ``{}`` default would be shared by
        # every GridSearch() and silently accumulate the dimensions added to
        # any of them through add_dimension().
        self._dimensions: Dict[str, "Dimension"] = {} if dimensions is None else dict(dimensions)

    def add_dimension(self, key: str, dimension: "Dimension"):
        """Register ``dimension`` under ``key``, replacing any dimension already stored there."""
        self._dimensions[key] = dimension

    def __len__(self) -> int:
        """
        Total number of combinations.

        Computed with plain integer arithmetic so the result is always a Python
        ``int`` (``len()`` rejects floats). A grid without dimensions has
        length 1, matching the single empty combination that iteration yields.
        """
        total = 1
        for dimension in self._dimensions.values():
            total *= len(dimension)
        return total

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Yield each combination as a ``{key: value}`` dict."""
        keys = list(self._dimensions)
        for combination in it.product(*(dimension.options for dimension in self._dimensions.values())):
            yield dict(zip(keys, combination))

In [None]:
from pathlib import Path
from typing import List
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.regularizers import l1_l2
import json
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam


def create_model(
    input_shape,
    hidden_layer_sizes: List[int],
    first_layer_l1: float = 0.0,
    first_layer_l2: float = 0.0,
    hidden_layer_l1: float = 0.0,
    hidden_layer_l2: float = 0.0,
    first_layer_dropout: float = 0.0,
    hidden_dropout: float = 0.0,
    initial_learning_rate: float = 0.001,
    beta_1: float = 0.9,
    beta_2: float = 0.999,
    epsilon: float = 1e-08,
) -> Model:
    """
    Build and compile a feedforward binary classifier.

    The network stacks one ReLU ``Dense`` layer per entry of
    ``hidden_layer_sizes`` and ends in a single sigmoid unit. It is compiled
    with Adam, binary cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Number of input features; the input tensor has shape
            ``(input_shape,)``.
        hidden_layer_sizes: Units of each dense layer, in order. Must not be
            empty.
        first_layer_l1, first_layer_l2: L1/L2 kernel penalties of the first
            dense layer.
        hidden_layer_l1, hidden_layer_l2: L1/L2 kernel penalties of every
            later dense layer (the output layer is not regularized).
        first_layer_dropout: Dropout rate after the first dense layer; no
            dropout layer is added unless it is greater than 0.
        hidden_dropout: Dropout rate after each later dense layer; no dropout
            layer is added unless it is greater than 0.
        initial_learning_rate, beta_1, beta_2, epsilon: Passed to ``Adam``.

    Returns:
        The compiled model.

    Raises:
        ValueError: If ``hidden_layer_sizes`` is empty.
    """
    if len(hidden_layer_sizes) == 0:
        raise ValueError("hidden_layer_sizes must contain at least one layer size.")

    first_size, *remaining_sizes = hidden_layer_sizes

    network_input = Input(shape=(input_shape,))

    # The first layer has its own regularization and dropout settings.
    first_dense = Dense(
        first_size,
        activation="relu",
        kernel_regularizer=l1_l2(first_layer_l1, first_layer_l2),
    )
    hidden = first_dense(network_input)
    if first_layer_dropout > 0:
        hidden = Dropout(first_layer_dropout)(hidden)

    # All remaining layers share the "hidden" settings.
    for units in remaining_sizes:
        dense = Dense(units, activation="relu", kernel_regularizer=l1_l2(hidden_layer_l1, hidden_layer_l2))
        hidden = dense(hidden)
        if hidden_dropout > 0:
            hidden = Dropout(hidden_dropout)(hidden)

    prediction = Dense(1, activation="sigmoid")(hidden)
    model = Model(inputs=network_input, outputs=prediction)

    optimizer = Adam(learning_rate=initial_learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon)
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

    return model


# Hyperparameter grid. Each key is a keyword argument of create_model(); every
# combination of the listed values is one configuration. Dimensions holding a
# single value are effectively fixed.
grid = GridSearch(
    {
        "hidden_layer_sizes": Dimension([64, 64, 32], [64, 32], [64, 32, 16, 8]),
        "first_layer_l1": Dimension(0.0, 0.01, 0.1),
        "first_layer_l2": Dimension(0.0, 0.01),
        "hidden_layer_l1": Dimension(0.0, 0.01),
        "hidden_layer_l2": Dimension(0.0, 0.01, 0.1),
        "first_layer_dropout": Dimension(0.2),
        "hidden_dropout": Dimension(0.0, 0.1, 0.2),
        "initial_learning_rate": Dimension(0.001),
        "beta_1": Dimension(0.9, 0.95, 0.99),
        "beta_2": Dimension(0.999, 0.995, 0.99),
        "epsilon": Dimension(1e-08, 1e-07, 1e-06),
    }
)

print(f"Total configurations: {len(grid)}")


# Number of cross-validation folds evaluated for every configuration.
cross_validation_folds = 5

# Every execution writes into its own "run-<n>" directory: take the first
# run id (counting from 1) whose directory does not exist yet.
models_path_root = Path("../data/models/mlp")

run_id = 1
while (models_path_root / f"run-{run_id}").exists():
    run_id += 1
models_path = models_path_root / f"run-{run_id}"
models_path.mkdir(parents=True, exist_ok=True)

# Grid search with cross-validation: for every configuration, train one model
# per fold and store the configuration, the fold's validation row indices, the
# models and the metrics under models_path/configuration-<j>/fold-<i>.
from keras import backend as keras_backend

row_indices = train_data["row_index"].unique()
# Identifier and target columns that must not be used as model inputs.
non_feature_columns = ["row_index", "node", "is_root"]
total_configurations = len(grid)
# We'll separate based on row indices, because that's what we have now. Ideally
# we would separate based on sentence id, but we don't have that in the data now
for j, configuration in enumerate(grid):
    config_path = models_path / f"configuration-{j}"
    config_path.mkdir(parents=True, exist_ok=True)
    with open(config_path / "configuration.json", "w") as f:
        json.dump(configuration, f, indent=4)

    losses = []
    accuracies = []

    # Separate cross-validation data
    for i in range(cross_validation_folds):
        fold_path = config_path / f"fold-{i}"
        fold_path.mkdir(parents=True, exist_ok=True)

        # Fold i validates on every cross_validation_folds-th row index, starting at i.
        fold_row_indices = row_indices[i::cross_validation_folds]
        with open(fold_path / "fold_validation_row_indices.json", "w") as f:
            json.dump(fold_row_indices.tolist(), f)

        is_validation_row = train_data["row_index"].isin(fold_row_indices)
        train_fold_data = train_data[~is_validation_row]
        validation_fold_data = train_data[is_validation_row]

        # One-hot encode the training and validation data
        X_train = one_hot_encode(train_fold_data.drop(columns=non_feature_columns))
        y_train = train_fold_data["is_root"]
        X_val = one_hot_encode(validation_fold_data.drop(columns=non_feature_columns))
        y_val = validation_fold_data["is_root"]

        # Thousands of models are built in this one process; reset Keras'
        # global state before each so that memory use does not keep growing.
        keras_backend.clear_session()
        model = create_model(X_train.shape[1], **configuration)
        model.summary()

        # Weight each class by the inverse of its share of the training labels.
        class_weights = y_train.value_counts(normalize=True).to_dict()
        class_weights = {k: 1.0 / v for k, v in class_weights.items()}

        # NOTE(review): patience (10) is not smaller than epochs (10), so
        # EarlyStopping can never actually stop training with these settings.
        callbacks = [
            EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
            ModelCheckpoint(filepath=str(fold_path / "best_model.keras"), monitor="val_loss", save_best_only=True),
        ]

        model.fit(
            X_train,
            y_train,
            epochs=10,
            batch_size=256,
            class_weight=class_weights,
            validation_data=(X_val, y_val),
            callbacks=callbacks,
            shuffle=True,
        )
        model.save(fold_path / "final_model.keras")
        loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
        losses.append(loss)
        accuracies.append(accuracy)
        with open(fold_path / "metrics.json", "w") as f:
            json.dump({"loss": loss, "accuracy": accuracy}, f, indent=4)
        with open(config_path / "metrics.json", "w") as f:  # Store this every fold, so we "checkpoint".
            json.dump({"losses": losses, "accuracies": accuracies}, f, indent=4)
        print(f"Fold {i + 1}/{cross_validation_folds} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

    average_loss = sum(losses) / len(losses)
    average_accuracy = sum(accuracies) / len(accuracies)
    print(
        f"Configuration {j + 1}/{total_configurations} - Average Loss: {average_loss:.4f}, "
        f"Average Accuracy: {average_accuracy:.4f}"
    )
    # Final per-configuration summary; replaces the per-fold checkpoint file.
    with open(config_path / "metrics.json", "w") as f:
        json.dump(
            {
                "losses": losses,
                "accuracies": accuracies,
                "average_loss": average_loss,
                "average_accuracy": average_accuracy,
            },
            f,
            indent=4,
        )
    print(f"Configuration {j + 1}/{total_configurations} completed and saved.")

Total configurations: 8748


Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6682 - loss: 1.3638 - val_accuracy: 0.7754 - val_loss: 0.4353
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7117 - loss: 1.0664 - val_accuracy: 0.6805 - val_loss: 0.5576
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7204 - loss: 1.0275 - val_accuracy: 0.6815 - val_loss: 0.5718
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7168 - loss: 1.0198 - val_accuracy: 0.6391 - val_loss: 0.5723
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7100 - loss: 1.0169 - val_accuracy: 0.7324 - val_loss: 0.4851
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7230 - loss: 1.0127 - val_accuracy: 0.6997 - val_loss: 0.4846
Epoch 7/10
[1m500/500[0m 

Epoch 1/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6675 - loss: 1.4128 - val_accuracy: 0.7402 - val_loss: 0.4698
Epoch 2/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7040 - loss: 1.0549 - val_accuracy: 0.6381 - val_loss: 0.5878
Epoch 3/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7016 - loss: 1.0509 - val_accuracy: 0.6801 - val_loss: 0.5754
Epoch 4/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7034 - loss: 1.0347 - val_accuracy: 0.7036 - val_loss: 0.4649
Epoch 5/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6925 - loss: 1.0309 - val_accuracy: 0.6929 - val_loss: 0.5065
Epoch 6/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7077 - loss: 1.0156 - val_accuracy: 0.7007 - val_loss: 0.5188
Epoch 7/10
[1m491/491[0m 

Epoch 1/10
[1m460/487[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 2ms/step - accuracy: 0.6407 - loss: 2.0110




[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.6429 - loss: 1.9745 - val_accuracy: 0.7314 - val_loss: 0.5516
Epoch 2/10
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7249 - loss: 1.0658 - val_accuracy: 0.7833 - val_loss: 0.4902
Epoch 3/10
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7268 - loss: 1.0387 - val_accuracy: 0.7683 - val_loss: 0.4466
Epoch 4/10
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7344 - loss: 1.0274 - val_accuracy: 0.7275 - val_loss: 0.5259
Epoch 5/10
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7245 - loss: 1.0090 - val_accuracy: 0.7687 - val_loss: 0.4279
Epoch 6/10
[1m487/487[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7451 - loss: 0.9940 - val_accuracy: 0.7584 - val_loss: 0.4201
Epoch 7/10
[1m487/487[0m [32m━━━━━━




Fold 3/5 - Loss: 0.3988, Accuracy: 0.7942


Epoch 1/10
[1m480/494[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6653 - loss: 1.3039




[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.6659 - loss: 1.2999 - val_accuracy: 0.7788 - val_loss: 0.4264
Epoch 2/10
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7090 - loss: 1.0562 - val_accuracy: 0.6831 - val_loss: 0.5449
Epoch 3/10
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7010 - loss: 1.0265 - val_accuracy: 0.7612 - val_loss: 0.4640
Epoch 4/10
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7135 - loss: 1.0238 - val_accuracy: 0.6795 - val_loss: 0.5650
Epoch 5/10
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7176 - loss: 1.0112 - val_accuracy: 0.7167 - val_loss: 0.4891
Epoch 6/10
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7184 - loss: 1.0066 - val_accuracy: 0.6683 - val_loss: 0.5498
Epoch 7/10
[1m494/494[0m [32m━━━━━━

Epoch 1/10
[1m478/492[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.6304 - loss: 1.6534




[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.6316 - loss: 1.6420 - val_accuracy: 0.5606 - val_loss: 0.8322
Epoch 2/10
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7168 - loss: 1.0684 - val_accuracy: 0.7113 - val_loss: 0.5378
Epoch 3/10
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7223 - loss: 1.0257 - val_accuracy: 0.7456 - val_loss: 0.4774
Epoch 4/10
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7173 - loss: 1.0150 - val_accuracy: 0.7379 - val_loss: 0.4914
Epoch 5/10
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7229 - loss: 0.9938 - val_accuracy: 0.7281 - val_loss: 0.5294
Epoch 6/10
[1m492/492[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7247 - loss: 1.0154 - val_accuracy: 0.7533 - val_loss: 0.5010
Epoch 7/10
[1m492/492[0m [32m━━━━━━




Fold 5/5 - Loss: 0.4372, Accuracy: 0.7683
Configuration 1/8748 - Average Loss: 0.4289, Average Accuracy: 0.7724
Configuration 1/8748 completed and saved.


Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6600 - loss: 1.5286 - val_accuracy: 0.7150 - val_loss: 0.5621
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7173 - loss: 1.0783 - val_accuracy: 0.6733 - val_loss: 0.6066
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7128 - loss: 1.0372 - val_accuracy: 0.6465 - val_loss: 0.5740
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7044 - loss: 1.0237 - val_accuracy: 0.7104 - val_loss: 0.5056
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7119 - loss: 1.0269 - val_accuracy: 0.7093 - val_loss: 0.5459
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7142 - loss: 1.0087 - val_accuracy: 0.6826 - val_loss: 0.5143
Epoch 7/10
[1m500/500[0m 

Epoch 1/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6532 - loss: 1.5046

Now let's estimate the generalization performance of the last model trained above (the `model` variable left over from the grid-search loop) on the held-out single-split validation set.

In [None]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [None]:
# Features and labels of the held-out validation split: everything except the
# identifier and target columns is a feature, with 'language' one-hot encoded.
validation_features = validation_data.drop(["row_index", "node", "is_root"], axis=1)
X_val = one_hot_encode(validation_features)
y_val = validation_data.loc[:, "is_root"]

In [None]:
# Node-level evaluation: a node counts as a predicted root when the model's
# output for it exceeds 0.5.
predictions = model.predict(X_val)
predicted_is_root = predictions.reshape(-1) > 0.5
node_accuracy = accuracy_score(y_val, predicted_is_root)
print(f"Node-based accuracy: {node_accuracy:.2f}")

[1m1247/1247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 665us/step
Node-based accuracy: 0.70


In [None]:
# Sentence-level evaluation: within each sentence (row_index) the node with the
# highest predicted probability is taken as the predicted root.
sentence_predictions = defaultdict(dict)  # row_index -> {node: predicted probability}
sentence_real_root = {}  # row_index -> node flagged as the root
# Walk the needed columns as plain Python values instead of using
# DataFrame.iterrows(), which builds a new Series for every row.
for sentence_id, node, is_root, probability in zip(
    validation_data["row_index"].tolist(),
    validation_data["node"].tolist(),
    validation_data["is_root"].tolist(),
    predictions[:, 0],
):
    sentence_predictions[sentence_id][int(node)] = probability
    if is_root:
        sentence_real_root[sentence_id] = node

# Every sentence that has predictions must also have a known root.
if set(sentence_predictions.keys()) != set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> int:
    """
    Get the predicted root node for a sentence.

    Args:
        row: A row whose index label (``row.name``) is the sentence's row_index.

    Returns:
        The node with the highest predicted probability in that sentence; on
        ties, the node that was recorded first.
    """
    probs = sentence_predictions[row.name]
    return max(probs, key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
sentence_accuracy = accuracy_score(validation_prediction["root"], validation_prediction["predicted_root"])
print(f"Sentence-based accuracy: {sentence_accuracy:.2f}")

Sentence-based accuracy: 0.26
