In [1]:
import pandas as pd

In [4]:
# Read the cached training and validation tables, storing `language` as a
# pandas categorical in each frame.
train_data = pd.read_csv("../data/cache/training_unwound.csv").astype({"language": "category"})
validation_data = pd.read_csv("../data/cache/validation_unwound.csv").astype({"language": "category"})
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,row_index,node,is_root,language,tree_diameter,tree_size,tree_edges,number_of_centroids,average_degree,number_of_leaves,...,harmonic_centrality,betweenness_centrality,pagerank,katz_centrality,current_flow_closeness,current_flow_betweenness,load_centrality,percolation_centrality,second_order_centrality,laplacian_centrality
0,0,6,False,Japanese,14,23,22,1,1.913043,6,...,5.823846,0.090909,0.048565,0.209086,0.007246,0.090909,0.090909,0.090909,98.762341,0.101449
1,0,4,False,Japanese,14,23,22,1,1.913043,6,...,4.561122,0.0,0.027162,0.188298,0.006289,0.0,0.0,0.0,112.48111,0.043478
2,0,2,False,Japanese,14,23,22,1,1.913043,6,...,6.991703,0.255411,0.066901,0.22866,0.008403,0.255411,0.255411,0.255411,84.451169,0.15942
3,0,23,False,Japanese,14,23,22,1,1.913043,6,...,5.157179,0.0,0.025477,0.190256,0.007143,0.0,0.0,0.0,100.149888,0.057971
4,0,20,False,Japanese,14,23,22,1,1.913043,6,...,7.146825,0.311688,0.042552,0.213357,0.009615,0.311688,0.311688,0.311688,71.147734,0.130435


In this notebook, we fit a logistic regression model to predict whether a node is the root of its sentence. Given the large number of features, we use L1 regularization to perform feature selection.

In [7]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the 'language' column with one indicator column per language.

    The indicator columns are named after the language itself (no prefix and
    no separator), and every level is kept (``drop_first=False``). All other
    columns are passed through unchanged; the input frame is not modified.
    """
    dummy_options = {
        "columns": ["language"],
        "prefix": "",
        "prefix_sep": "",
        "drop_first": False,
    }
    encoded = pd.get_dummies(df, **dummy_options)
    return encoded

In [8]:
# Training design matrix: every column except the sentence id (`row_index`),
# the node id and the target, with `language` one-hot encoded.
# NOTE(review): the displayed frame lists an "Unnamed: 0" column; if that is a
# real column (e.g. a saved index) it is kept as a feature here — confirm.
non_feature_columns = ["row_index", "node", "is_root"]
training_features = train_data.drop(columns=non_feature_columns)
X = one_hot_encode(training_features)
# Target: whether the node is the root.
y = train_data["is_root"]

In [10]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression. `l1` is the penalty strength; the
# estimator is given its inverse as `C`.
l1 = 0.1
class_weight = "balanced"
logistic_params = dict(
    penalty="l1",
    C=1 / l1,
    class_weight=class_weight,
    solver="liblinear",
    random_state=42,
    verbose=1,  # emits the solver's per-iteration log
)
model = LogisticRegression(**logistic_params)
# NOTE(review): the recorded solver log shows several iterations stopping at
# 1000 CD cycles, which may point to slow convergence (e.g. unscaled
# features) — confirm.
# Train on the full training matrix.
model.fit(X, y)

[LibLinear]iter   1  #CD cycles 1
iter   2  #CD cycles 1
iter   3  #CD cycles 2
iter   4  #CD cycles 1
iter   5  #CD cycles 2
iter   6  #CD cycles 1
iter   7  #CD cycles 20
iter   8  #CD cycles 1
iter   9  #CD cycles 73
iter  10  #CD cycles 7
iter  11  #CD cycles 1
iter  12  #CD cycles 1000
iter  13  #CD cycles 1000
iter  14  #CD cycles 1000
iter  15  #CD cycles 132
iter  16  #CD cycles 88
iter  17  #CD cycles 91
iter  18  #CD cycles 1
iter  19  #CD cycles 1000
iter  20  #CD cycles 1000
iter  21  #CD cycles 1000
iter  22  #CD cycles 1000
iter  23  #CD cycles 1000
iter  24  #CD cycles 1000
iter  25  #CD cycles 1000
iter  26  #CD cycles 1000
iter  27  #CD cycles 88
iter  28  #CD cycles 1000
iter  29  #CD cycles 514
iter  30  #CD cycles 1000
iter  31  #CD cycles 44
iter  32  #CD cycles 159
iter  33  #CD cycles 615
iter  34  #CD cycles 81
iter  35  #CD cycles 274
iter  36  #CD cycles 297
iter  37  #CD cycles 291
iter  38  #CD cycles 70
iter  39  #CD cycles 107
iter  40  #CD cycles 100
iter



Now let's estimate the model's generalization performance on the single held-out validation split.

In [13]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [12]:
# Validation design matrix, built with the same preprocessing as the training
# matrix `X`.
# The two frames are one-hot encoded independently, so their language columns
# differ whenever a language occurs in only one split. Aligning to the training
# columns keeps the layout the model was fitted on: languages missing from the
# validation split become all-zero columns, and columns that exist only in the
# validation split are dropped.
X_val = one_hot_encode(
    validation_data.drop(columns=["row_index", "node", "is_root"])
).reindex(columns=X.columns, fill_value=0)
# Target: whether the node is the root.
y_val = validation_data["is_root"]

In [14]:
# Node-level evaluation: each node is classified independently as root or not.
predictions = model.predict(X_val)
node_accuracy = accuracy_score(y_val, predictions)
print(f"Node-based accuracy: {node_accuracy:.2f}")

Node-based accuracy: 0.73


In [35]:
# Per-sentence lookup tables built from the node-level validation rows:
#   sentence_predictions[sentence_id][node_id] -> probability taken from
#       column 1 of predict_proba
#   sentence_real_root[sentence_id]            -> node id flagged as is_root
sentence_predictions = defaultdict(dict)
sentence_real_root = {}
probabilities = model.predict_proba(X_val)
# Iterate over the needed columns directly rather than with iterrows(), which
# builds a full Series for every row.
for sentence_id, node, is_root, root_probability in zip(
    validation_data["row_index"],
    validation_data["node"],
    validation_data["is_root"],
    probabilities[:, 1],
):
    node_id = int(node)  # same key type in both dictionaries
    sentence_predictions[sentence_id][node_id] = root_probability
    if is_root:
        sentence_real_root[sentence_id] = node_id

# Every sentence with predictions must also have a known root (and vice versa).
if set(sentence_predictions.keys()) != set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> int:
    """
    Get the predicted root node for a sentence.

    The sentence is identified by the row's index label (``row.name``); the
    row's values are not used. The prediction is the node with the highest
    probability stored for that sentence in the module-level
    ``sentence_predictions`` mapping.

    Args:
        row: A row whose ``name`` is a sentence id present in
            ``sentence_predictions``.

    Returns:
        The node id with the highest stored probability. On ties, the node
        that was inserted first wins.

    Raises:
        ValueError: If no probabilities are stored for the sentence.
    """
    sentence_id = row.name
    # .get() does not trigger the defaultdict factory, so looking up an unknown
    # sentence id no longer inserts an empty entry as a side effect.
    probs = sentence_predictions.get(sentence_id)
    if not probs:
        raise ValueError(f"No predictions available for sentence {sentence_id!r}.")
    return max(probs, key=probs.get)


# Sentence-level evaluation: one row per validation sentence, indexed by
# sentence id, with the true root and the predicted root side by side.
validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
# get_predicted_root reads the sentence id from each row's index label.
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
sentence_accuracy = accuracy_score(
    validation_prediction["root"],
    validation_prediction["predicted_root"],
)
print(f"Sentence-based accuracy: {sentence_accuracy:.2f}")

Sentence-based accuracy: 0.27
