In [1]:
import pandas as pd
from lib.trees import get_tree, parse_edge_list
from lib.dataset import split_training_validation
import networkx as nx
import random

In [2]:
# Load the training sentences and derive the parsed columns in one chain.
# Keyword order matters here: `tree` is computed from the already-parsed
# `edgelist` column, because `assign` evaluates its callables left to right.
sentences = pd.read_csv("../data/train.csv").assign(
    language=lambda frame: frame["language"].astype("category"),
    edgelist=lambda frame: frame["edgelist"].apply(parse_edge_list),
    tree=lambda frame: frame["edgelist"].apply(get_tree),
)
sentences.head()

Unnamed: 0,language,sentence,n,edgelist,root,tree
0,Japanese,2,23,"[(6, 4), (2, 6), (2, 23), (20, 2), (15, 20), (...",10,"(6, 4, 2, 23, 20, 15, 3, 5, 14, 8, 12, 9, 18, ..."
1,Japanese,5,18,"[(8, 9), (14, 8), (4, 14), (5, 4), (1, 2), (6,...",10,"(8, 9, 14, 4, 5, 1, 2, 6, 17, 12, 3, 7, 11, 16..."
2,Japanese,8,33,"[(2, 10), (2, 14), (4, 2), (16, 4), (6, 16), (...",3,"(2, 10, 14, 4, 16, 6, 12, 32, 26, 3, 29, 27, 2..."
3,Japanese,11,30,"[(30, 1), (14, 24), (21, 14), (3, 21), (7, 3),...",30,"(30, 1, 14, 24, 21, 3, 7, 12, 27, 16, 8, 5, 26..."
4,Japanese,12,19,"[(19, 13), (16, 19), (2, 16), (4, 10), (4, 15)...",11,"(19, 13, 16, 2, 4, 10, 15, 5, 14, 12, 3, 1, 8,..."


In [3]:
# Seed Python's global RNG before splitting so the split is repeatable
# (presumably `split_training_validation` draws from `random` — confirm in lib.dataset).
random.seed(42)
training, validation = split_training_validation(sentences, 0.2)

# Report how many sentences ended up in each split.
for label, subset in (
    ("Training set size:", training),
    ("Validation set size:", validation),
):
    print(label, len(subset))

Training set size: 8400
Validation set size: 2100


Our first approach will be the example given by the statement. We'll unwind the tree into a list of nodes, each
with a set of centrality scores.

In [4]:
def unwind_tree(row: pd.Series) -> pd.DataFrame:
    """
    Expand one sentence row into a frame with one record per tree node.

    Reads ``row["tree"]``, ``row["root"]`` and ``row["language"]``. The tree is
    handed to four networkx scoring functions (degree, harmonic and betweenness
    centrality, plus PageRank), and each node yielded by iterating the tree
    becomes one output row.

    Returns a DataFrame with the columns ``node``, ``degree_centrality``,
    ``harmonic_centrality``, ``betweenness_centrality``, ``pagerank``,
    ``language``, ``row_index`` (the label of the input row) and ``is_root``
    (whether the node equals ``row["root"]``).
    """
    graph = row["tree"]
    true_root = row["root"]
    sentence_language = row["language"]

    # One node -> score mapping per metric; insertion order fixes the column order.
    scores_by_metric = {
        "degree_centrality": nx.degree_centrality(graph),
        "harmonic_centrality": nx.harmonic_centrality(graph),
        "betweenness_centrality": nx.betweenness_centrality(graph),
        "pagerank": nx.pagerank(graph),
    }

    records = [
        {
            "node": vertex,
            **{metric: scores[vertex] for metric, scores in scores_by_metric.items()},
            "language": sentence_language,
            "row_index": row.name,
            "is_root": vertex == true_root,
        }
        for vertex in graph
    ]
    return pd.DataFrame(records)

In [5]:
def unwind_sentences(frame: pd.DataFrame) -> pd.DataFrame:
    """Run `unwind_tree` on every row of `frame` and stack the per-sentence frames."""
    per_sentence_frames = frame.apply(unwind_tree, axis=1)
    return pd.concat(list(per_sentence_frames), ignore_index=True)


validation_unwound = unwind_sentences(validation)
training_unwound = unwind_sentences(training)
training_unwound

Unnamed: 0,node,degree_centrality,harmonic_centrality,betweenness_centrality,pagerank,language,row_index,is_root
0,6,0.090909,5.823846,0.090909,0.048565,Japanese,0,False
1,4,0.045455,4.561122,0.000000,0.027162,Japanese,0,False
2,2,0.136364,6.991703,0.255411,0.066901,Japanese,0,False
3,23,0.045455,5.157179,0.000000,0.025477,Japanese,0,False
4,20,0.090909,7.146825,0.311688,0.042552,Japanese,0,False
...,...,...,...,...,...,...,...,...
157583,19,0.055556,5.005159,0.000000,0.032147,Russian,10499,False
157584,1,0.055556,6.034524,0.000000,0.029739,Russian,10499,False
157585,14,0.055556,6.034524,0.000000,0.029739,Russian,10499,False
157586,5,0.111111,6.701190,0.111111,0.057065,Russian,10499,False


In [6]:
# Report the number of node-level rows in each unwound split.
for label, unwound in (
    ("Training unwound size:", training_unwound),
    ("Validation unwound size:", validation_unwound),
):
    print(label, len(unwound))

Training unwound size: 157588
Validation unwound size: 39891


In [7]:
# Columns fed to the model; "language" is expanded to one-hot columns below.
FEATURE_COLUMNS = ["degree_centrality", "harmonic_centrality", "betweenness_centrality", "pagerank", "language"]


def encode_features(unwound: pd.DataFrame) -> pd.DataFrame:
    """Select the model inputs from an unwound frame and one-hot encode ``language``."""
    return pd.get_dummies(unwound[FEATURE_COLUMNS], columns=["language"], drop_first=False)


def encode_target(unwound: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode ``is_root`` into one indicator column per observed value."""
    return pd.get_dummies(unwound["is_root"], drop_first=False)


X_train = encode_features(training_unwound)
# The target is kept as a two-column one-hot frame (columns False / True). This
# makes the classifier a multi-output model, and the evaluation cells consume
# `predict_proba` in that layout.
Y_train = encode_target(training_unwound)

# The two splits are encoded independently, and `get_dummies` only emits a
# column for values it actually sees (unless the column is a fixed categorical).
# Aligning the validation frames to the training columns guarantees the model
# always receives the same columns in the same order, even if a language were
# missing from one split. When the columns already match this is a no-op.
X_val = encode_features(validation_unwound).reindex(columns=X_train.columns, fill_value=False)
Y_val = encode_target(validation_unwound).reindex(columns=Y_train.columns, fill_value=False)

X_train.head()

Unnamed: 0,degree_centrality,harmonic_centrality,betweenness_centrality,pagerank,language_Arabic,language_Chinese,language_Czech,language_English,language_Finnish,language_French,...,language_Italian,language_Japanese,language_Korean,language_Polish,language_Portuguese,language_Russian,language_Spanish,language_Swedish,language_Thai,language_Turkish
0,0.090909,5.823846,0.090909,0.048565,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,0.045455,4.561122,0.0,0.027162,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2,0.136364,6.991703,0.255411,0.066901,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,0.045455,5.157179,0.0,0.025477,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
4,0.090909,7.146825,0.311688,0.042552,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


Now we'll build a model that will predict whether each node is a root or not.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.metrics import accuracy_score

In [None]:
# Hyperparameters for the root / non-root classifier. "balanced" class weights
# counter the imbalance between the single root and the other nodes of a sentence.
forest_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,
    "class_weight": "balanced",
}
# `fit` returns the fitted estimator, so construction and training can be chained.
model = RandomForestClassifier(**forest_params).fit(X_train, Y_train)
print("Model trained.")

Model trained.


We'll compute two kinds of accuracy:
1. Node-based accuracy: the number of nodes correctly classified as root or non-root, divided by the total number of nodes.
2. Sentence-based accuracy: the number of sentences whose highest-probability node is the true root, divided by the total number of sentences.

In [28]:
# Hard root / non-root predictions for every validation node.
predictions = model.predict(X_val)
# NOTE(review): Y_val is the two-column one-hot frame built earlier, so
# accuracy_score should be treating this as a multilabel problem (a node counts
# as correct only if both indicator columns match) — confirm against sklearn docs.
node_accuracy = accuracy_score(Y_val, predictions)
print(f"Node-based accuracy: {node_accuracy:.2f}")

Node-based accuracy: 0.31


In [36]:
# Collect, for every validation sentence, a {node: P(node is root)} mapping.
#
# `predict_proba` returns a list with one (n_samples, n_classes) array per
# target column when the forest was fitted on a multi-column target, and a
# single array when it was fitted on a 1-D target. The "is root" probability is
# resolved by label (the `True` target column / the `True` class) instead of by
# position, so the lookup stays correct whichever target layout is used.
class_probabilities = model.predict_proba(X_val)
if isinstance(class_probabilities, list):
    # Multi-output: pick the output trained on the `True` indicator column,
    # then the column of that output which corresponds to class `True`.
    root_output = list(Y_train.columns).index(True)
    root_class = list(model.classes_[root_output]).index(True)
    root_probabilities = class_probabilities[root_output][:, root_class]
else:
    root_class = list(model.classes_).index(True)
    root_probabilities = class_probabilities[:, root_class]

sentence_predictions = defaultdict(dict)
# Iterate the two id columns directly; X_val was derived row-for-row from
# validation_unwound, so the probabilities line up positionally.
for sentence_id, node, root_probability in zip(
    validation_unwound["row_index"],
    validation_unwound["node"],
    root_probabilities,
):
    sentence_predictions[sentence_id][node] = root_probability


def get_predicted_root(row: pd.Series) -> int:
    """
    Get the predicted root node for a sentence.

    The sentence is identified by ``row.name`` (the row's index label), which is
    looked up in the module-level ``sentence_predictions`` mapping of
    ``{sentence_id: {node: root probability}}``.

    Returns the node id with the highest recorded probability (the first such
    node in insertion order if several tie).

    Raises:
        ValueError: if no probabilities were recorded for the sentence.
    """
    sentence_id = row.name
    # `.get` avoids silently inserting an empty entry into the defaultdict
    # when the sentence is unknown.
    probs = sentence_predictions.get(sentence_id)
    if not probs:
        raise ValueError(f"No node probabilities recorded for sentence {sentence_id!r}")
    return max(probs, key=probs.get)

# `assign` works on a copy, so `validation` itself is left untouched.
validation_prediction = validation.assign(
    predicted_root=lambda frame: frame.apply(get_predicted_root, axis=1)
)
sentence_accuracy = accuracy_score(
    validation_prediction["root"],
    validation_prediction["predicted_root"],
)
print("Sentence-based accuracy:", sentence_accuracy)

Sentence-based accuracy: 0.23714285714285716
