In [1]:
import pandas as pd
from lib.gridsearch import GridSearch, Dimension
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import json
import time
from typing import Dict, Any, List
import logging

In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="(%(asctime)s) %(levelname)s # %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [3]:
# Load the data
train_data = pd.read_csv("../data/cache/training_unwound.csv")
validation_data = pd.read_csv("../data/cache/validation_unwound.csv")
train_data["language"] = train_data["language"].astype("category")
validation_data["language"] = validation_data["language"].astype("category")
train_data.head()

Unnamed: 0,row_index,node,is_root,language,tree_diameter,tree_size,tree_edges,number_of_centroids,average_degree,number_of_leaves,...,harmonic_centrality,betweenness_centrality,pagerank,katz_centrality,current_flow_closeness,current_flow_betweenness,load_centrality,percolation_centrality,second_order_centrality,laplacian_centrality
0,0,6,False,Japanese,14,23,22,1,1.913043,6,...,5.823846,0.090909,0.048565,0.209086,0.007246,0.090909,0.090909,0.090909,98.762341,0.101449
1,0,4,False,Japanese,14,23,22,1,1.913043,6,...,4.561122,0.0,0.027162,0.188298,0.006289,0.0,0.0,0.0,112.48111,0.043478
2,0,2,False,Japanese,14,23,22,1,1.913043,6,...,6.991703,0.255411,0.066901,0.22866,0.008403,0.255411,0.255411,0.255411,84.451169,0.15942
3,0,23,False,Japanese,14,23,22,1,1.913043,6,...,5.157179,0.0,0.025477,0.190256,0.007143,0.0,0.0,0.0,100.149888,0.057971
4,0,20,False,Japanese,14,23,22,1,1.913043,6,...,7.146825,0.311688,0.042552,0.213357,0.009615,0.311688,0.311688,0.311688,71.147734,0.130435


In [4]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """
    One-hot encode the 'language' column in the DataFrame.
    """
    return pd.get_dummies(df, columns=["language"], prefix="", prefix_sep="", drop_first=False)

In [5]:
X = one_hot_encode(train_data.drop(columns=["row_index", "node", "is_root"]))
y = train_data["is_root"]

In [10]:
grid = GridSearch()

grid.add_dimension("n_neighbors", Dimension(2, 3, 5, 8, 10))
grid.add_dimension("weights", Dimension("uniform", "distance"))
grid.add_dimension("algorithm", Dimension("auto", "ball_tree", "kd_tree", "brute"))
grid.add_dimension("metric", Dimension("minkowski"))
grid.add_dimension("p", Dimension(1, 2))

print(len(grid))

80


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def run_config_fold(
    configuration: Dict[str, Any],
    fold: int,
    fold_path: Path,
    train_fold_data: pd.DataFrame,
    validation_fold_data: pd.DataFrame,
) -> Dict[str, float]:

    # One-hot encode the training and validation data
    X_train = one_hot_encode(train_fold_data.drop(columns=["row_index", "node", "is_root"]))
    y_train = train_fold_data["is_root"]
    X_val = one_hot_encode(validation_fold_data.drop(columns=["row_index", "node", "is_root"]))
    y_val = validation_fold_data["is_root"]

    model = KNeighborsClassifier(**configuration)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    with open(fold_path / "metrics.json", "w") as f:
        json.dump({"accuracy": accuracy}, f, indent=4)
    logger.info(f"Fold {fold + 1}/{cross_validation_folds} - Accuracy: {accuracy:.4f}")

    del model

    return accuracy


models_path = Path("../data/models/knn/gridsearch")
cross_validation_folds = 5

average_duration = 0
duration_points = 0

row_indices = train_data["row_index"].unique()
# We'll separate based on row indices, because that's what we have now. Ideally
# we would separate based on sentence id, but we don't have that in the data now
for j, configuration in enumerate(grid):
    logger.info(f"Configuration {j + 1}/{len(grid)}: {configuration}")
    config_path = models_path / f"configuration-{j}"
    if config_path.exists():
        logger.info(f"Skipping configuration {j + 1}/{len(grid)}")
        continue
    config_path.mkdir(parents=True, exist_ok=True)
    with open(config_path / "configuration.json", "w") as f:
        json.dump(configuration, f, indent=4)

    accuracies = []
    conf_start = time.time()

    # Separate cross-validation data
    with ProcessPoolExecutor(max_workers=cross_validation_folds) as executor:
        jobs = []
        for fold in range(cross_validation_folds):
            fold_path = config_path / f"fold-{fold}"
            fold_path.mkdir(parents=True, exist_ok=True)

            fold_row_indices = row_indices[fold::cross_validation_folds]
            with open(fold_path / "fold_validation_row_indices.json", "w") as f:
                json.dump(fold_row_indices.tolist(), f)

            train_fold_data = train_data[~train_data["row_index"].isin(fold_row_indices)]
            validation_fold_data = train_data[train_data["row_index"].isin(fold_row_indices)]
            job = executor.submit(
                run_config_fold, configuration, fold, fold_path, train_fold_data, validation_fold_data
            )
            jobs.append(job)
        for job in jobs:
            accuracy = job.result()
            accuracies.append(accuracy)
    logger.info(f"Configuration {j + 1}/{len(grid)} - Average Accuracy: {sum(accuracies) / len(accuracies):.4f}")

    with open(config_path / "metrics.json", "w") as f:
        json.dump({"accuracies": accuracies, "average_accuracy": sum(accuracies) / len(accuracies)}, f, indent=4)

    logger.info(f"Configuration {j + 1}/{len(grid)} completed in {time.time() - conf_start:.2f} seconds.")
    missing_confs = len(grid) - (j + 1)
    duration_points = min(duration_points + 1, 10)  # At most 10 points to average over, otherwise too smooth.
    average_duration = (average_duration * (duration_points - 1) + (time.time() - conf_start)) / duration_points
    logger.info(f"Missing configurations: {missing_confs}. ETA: {average_duration * missing_confs:.2f} seconds.")

(2025-05-28 20:46:03) INFO # Configuration 1/80: {'n_neighbors': 2, 'weights': 'uniform', 'algorithm': 'auto', 'metric': 'minkowski', 'p': 1}


In [7]:
from collections import defaultdict
from sklearn.metrics import accuracy_score

X_val = one_hot_encode(validation_data.drop(columns=["row_index", "node", "is_root"]))
y_val = validation_data["is_root"]

predictions = model.predict(X_val)
print(f"Node-based accuracy: {accuracy_score(y_val, predictions):.2f}")

Node-based accuracy: 0.94


In [8]:
sentence_predictions = defaultdict(dict)
sentence_real_root = {}
probabilities = model.predict_proba(X_val)
for (_, row), probs in zip(validation_data.iterrows(), probabilities):
    sentence_predictions[row["row_index"]][int(row["node"])] = probs[1]
    if row["is_root"]:
        sentence_real_root[row["row_index"]] = row["node"]

if not set(sentence_predictions.keys()) == set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> str:
    """
    Get the predicted root node for a sentence.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs.keys(), key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
print(
    f"Sentence-based accuracy: {accuracy_score(validation_prediction['root'], validation_prediction['predicted_root']):.2f}"
)

Sentence-based accuracy: 0.27
