In [10]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from collections import defaultdict
from sklearn.metrics import accuracy_score

## Load the data

In [4]:
validation_unwound = pd.read_csv("../data/cache/validation_unwound.csv")
training_unwound = pd.read_csv("../data/cache/training_unwound.csv")

In [None]:
X_train = training_unwound.drop(columns=["row_index", "node", "is_root"])
X_train = pd.get_dummies(X_train, columns=["language"], drop_first=False)
y_train = training_unwound["is_root"]


X_val = validation_unwound.drop(
    columns=[
        "row_index",
        "node",
        "is_root",
    ]
)
X_val = pd.get_dummies(X_val, columns=["language"], drop_first=False)
y_val = validation_unwound["is_root"]

## LDA

In [6]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

predictions = lda.predict(X_val)
print(f"Node-based accuracy: {accuracy_score(y_val, predictions):.2f}")


sentence_predictions = defaultdict(dict)
probs = lda.predict_proba(X_val)

sentence_real_root = {}
for (_, row), pred in zip(validation_unwound.iterrows(), probs):
    sentence_predictions[row["row_index"]][row["node"]] = pred[1]
    if row["is_root"]:
        sentence_real_root[row["row_index"]] = row["node"]

if not set(sentence_predictions.keys()) == set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> str:
    """
    Get the predicted root node for a sentence.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs.keys(), key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
print(
    f"Sentence-based accuracy: {accuracy_score(validation_prediction['root'], validation_prediction['predicted_root']):.2f}"
)

Node-based accuracy: 0.93
Sentence-based accuracy: 0.26


## QDA

In [None]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)

predictions = qda.predict(X_val)
print(f"Node-based accuracy: {accuracy_score(y_val, predictions):.2f}")

sentence_predictions = defaultdict(dict)
probs = qda.predict_proba(X_val)

sentence_real_root = {}
for (_, row), pred in zip(validation_unwound.iterrows(), probs):
    sentence_predictions[row["row_index"]][row["node"]] = pred[1]
    if row["is_root"]:
        sentence_real_root[row["row_index"]] = row["node"]

if not set(sentence_predictions.keys()) == set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> str:
    """
    Get the predicted root node for a sentence.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs.keys(), key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
print(
    f"Sentence-based accuracy: {accuracy_score(validation_prediction['root'], validation_prediction['predicted_root']):.2f}"
)



Node-based accuracy: 0.52
Sentence-based accuracy: 0.41


## Naive Bayes

In [8]:
nb = GaussianNB()
nb.fit(X_train, y_train)


predictions = nb.predict(X_val)
print(f"Node-based accuracy: {accuracy_score(y_val, predictions):.3f}")


sentence_predictions = defaultdict(dict)
probs = nb.predict_proba(X_val)

sentence_real_root = {}
for (_, row), pred in zip(validation_unwound.iterrows(), probs):
    sentence_predictions[row["row_index"]][row["node"]] = pred[1]
    if row["is_root"]:
        sentence_real_root[row["row_index"]] = row["node"]

if not set(sentence_predictions.keys()) == set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> str:
    """
    Get the predicted root node for a sentence.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs.keys(), key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
print(
    f"Sentence-based accuracy: {accuracy_score(validation_prediction['root'], validation_prediction['predicted_root']):.3f}"
)

Node-based accuracy: 0.784
Sentence-based accuracy: 0.289


In [None]:
# cross validate and grid search the smoothing paramter of naive bayes
from sklearn.model_selection import GridSearchCV

param_grid = {"var_smoothing": [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}
grid_search = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)
print(f"Best parameters for Naive Bayes: {grid_search.best_params_}")
nb_best = grid_search.best_estimator_

nb = GaussianNB(var_smoothing=0)
nb.fit(X_train, y_train)
predictions = nb.predict(X_val)
print(f"Node-based accuracy after grid search: {accuracy_score(y_val, predictions):.2f}")
sentence_predictions = defaultdict(dict)
probs = nb.predict_proba(X_val)
sentence_real_root = {}
for (_, row), pred in zip(validation_unwound.iterrows(), probs):
    sentence_predictions[row["row_index"]][row["node"]] = pred[1]
    if row["is_root"]:
        sentence_real_root[row["row_index"]] = row["node"]
if not set(sentence_predictions.keys()) == set(sentence_real_root.keys()):
    raise ValueError("Mismatch between sentence predictions and real roots.")


def get_predicted_root(row: pd.Series) -> str:
    """
    Get the predicted root node for a sentence.
    """
    sentence_id = row.name
    probs = sentence_predictions[sentence_id]
    return max(probs.keys(), key=probs.get)


validation_prediction = pd.DataFrame.from_dict(sentence_real_root, orient="index", columns=["root"])
validation_prediction["predicted_root"] = validation_prediction.apply(get_predicted_root, axis=1)
print(
    f"Sentence-based accuracy after grid search: {accuracy_score(validation_prediction['root'], validation_prediction['predicted_root']):.2f}"
)

Best parameters for Naive Bayes: {'var_smoothing': 0.1}
Node-based accuracy after grid search: 0.78
Sentence-based accuracy after grid search: 0.29
