# Logistic regression, decision tree, random forest, extra tree implementations and optimizations

Import the packages

In [21]:
import sys
import os

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
import optuna

Prepare for preprocessing and Feature Engineering

In [22]:
# Automatically add the project root (1 level up) to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import preprocessor, VandalismScorer


Read in the train data and preprocess it

In [23]:
df_train = pd.read_csv(project_root+"/data/train.csv")
preprocessor(df_train)

Raw features including add_lines and deleted_lines

In [24]:
feature_cols = ["EditID", "user_edit_count", "user_warns", "num_recent_reversions", "num_edits_5d_before", "is_person", "added_lines", "deleted_lines"]

Tune and fit the models, and record the metric results

# Logistic regression

In [25]:
def train_logreg(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
    n_trials: int = 10
):
    baseline_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('logreg', LogisticRegression(max_iter=500, random_state=42))
    ])
    baseline_score = cross_val_score(
        baseline_model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1
    ).mean()
    print(f"Baseline {scoring} score: {baseline_score:.4f}")

    def objective(trial):
        params = {
            "C": trial.suggest_float("C", 1e-3, 1e3, log=True),
            "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear"]),
            "penalty": "l2",
            "max_iter": 500,
            "random_state": 42,
        }
        model = Pipeline([
            ('scorer', VandalismScorer(n_splits=5)),
            ('logreg', LogisticRegression(**params))
        ])
        score = cross_val_score(model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1).mean()
        return score

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Optuna Optimization Results")
    print("Best Accuracy:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    # Final evaluation
    best_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('logreg', LogisticRegression(**study.best_params))
    ])
    y_pred = cross_val_predict(best_model, predictor, target, cv=cv, n_jobs=-1)
    # All metrics
    accuracy = accuracy_score(target, y_pred)
    f1 = f1_score(target, y_pred)
    precision = precision_score(target, y_pred)
    recall = recall_score(target, y_pred)

    print(f"\nBest-tuned Logistic Regression metrics (cv mean):")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"F1       : {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")

    return study.best_params, study.best_value, accuracy, f1, precision, recall

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = train_logreg(
    predictor=df_train[feature_cols],
    target=df_train["isvandalism"],
    cv=cv,
    scoring="accuracy"
)

Logistic regression results before tuning threshold:

Baseline accuracy score: 0.8333
Optuna Optimization Results
Best Accuracy: 0.843715886167941
Best hyperparameters: {'C': 0.0029799440180166927, 'solver': 'lbfgs'}

Best-tuned Logistic Regression metrics (cv mean):
Accuracy : 0.8438
F1       : 0.8503
Precision: 0.7967
Recall   : 0.9117

In [None]:
best_param_log = {'C': 0.0029799440180166927, 'solver': 'lbfgs', 'max_iter': 500, 'random_state': 42, 'penalty': 'l2'}
best_logreg = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('log', LogisticRegression(**best_param_log))
    ])

oof_proba = np.zeros(len(df_train))
oof_index = np.zeros(len(df_train), dtype=bool)

for train_idx, valid_idx in cv.split(df_train[feature_cols], df_train['isvandalism']):
    X_train = df_train.iloc[train_idx][feature_cols]
    y_train = df_train.iloc[train_idx]['isvandalism']        
    X_valid = df_train.iloc[valid_idx][feature_cols]

    best_logreg.fit(X_train, y_train)
    oof_proba[valid_idx] = best_logreg.predict_proba(X_valid)[:, 1]
    oof_index[valid_idx] = True

assert oof_index.all()  


thresholds = np.linspace(0, 1, 101)
best_f1 = 0
best_thresh = 0.5
y_true = df_train['isvandalism']

for t in thresholds:
    y_pred = (oof_proba >= t).astype(int)
    f1 = f1_score(y_true, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t


y_pred = (oof_proba >= best_thresh).astype(int)
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Best threshold: {best_thresh:.2f}")
print(f"F1      : {f1:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall  : {recall:.4f}")

Best threshold: 0.43
F1      : 0.8491
Accuracy: 0.8348
Precision: 0.7646
Recall  : 0.9547


Logistic regression results after tunning threshold to maximize F1 score:

Best threshold: 0.43
F1      : 0.8491
Accuracy: 0.8348
Precision: 0.7646
Recall  : 0.9547

Note that the F1 score after tuning (to maximze F1) seems lower than before tuning. This is because tuning the threshold globally on OOF probabilities is intrinsically different from averaging per-fold F1 score (where in each fold only a part of the data is used).

# Decision tree

In [None]:
def train_tree(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
    n_trials: int = 10
):

    baseline_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('tree', DecisionTreeClassifier(random_state=42))
    ])
    baseline_score = cross_val_score(
        baseline_model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1
    ).mean()
    print(f"Baseline {scoring} score: {baseline_score:.4f}")

    def objective(trial):
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
            "random_state": 42,
        }
        model = Pipeline([
            ('scorer', VandalismScorer(n_splits=5)),
            ('tree', DecisionTreeClassifier(**params))
        ])
        score = cross_val_score(model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1).mean()
        return score

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Optuna Optimization Results")
    print("Best Accuracy:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    best_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('tree', DecisionTreeClassifier(**study.best_params))
    ])
    y_pred = cross_val_predict(best_model, predictor, target, cv=cv, n_jobs=-1)
    accuracy = accuracy_score(target, y_pred)
    f1 = f1_score(target, y_pred)
    precision = precision_score(target, y_pred)
    recall = recall_score(target, y_pred)

    print(f"\nBest-tuned Decision Tree metrics (cv mean):")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"F1       : {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")

    return study.best_params, study.best_value, accuracy, f1, precision, recall

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = train_tree(
    predictor=df_train[feature_cols],
    target=df_train["isvandalism"],
    cv=cv,
    scoring="accuracy"
)

Baseline accuracy score: 0.8680
Optuna Optimization Results
Best Accuracy: 0.899009137234841
Best hyperparameters: {'max_depth': 7, 'min_samples_leaf': 1, 'criterion': 'gini'}

Best-tuned Decision Tree metrics (cv mean):
Accuracy : 0.8990
F1       : 0.8945
Precision: 0.9103
Recall   : 0.8793


Decision tree results before tuning threshold:

Baseline accuracy score: 0.8680
Optuna Optimization Results
Best Accuracy: 0.899009137234841
Best hyperparameters: {'max_depth': 7, 'min_samples_leaf': 1, 'criterion': 'gini'}

Best-tuned Decision Tree metrics (cv mean):
Accuracy : 0.8990
F1       : 0.8945
Precision: 0.9103
Recall   : 0.8793

In [33]:
best_param_tree = {'max_depth': 7, 'min_samples_leaf': 1, 'criterion': 'gini', 'random_state': 42}
best_tree = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('tree', DecisionTreeClassifier(**best_param_tree))
    ])

oof_proba = np.zeros(len(df_train))
oof_index = np.zeros(len(df_train), dtype=bool)

for train_idx, valid_idx in cv.split(df_train[feature_cols], df_train['isvandalism']):
    X_train = df_train.iloc[train_idx][feature_cols]
    y_train = df_train.iloc[train_idx]['isvandalism']
    X_valid = df_train.iloc[valid_idx][feature_cols]

    best_tree.fit(X_train, y_train)
    oof_proba[valid_idx] = best_tree.predict_proba(X_valid)[:, 1]
    oof_index[valid_idx] = True

assert oof_index.all()  

thresholds = np.linspace(0, 1, 101)
best_f1 = 0
best_thresh = 0.5
y_true = df_train['isvandalism']

for t in thresholds:
    y_pred = (oof_proba >= t).astype(int)
    f1 = f1_score(y_true, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t


y_pred = (oof_proba >= best_thresh).astype(int)
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Best threshold: {best_thresh:.2f}")
print(f"F1      : {f1:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall  : {recall:.4f}")

Best threshold: 0.40
F1      : 0.8969
Accuracy: 0.8992
Precision: 0.8938
Recall  : 0.9000


Decision tree results after tuning threshold to maximize F1 score:

Best threshold: 0.40
F1      : 0.8969
Accuracy: 0.8992
Precision: 0.8938
Recall  : 0.9000

# Random forest

In [34]:
def train_rf(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
    n_trials: int = 10
):

    baseline_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
    ])
    baseline_score = cross_val_score(
        baseline_model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1
    ).mean()
    print(f"Baseline {scoring} score: {baseline_score:.4f}")

    def objective(trial):
        params = {
            "n_estimators": 100,
            "max_samples": 500,
            "bootstrap": True,
            "max_depth": trial.suggest_int("max_depth", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ['sqrt', 'log2', None]),
            "criterion": "gini",
            "random_state": 42,
            "n_jobs": -1,
        }
        model = Pipeline([
            ('scorer', VandalismScorer(n_splits=5)),
            ('rf', RandomForestClassifier(**params))
        ])
        score = cross_val_score(model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1).mean()
        return score

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Optuna Optimization Results")
    print("Best Accuracy:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    best_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('rf', RandomForestClassifier(**study.best_params))
    ])
    y_pred = cross_val_predict(best_model, predictor, target, cv=cv, n_jobs=-1)
    accuracy = accuracy_score(target, y_pred)
    f1 = f1_score(target, y_pred)
    precision = precision_score(target, y_pred)
    recall = recall_score(target, y_pred)

    print(f"\nBest-tuned Random Forest metrics (cv mean):")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"F1       : {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")

    return study.best_params, study.best_value, accuracy, f1, precision, recall

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = train_rf(
    predictor=df_train[feature_cols],
    target=df_train["isvandalism"],
    cv=cv,
    scoring="accuracy"
)

Baseline accuracy score: 0.8921
Optuna Optimization Results
Best Accuracy: 0.8986553173324424
Best hyperparameters: {'max_depth': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}

Best-tuned Random Forest metrics (cv mean):
Accuracy : 0.9001
F1       : 0.8952
Precision: 0.9153
Recall   : 0.8760


Random forest results before tuning threshold:

Baseline accuracy score: 0.8921
Optuna Optimization Results
Best Accuracy: 0.8986553173324424
Best hyperparameters: {'max_depth': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}

Best-tuned Random Forest metrics (cv mean):
Accuracy : 0.9001
F1       : 0.8952
Precision: 0.9153
Recall   : 0.8760

In [None]:
best_param_rf = {'max_depth': 8, 'min_samples_leaf': 2, 'max_features': 'log2', "n_estimators": 100,
            "max_samples": 500,
            "bootstrap": True,
            "criterion": "gini",
            "random_state": 42,
            "n_jobs": -1,}
best_rf = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('rf', RandomForestClassifier(**best_param_rf))
    ])

oof_proba = np.zeros(len(df_train))
oof_index = np.zeros(len(df_train), dtype=bool)

for train_idx, valid_idx in cv.split(df_train[feature_cols], df_train['isvandalism']):
    X_train = df_train.iloc[train_idx][feature_cols]
    y_train = df_train.iloc[train_idx]['isvandalism']
    X_valid = df_train.iloc[valid_idx][feature_cols]

    best_rf.fit(X_train, y_train)
    oof_proba[valid_idx] = best_rf.predict_proba(X_valid)[:, 1]
    oof_index[valid_idx] = True

assert oof_index.all()

thresholds = np.linspace(0, 1, 101)
best_f1 = 0
best_thresh = 0.5
y_true = df_train['isvandalism']

for t in thresholds:
    y_pred = (oof_proba >= t).astype(int)
    f1 = f1_score(y_true, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

y_pred = (oof_proba >= best_thresh).astype(int)
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Best threshold: {best_thresh:.2f}")
print(f"F1      : {f1:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall  : {recall:.4f}")

[Random Forest] Best threshold: 0.45
F1      : 0.8978
Accuracy: 0.8990
Precision: 0.8850
Recall  : 0.9109


Random forest results after tuning threshold:

Best threshold: 0.45
F1      : 0.8978
Accuracy: 0.8990
Precision: 0.8850
Recall  : 0.9109

# Extra tree

In [42]:
def train_et(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
    n_trials: int = 10
):

    baseline_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('et', ExtraTreesClassifier(random_state=42, n_jobs=-1))
    ])
    baseline_score = cross_val_score(
        baseline_model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1
    ).mean()
    print(f"Baseline {scoring} score: {baseline_score:.4f}")

    def objective(trial):
        params = {
            "n_estimators": 100,
            "max_depth": None, # previous optimization suggests that larger max_depth gives better accuracy
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ['sqrt', 'log2', None]),
            "criterion": "gini",
            "random_state": 42,
            "n_jobs": -1,
        }
        model = Pipeline([
            ('scorer', VandalismScorer(n_splits=5)),
            ('et', ExtraTreesClassifier(**params))
        ])
        score = cross_val_score(model, predictor, target, cv=cv, scoring=scoring, n_jobs=-1).mean()
        return score

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Optuna Optimization Results")
    print("Best Accuracy:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    best_model = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('et', ExtraTreesClassifier(**study.best_params))
    ])
    y_pred = cross_val_predict(best_model, predictor, target, cv=cv, n_jobs=-1)
    accuracy = accuracy_score(target, y_pred)
    f1 = f1_score(target, y_pred)
    precision = precision_score(target, y_pred)
    recall = recall_score(target, y_pred)

    print(f"\nBest-tuned Extra Trees metrics (cv mean):")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"F1       : {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")

    return study.best_params, study.best_value, accuracy, f1, precision, recall

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = train_et(
    predictor=df_train[feature_cols],
    target=df_train["isvandalism"],
    cv=cv,
    scoring="accuracy"
)

Extra tree results before tuning threshold:

Baseline accuracy score: 0.8827
Optuna Optimization Results
Best Accuracy: 0.895784398222742
Best hyperparameters: {'min_samples_leaf': 2, 'max_features': None}

Best-tuned Extra Trees metrics (cv mean):
Accuracy : 0.8961
F1       : 0.8917
Precision: 0.9049
Recall   : 0.8788

In [None]:
best_param_et = {
            "n_estimators": 100,
            "max_depth": None,
            "min_samples_leaf": 2,
            "max_features": None,
            "criterion": "gini",
            "random_state": 42,
            "n_jobs": -1,
        }
best_et = Pipeline([
        ('scorer', VandalismScorer(n_splits=5)),
        ('et', ExtraTreesClassifier(**best_param_et))
    ])

oof_proba = np.zeros(len(df_train))
oof_index = np.zeros(len(df_train), dtype=bool)

for train_idx, valid_idx in cv.split(df_train[feature_cols], df_train['isvandalism']):
    X_train = df_train.iloc[train_idx][feature_cols]
    y_train = df_train.iloc[train_idx]['isvandalism']
    X_valid = df_train.iloc[valid_idx][feature_cols]

    best_et.fit(X_train, y_train)
    oof_proba[valid_idx] = best_et.predict_proba(X_valid)[:, 1]
    oof_index[valid_idx] = True

assert oof_index.all()

thresholds = np.linspace(0, 1, 101)
best_f1 = 0
best_thresh = 0.5
y_true = df_train['isvandalism']

for t in thresholds:
    y_pred = (oof_proba >= t).astype(int)
    f1 = f1_score(y_true, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

y_pred = (oof_proba >= best_thresh).astype(int)
acc = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Best threshold: {best_thresh:.2f}")
print(f"F1      : {f1:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall  : {recall:.4f}")

[Extra Trees] Best threshold: 0.41
F1      : 0.8950
Accuracy: 0.8961
Precision: 0.8803
Recall  : 0.9103


Extra tree results after tuning threshold to maximize F1 score:

Best threshold: 0.41
F1      : 0.8950
Accuracy: 0.8961
Precision: 0.8803
Recall  : 0.9103