# Decision tree, random forest, and extra-trees classifiers, with logistic regression as a baseline

Import the packages

In [None]:
import sys
import os

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
import optuna

Prepare for preprocessing and Feature Engineering

In [None]:
# Put the parent of the current working directory on sys.path (once) so that
# the `feature_engineer` import below can be resolved from there.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import (
    VandalismScorer,
    preprocessor,
)


Read in the train data and preprocess it

In [None]:
# Training split lives under <project_root>/data.
train_csv_path = project_root + "/data/train.csv"
df_train = pd.read_csv(train_csv_path)

# NOTE(review): the return value is not captured, so this presumably modifies
# df_train in place -- confirm against feature_engineer.preprocessor.
preprocessor(df_train)

Raw features, including added_lines and deleted_lines

In [None]:
# Columns handed to every pipeline below (order is preserved).
feature_cols = [
    "EditID",
    "user_edit_count",
    "user_warns",
    "num_recent_reversions",
    "num_edits_5d_before",
    "is_person",
    "added_lines",
    "deleted_lines",
]

Initialize the cross-validation

In [None]:
# Outer cross-validation configuration: shuffled, stratified, fixed seed.
num_splits = 5
num_models = 4  # NOTE(review): not referenced by any cell visible here
kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

Define functions that tune the models using Optuna

In [None]:
## Objective functions for the models
def objective_logreg(trial, X, y):
    """Optuna objective for the scorer + L2 logistic-regression pipeline.

    Parameters
    ----------
    trial : object exposing ``suggest_float`` / ``suggest_categorical``
        Used to draw ``C`` (log-uniform in [1e-3, 1e3]) and ``solver``
        (``'lbfgs'`` or ``'liblinear'``), in that order.
    X, y :
        Features and labels, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean of the 3-fold cross-validated accuracy scores.
    """
    # Dict literals evaluate left to right, so C is still drawn before solver.
    sampled = {
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear']),
    }
    scorer = VandalismScorer(n_splits=4)
    classifier = LogisticRegression(penalty='l2', max_iter=500, **sampled)
    candidate = Pipeline([('scorer', scorer), ('log', classifier)])
    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_scores.mean()

def objective_tree(trial, X, y):
    """Optuna objective for the scorer + decision-tree pipeline.

    Parameters
    ----------
    trial : object exposing ``suggest_int``
        Used to draw ``max_depth`` (2..20) and then ``min_samples_leaf`` (1..10).
    X, y :
        Features and labels, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean of the 3-fold cross-validated accuracy scores.
    """
    # Draw order is kept: max_depth first, then min_samples_leaf.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }
    scorer = VandalismScorer(n_splits=4)
    classifier = DecisionTreeClassifier(random_state=42, **sampled)
    candidate = Pipeline([('scorer', scorer), ('tree', classifier)])
    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_scores.mean()

def objective_rf(trial, X, y):
    """Optuna objective for the scorer + random-forest pipeline.

    Parameters
    ----------
    trial : object exposing ``suggest_int`` / ``suggest_categorical``
        Used to draw, in order, ``max_depth`` (2..20), ``min_samples_leaf``
        (1..10) and ``max_features`` (``'sqrt'``, ``'log2'`` or ``None``).
    X, y :
        Features and labels, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean of the 3-fold cross-validated accuracy scores.
    """
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    # Settings held constant across trials (not part of the search space).
    fixed = dict(
        n_estimators=100,
        bootstrap=True,
        max_samples=500,
        random_state=42,
        n_jobs=-1,
    )
    scorer = VandalismScorer(n_splits=4)
    forest = RandomForestClassifier(**fixed, **sampled)
    candidate = Pipeline([('scorer', scorer), ('rf', forest)])
    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_scores.mean()

def objective_et(trial, X, y):
    """Optuna objective for the scorer + extra-trees pipeline.

    Parameters
    ----------
    trial : object exposing ``suggest_int`` / ``suggest_categorical``
        Used to draw, in order, ``max_depth`` (2..20), ``min_samples_leaf``
        (1..10) and ``max_features`` (``'sqrt'``, ``'log2'`` or ``None``).
    X, y :
        Features and labels, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean of the 3-fold cross-validated accuracy scores.
    """
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }
    # Settings held constant across trials (not part of the search space).
    fixed = dict(
        n_estimators=100,
        bootstrap=True,
        max_samples=500,
        random_state=42,
        n_jobs=-1,
    )
    scorer = VandalismScorer(n_splits=4)
    ensemble = ExtraTreesClassifier(**fixed, **sampled)
    candidate = Pipeline([('scorer', scorer), ('et', ensemble)])
    fold_scores = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_scores.mean()

## --------- Functions to run tuning for each model ---------
def tune_model(objective_fn, X, y, n_trials=3, seed=42):
    """Run an Optuna study that maximises ``objective_fn`` and return the best parameters.

    Parameters
    ----------
    objective_fn : callable
        Called as ``objective_fn(trial, X, y)``; must return the score to maximise.
    X, y :
        Data forwarded unchanged to ``objective_fn`` on every trial.
    n_trials : int, default 3
        Number of trials. Kept small for runtime reasons: inside a 5-fold outer
        cross-validation this already amounts to 15 trials per model.
    seed : int or None, default 42
        Seed for the TPE sampler so that repeated runs explore the same
        hyperparameters. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    dict
        ``study.best_params`` of the finished study.
    """
    def _objective(trial):
        return objective_fn(trial, X, y)

    # TPESampler is what Optuna uses by default for single-objective studies;
    # it is instantiated explicitly here only so that its seed can be pinned.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(_objective, n_trials=n_trials)
    return study.best_params

Tune and fit the models, and record the metric results

A naive logistic regression with no hyperparameter tuning

In [None]:
accs_logreg, f1s_logreg = [], []

# Baseline: fit an unpenalised logistic regression on each outer training fold
# and score it on the matching holdout fold.
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    # Rows used for fitting (df_tt) and for evaluation (df_ho) in this fold.
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]
    X_tt, y_tt = df_tt[feature_cols], df_tt['isvandalism']
    X_ho, y_ho = df_ho[feature_cols], df_ho['isvandalism']

    logistic_steps = [
        ('scorer', VandalismScorer(n_splits=4)),
        ('log', LogisticRegression(max_iter=500, penalty=None)),
    ]
    logistic_pipe = Pipeline(logistic_steps)
    logistic_pipe.fit(X_tt, y_tt)

    log_pred = logistic_pipe.predict(X_ho)
    acc, f1 = accuracy_score(y_ho, log_pred), f1_score(y_ho, log_pred)
    accs_logreg.append(acc)
    f1s_logreg.append(f1)

In [None]:
# Per-fold accuracy on the first line, per-fold F1 on the second.
print(accs_logreg, f1s_logreg, sep="\n")

Logistic regression (hyperparameters tuned with Optuna)

In [None]:
accs_logreg, f1s_logreg = [], []

# Nested CV: tune on each outer training fold, refit with the best parameters,
# then score on the matching holdout fold.
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    # Rows used for tuning/fitting (df_tt) and for evaluation (df_ho) in this fold.
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]
    y_tt, y_ho = df_tt['isvandalism'], df_ho['isvandalism']

    # Tuning only ever sees the training fold.
    logreg_params = tune_model(objective_logreg, df_tt[feature_cols], y_tt)

    logistic_steps = [
        ('scorer', VandalismScorer(n_splits=4)),
        ('log', LogisticRegression(max_iter=500, **logreg_params)),
    ]
    logistic_pipe = Pipeline(logistic_steps)
    logistic_pipe.fit(df_tt[feature_cols], y_tt)

    log_pred = logistic_pipe.predict(df_ho[feature_cols])
    acc, f1 = accuracy_score(y_ho, log_pred), f1_score(y_ho, log_pred)
    accs_logreg.append(acc)
    f1s_logreg.append(f1)

[I 2025-06-26 20:38:27,559] A new study created in memory with name: no-name-d7c0114c-5ba4-49d7-91c2-c755bec60cd5
[I 2025-06-26 20:40:45,171] Trial 0 finished with value: 0.8285811170287968 and parameters: {'C': 64.6198033948995, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8285811170287968.
[I 2025-06-26 20:43:18,224] Trial 1 finished with value: 0.8314322423533742 and parameters: {'C': 0.022524555913570145, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8314322423533742.
[I 2025-06-26 20:45:47,493] Trial 2 finished with value: 0.7745550824017075 and parameters: {'C': 0.0015336617889900332, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8314322423533742.
[I 2025-06-26 20:47:01,884] A new study created in memory with name: no-name-a2c9c902-318a-4850-8b1b-42d9e968717a
[I 2025-06-26 20:49:11,061] Trial 0 finished with value: 0.8259263295253366 and parameters: {'C': 0.05481854518515586, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8259263295253366.
[I 2025-06-26 20:51:

Decision tree

In [None]:
accs_tree, f1s_tree = [], []

# Nested CV: tune on each outer training fold, refit with the best parameters,
# then score on the matching holdout fold.
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    # Rows used for tuning/fitting (df_tt) and for evaluation (df_ho) in this fold.
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]
    y_tt, y_ho = df_tt['isvandalism'], df_ho['isvandalism']

    # Tuning only ever sees the training fold.
    tree_params = tune_model(objective_tree, df_tt[feature_cols], y_tt)

    tree_steps = [
        ('scorer', VandalismScorer(n_splits=4)),
        ('tree', DecisionTreeClassifier(random_state=42, **tree_params)),
    ]
    tree_pipe = Pipeline(tree_steps)
    tree_pipe.fit(df_tt[feature_cols], y_tt)

    tree_pred = tree_pipe.predict(df_ho[feature_cols])
    acc, f1 = accuracy_score(y_ho, tree_pred), f1_score(y_ho, tree_pred)
    accs_tree.append(acc)
    f1s_tree.append(f1)

[I 2025-06-26 21:19:12,980] A new study created in memory with name: no-name-002e92fd-3b75-41c6-b6a4-5b959de4cdfd
[I 2025-06-26 21:21:21,766] Trial 0 finished with value: 0.8957821734828807 and parameters: {'max_depth': 8, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8957821734828807.
[I 2025-06-26 21:23:32,918] Trial 1 finished with value: 0.8794611715857701 and parameters: {'max_depth': 14, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8957821734828807.
[I 2025-06-26 21:25:43,988] Trial 2 finished with value: 0.8737095537200724 and parameters: {'max_depth': 20, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8957821734828807.
[I 2025-06-26 21:26:47,753] A new study created in memory with name: no-name-e6701ee9-5a5d-4f63-a142-17b11d540c29
[I 2025-06-26 21:28:54,032] Trial 0 finished with value: 0.8856059798642079 and parameters: {'max_depth': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8856059798642079.
[I 2025-06-26 21:31:00,605] Trial 1 finished w

Random forest

In [None]:
accs_rf, f1s_rf = [], []

# Forest settings that stay fixed; the tuned values are merged in per fold.
# n_estimators can be increased after tuning for production.
rf_fixed = dict(
    n_estimators=100,
    bootstrap=True,
    max_samples=500,
    random_state=42,
    n_jobs=-1,
)

# Nested CV: tune on each outer training fold, refit with the best parameters,
# then score on the matching holdout fold.
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    # Rows used for tuning/fitting (df_tt) and for evaluation (df_ho) in this fold.
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]
    y_tt, y_ho = df_tt['isvandalism'], df_ho['isvandalism']

    # Tuning only ever sees the training fold.
    rf_params = tune_model(objective_rf, df_tt[feature_cols], y_tt)

    rf_steps = [
        ('scorer', VandalismScorer(n_splits=4)),
        ('rf', RandomForestClassifier(**rf_fixed, **rf_params)),
    ]
    rf_pipe = Pipeline(rf_steps)
    rf_pipe.fit(df_tt[feature_cols], y_tt)

    rf_pred = rf_pipe.predict(df_ho[feature_cols])
    acc, f1 = accuracy_score(y_ho, rf_pred), f1_score(y_ho, rf_pred)
    accs_rf.append(acc)
    f1s_rf.append(f1)

[I 2025-06-26 21:59:31,004] A new study created in memory with name: no-name-d7910f05-1dbb-4538-8425-c3332bb44446
[I 2025-06-26 22:01:42,605] Trial 0 finished with value: 0.8945530537708969 and parameters: {'max_depth': 9, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8945530537708969.
[I 2025-06-26 22:03:54,620] Trial 1 finished with value: 0.8988789965948293 and parameters: {'max_depth': 17, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 1 with value: 0.8988789965948293.
[I 2025-06-26 22:06:03,758] Trial 2 finished with value: 0.8966176605145065 and parameters: {'max_depth': 13, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 1 with value: 0.8988789965948293.
[I 2025-06-26 22:07:19,967] A new study created in memory with name: no-name-81a7946b-a4ac-4c02-8b07-91ae53da575a
[I 2025-06-26 22:09:46,157] Trial 0 finished with value: 0.8888996918192236 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_features': None}. Be

Extra trees

In [None]:
accs_et, f1s_et = [], []

# Ensemble settings that stay fixed; the tuned values are merged in per fold.
et_fixed = dict(
    n_estimators=100,
    bootstrap=True,
    max_samples=500,
    random_state=42,
    n_jobs=-1,
)

# Nested CV: tune on each outer training fold, refit with the best parameters,
# then score on the matching holdout fold.
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    # Rows used for tuning/fitting (df_tt) and for evaluation (df_ho) in this fold.
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]
    y_tt, y_ho = df_tt['isvandalism'], df_ho['isvandalism']

    # Tuning only ever sees the training fold.
    et_params = tune_model(objective_et, df_tt[feature_cols], y_tt)

    et_steps = [
        ('scorer', VandalismScorer(n_splits=4)),
        ('et', ExtraTreesClassifier(**et_fixed, **et_params)),
    ]
    et_pipe = Pipeline(et_steps)
    et_pipe.fit(df_tt[feature_cols], y_tt)

    et_pred = et_pipe.predict(df_ho[feature_cols])
    acc, f1 = accuracy_score(y_ho, et_pred), f1_score(y_ho, et_pred)
    accs_et.append(acc)
    f1s_et.append(f1)

[I 2025-06-26 22:52:38,867] A new study created in memory with name: no-name-d88c6246-8731-4b68-ad18-31f2a8ffd380
[I 2025-06-26 22:54:55,798] Trial 0 finished with value: 0.8664829225972217 and parameters: {'max_depth': 17, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 0 with value: 0.8664829225972217.
[I 2025-06-26 22:56:48,341] Trial 1 finished with value: 0.7955952102852017 and parameters: {'max_depth': 8, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 0.8664829225972217.
[I 2025-06-26 22:58:40,906] Trial 2 finished with value: 0.8068035840207695 and parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 0 with value: 0.8664829225972217.
[I 2025-06-26 22:59:36,727] A new study created in memory with name: no-name-e9156889-4e3d-4bca-a76c-0b9ca7a5454c
[I 2025-06-26 23:01:25,988] Trial 0 finished with value: 0.8486871974545106 and parameters: {'max_depth': 12, 'min_samples_leaf': 4, 'max_features': None}. Bes

Print out (average) accuracy scores and F1 scores

In [None]:
def _fold_mean(scores):
    """Arithmetic mean of the per-fold scores (expects a non-empty list)."""
    return sum(scores) / len(scores)

# One line per model: mean accuracy and mean F1 over the outer CV folds.
print(f"LogReg:   accuracy={_fold_mean(accs_logreg):.4f}, F1={_fold_mean(f1s_logreg):.4f}")
print(f"Tree:     accuracy={_fold_mean(accs_tree):.4f}, F1={_fold_mean(f1s_tree):.4f}")
print(f"RF:       accuracy={_fold_mean(accs_rf):.4f}, F1={_fold_mean(f1s_rf):.4f}")
print(f"ET:       accuracy={_fold_mean(accs_et):.4f}, F1={_fold_mean(f1s_et):.4f}")

LogReg:   accuracy=0.8360, F1=0.8395
Tree:     accuracy=0.9014, F1=0.8970
RF:       accuracy=0.8999, F1=0.8957
ET:       accuracy=0.8263, F1=0.7943
