# Decision tree, random forest, and extra-trees classifiers, with logistic regression as a baseline

Import the packages

In [1]:
import sys
import os

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
import optuna

Set up the imports for preprocessing and feature engineering

In [2]:
# Make the project root (the parent of the current working directory)
# importable, so that the local `feature_engineer` package can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import preprocessor, VandalismScorer


Read in the train data and preprocess it

In [3]:
# Load the training set from <project_root>/data/train.csv. The path is
# assembled with os.path.join instead of string concatenation so that the
# separator is always the right one for the current platform.
df_train = pd.read_csv(os.path.join(project_root, "data", "train.csv"))
# NOTE(review): the return value is not assigned, so this presumably relies on
# `preprocessor` modifying df_train in place -- confirm in feature_engineer.
preprocessor(df_train)

Raw features, including added_lines and deleted_lines

In [11]:
# Input columns handed to every pipeline. "EditID" has to be part of the
# selection because VandalismScorer.fit looks that column up by name.
feature_cols = [
    "EditID",
    "user_edit_count",
    "user_warns",
    "num_recent_reversions",
    "num_edits_5d_before",
    "is_person",
    "added_lines",
    "deleted_lines",
]

Initialize the cross-validation

In [7]:
# Outer cross-validation shared by every model cell below: stratified,
# shuffled folds with a fixed seed, so each call to kfold.split on the same
# data yields the same partitions.
num_splits = 5
# Number of model families compared (not referenced by the cells visible here).
num_models = 4
kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

Define functions that tune the models using Optuna

In [8]:
## Objective functions for the models
def objective_logreg(trial, X, y):
    """Optuna objective for the VandalismScorer -> LogisticRegression pipeline.

    Samples ``C`` log-uniformly from [1e-3, 1e3] and ``solver`` from
    {'lbfgs', 'liblinear'}, then returns the mean accuracy of a 3-fold
    cross-validation of the resulting L2-penalised pipeline on (X, y).
    """
    inverse_reg = trial.suggest_float('C', 1e-3, 1e3, log=True)
    solver_name = trial.suggest_categorical('solver', ['lbfgs', 'liblinear'])

    scorer = VandalismScorer(n_splits=4)
    classifier = LogisticRegression(C=inverse_reg, solver=solver_name, penalty='l2', max_iter=500)
    candidate = Pipeline([('scorer', scorer), ('log', classifier)])

    fold_accuracies = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_accuracies.mean()

def objective_tree(trial, X, y):
    """Optuna objective for the VandalismScorer -> DecisionTreeClassifier pipeline.

    Samples ``max_depth`` from 2..20 and ``min_samples_leaf`` from 1..10, then
    returns the mean accuracy of a 3-fold cross-validation of the resulting
    pipeline on (X, y). The tree uses a fixed ``random_state`` of 42.
    """
    depth_limit = trial.suggest_int('max_depth', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)

    scorer = VandalismScorer(n_splits=4)
    tree = DecisionTreeClassifier(max_depth=depth_limit, min_samples_leaf=leaf_min, random_state=42)
    candidate = Pipeline([('scorer', scorer), ('tree', tree)])

    fold_accuracies = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_accuracies.mean()

def objective_rf(trial, X, y):
    """Optuna objective for the VandalismScorer -> RandomForestClassifier pipeline.

    Samples ``max_depth`` (2..20), ``min_samples_leaf`` (1..10) and
    ``max_features`` ('sqrt', 'log2' or None). The forest itself is fixed at
    100 trees with bootstrap=True, max_samples=500, random_state=42 and
    n_jobs=-1. Returns the mean accuracy of a 3-fold cross-validation of the
    pipeline on (X, y).
    """
    depth_limit = trial.suggest_int('max_depth', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    feature_rule = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    scorer = VandalismScorer(n_splits=4)
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=depth_limit,
        min_samples_leaf=leaf_min,
        max_features=feature_rule,
        bootstrap=True,
        max_samples=500,
        random_state=42,
        n_jobs=-1,
    )
    candidate = Pipeline([('scorer', scorer), ('rf', forest)])

    fold_accuracies = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_accuracies.mean()

def objective_et(trial, X, y):
    """Optuna objective for the VandalismScorer -> ExtraTreesClassifier pipeline.

    Samples ``max_depth`` (2..20), ``min_samples_leaf`` (1..10) and
    ``max_features`` ('sqrt', 'log2' or None). The ensemble itself is fixed at
    100 trees with bootstrap=True, max_samples=500, random_state=42 and
    n_jobs=-1. Returns the mean accuracy of a 3-fold cross-validation of the
    pipeline on (X, y).
    """
    depth_limit = trial.suggest_int('max_depth', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    feature_rule = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    scorer = VandalismScorer(n_splits=4)
    extra_trees = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=depth_limit,
        min_samples_leaf=leaf_min,
        max_features=feature_rule,
        bootstrap=True,
        max_samples=500,
        random_state=42,
        n_jobs=-1,
    )
    candidate = Pipeline([('scorer', scorer), ('et', extra_trees)])

    fold_accuracies = cross_val_score(candidate, X, y, cv=3, scoring='accuracy')
    return fold_accuracies.mean()

## --------- Functions to run tuning for each model ---------
def tune_model(objective_fn, X, y, n_trials=15, seed=42):
    """Run an Optuna study for one model and return its best hyper-parameters.

    Parameters
    ----------
    objective_fn : callable
        Called as ``objective_fn(trial, X, y)``; must return the score to
        maximise.
    X, y
        Training features and labels, forwarded unchanged to ``objective_fn``.
    n_trials : int, default 15
        Number of Optuna trials to run.
    seed : int or None, default 42
        Seed for the TPE sampler. A fixed seed makes the search repeatable,
        in line with the ``random_state=42`` used by the models and the CV
        splitter. Pass ``None`` to get an unseeded sampler.

    Returns
    -------
    dict
        ``study.best_params`` -- the parameter values of the best trial.
    """
    def obj(trial):
        # Bind the data so Optuna only has to supply the trial object.
        return objective_fn(trial, X, y)

    # Same sampler family Optuna uses by default for single-objective
    # studies, but seeded so that reruns explore the same trial sequence.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(obj, n_trials=n_trials)
    return study.best_params

Tune and fit the models, and record the metric results

In [12]:
## Baseline: logistic regression with no penalty and no hyper-parameter tuning.
## NOTE: the tuned logistic-regression cell assigns the same
## accs_logreg / f1s_logreg names, so whichever cell runs last wins.

accs_logreg = []
f1s_logreg = []

## one iteration per stratified fold
for train_index, test_index in kfold.split(df_train[feature_cols], df_train['isvandalism']):
    ## cv training set and cv holdout set
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]

    logistic_pipe = Pipeline(steps=[
        ('scorer', VandalismScorer(n_splits=4)),
        ('log', LogisticRegression(max_iter=500, penalty=None)),
    ])

    ## fit on the training fold, predict the holdout fold
    logistic_pipe.fit(df_tt[feature_cols], df_tt['isvandalism'])
    log_pred = logistic_pipe.predict(df_ho[feature_cols])

    ## per-fold metrics
    acc = accuracy_score(df_ho['isvandalism'], log_pred)
    f1 = f1_score(df_ho['isvandalism'], log_pred)
    accs_logreg.append(acc)
    f1s_logreg.append(f1)

Logistic regression

In [9]:
accs_logreg = []
f1s_logreg = []

## one iteration per stratified fold
for train_index, test_index in kfold.split(df_train[feature_cols], df_train['isvandalism']):
    ## cv training set and cv holdout set
    df_tt, df_ho = df_train.iloc[train_index], df_train.iloc[test_index]

    ## hyper-parameters are searched on the training fold only
    logreg_params = tune_model(objective_logreg, df_tt[feature_cols], df_tt['isvandalism'])

    logistic_pipe = Pipeline(steps=[
        ('scorer', VandalismScorer(n_splits=4)),
        ('log', LogisticRegression(max_iter=500, **logreg_params)),
    ])

    ## refit with the best parameters, then predict the holdout fold
    logistic_pipe.fit(df_tt[feature_cols], df_tt['isvandalism'])
    log_pred = logistic_pipe.predict(df_ho[feature_cols])

    ## per-fold metrics
    acc = accuracy_score(df_ho['isvandalism'], log_pred)
    f1 = f1_score(df_ho['isvandalism'], log_pred)
    accs_logreg.append(acc)
    f1s_logreg.append(f1)

[I 2025-06-26 19:27:48,898] A new study created in memory with name: no-name-dd0f9c0a-f8cd-4d3f-9003-5543c8787fdf


[W 2025-06-26 19:28:02,519] Trial 0 failed with parameters: {'C': 0.05356926665586274, 'solver': 'liblinear'} because of the following error: ValueError('\nAll the 3 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n3 fits failed with the following error:\nTraceback (most recent call last):\n  File "c:\\Users\\Zihao\\anaconda3\\envs\\hoax-detection\\Lib\\site-packages\\pandas\\core\\indexes\\base.py", line 3805, in get_loc\n    return self._engine.get_loc(casted_key)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc\n  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc\n  File "pandas\\\\_libs\\\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item\n  Fi

ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'EditID'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\Documents\GitHub\summer-2025-hoax-detection-local\feature_engineer\vandalism_scorer.py", line 111, in fit
    self.EditID_to_classifier_index[self.X_train_.iat[idx, self.X_train_.columns.get_loc('EditID')]] = i
                                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Zihao\anaconda3\envs\hoax-detection\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'EditID'


Decision tree

In [None]:
accs_tree, f1s_tree = [], []

## loop through the kfold here
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    ## cv training set
    df_tt = df_train.iloc[train_index]

    ## cv holdout set
    df_ho = df_train.iloc[test_index]

    ## hyper-parameters are searched on the training fold only
    tree_params = tune_model(objective_tree, df_tt[feature_cols], df_tt['isvandalism'])
    tree_pipe = Pipeline([
        ('scorer', VandalismScorer(n_splits=4)),
        ('tree', DecisionTreeClassifier(random_state=42, **tree_params))
    ])
    tree_pipe.fit(df_tt[feature_cols], df_tt['isvandalism'])
    tree_pred = tree_pipe.predict(df_ho[feature_cols])

    ## holdout labels are read straight from df_ho; no `y_ho` variable is
    ## defined anywhere in this notebook, so referencing it raised NameError
    acc = accuracy_score(df_ho['isvandalism'], tree_pred)
    f1 = f1_score(df_ho['isvandalism'], tree_pred)
    accs_tree.append(acc)
    f1s_tree.append(f1)

Random forest

In [None]:
accs_rf, f1s_rf = [], []

## loop through the kfold here
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    ## cv training set
    df_tt = df_train.iloc[train_index]

    ## cv holdout set
    df_ho = df_train.iloc[test_index]

    ## hyper-parameters are searched on the training fold only
    rf_params = tune_model(objective_rf, df_tt[feature_cols], df_tt['isvandalism'])
    rf_pipe = Pipeline([
        ('scorer', VandalismScorer(n_splits=4)),
        ('rf', RandomForestClassifier(
            n_estimators=100,  # You can increase after tuning for production
            bootstrap=True,
            max_samples=500,
            random_state=42,
            n_jobs=-1,
            **rf_params
        ))
    ])
    rf_pipe.fit(df_tt[feature_cols], df_tt['isvandalism'])
    rf_pred = rf_pipe.predict(df_ho[feature_cols])

    ## holdout labels are read straight from df_ho; no `y_ho` variable is
    ## defined anywhere in this notebook, so referencing it raised NameError
    acc = accuracy_score(df_ho['isvandalism'], rf_pred)
    f1 = f1_score(df_ho['isvandalism'], rf_pred)
    accs_rf.append(acc)
    f1s_rf.append(f1)

Extra trees

In [None]:
accs_et, f1s_et = [], []

## loop through the kfold here
for train_index, test_index in kfold.split(df_train[feature_cols], df_train.isvandalism):
    ## cv training set
    df_tt = df_train.iloc[train_index]

    ## cv holdout set
    df_ho = df_train.iloc[test_index]

    ## hyper-parameters are searched on the training fold only
    et_params = tune_model(objective_et, df_tt[feature_cols], df_tt['isvandalism'])
    et_pipe = Pipeline([
        ('scorer', VandalismScorer(n_splits=4)),
        ('et', ExtraTreesClassifier(
            n_estimators=100,
            bootstrap=True,
            max_samples=500,
            random_state=42,
            n_jobs=-1,
            **et_params
        ))
    ])
    et_pipe.fit(df_tt[feature_cols], df_tt['isvandalism'])
    et_pred = et_pipe.predict(df_ho[feature_cols])

    ## holdout labels are read straight from df_ho; no `y_ho` variable is
    ## defined anywhere in this notebook, so referencing it raised NameError
    acc = accuracy_score(df_ho['isvandalism'], et_pred)
    f1 = f1_score(df_ho['isvandalism'], et_pred)
    accs_et.append(acc)
    f1s_et.append(f1)

Print out (average) accuracy scores and F1 scores

In [None]:
def _mean(values):
    """Arithmetic mean of a non-empty list of per-fold scores."""
    return sum(values)/len(values)

# One line per model: accuracy and F1 averaged over the CV folds.
print(f"LogReg:   accuracy={_mean(accs_logreg):.4f}, F1={_mean(f1s_logreg):.4f}")
print(f"Tree:     accuracy={_mean(accs_tree):.4f}, F1={_mean(f1s_tree):.4f}")
print(f"RF:       accuracy={_mean(accs_rf):.4f}, F1={_mean(f1s_rf):.4f}")
print(f"ET:       accuracy={_mean(accs_et):.4f}, F1={_mean(f1s_et):.4f}")