# Baseline model: 50-50 random-guessing dummy classifier

Import packages

In [3]:
import sys
import os

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier

Prepare for preprocessing and Feature Engineering

In [4]:
# Make the project root importable. The root is taken to be the parent of the
# current working directory, so this cell assumes the notebook is run from a
# folder that sits exactly one level below it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    # Appended (not prepended), so already-importable modules keep priority.
    sys.path.append(project_root)

# Project-local helpers; this import only resolves after the path tweak above.
from feature_engineer import preprocessor, VandalismScorer

Read in the train data and preprocess it

In [5]:
# Load the training set from <project_root>/data/train.csv. The path is built
# with os.path.join rather than string concatenation so the separator is
# handled by the standard library.
df_train = pd.read_csv(os.path.join(project_root, "data", "train.csv"))

# The return value is not captured, so later cells rely on whatever
# `preprocessor` does to `df_train` itself.
# NOTE(review): presumably it mutates the frame in place — confirm in feature_engineer.
preprocessor(df_train)

Raw features, including added_lines and deleted_lines

In [6]:
# Columns of df_train handed to the pipeline as model inputs.
feature_cols = [
    "EditID",
    "user_edit_count",
    "user_warns",
    "num_recent_reversions",
    "num_edits_5d_before",
    "is_person",
    "added_lines",
    "deleted_lines",
]

Initialize the cross-validation

In [7]:
# Cross-validation settings.
num_splits = 5  # number of stratified folds
num_models = 4  # not referenced in the cells shown here; kept for any later use

# Shuffled, label-stratified folds; the fixed seed makes the split reproducible.
kfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

In [8]:
# Per-fold hold-out metrics for the random-guessing baseline.
accs_dummy, f1s_dummy = [], []

# Stratify on the label so each fold keeps the overall class balance.
fold_splits = kfold.split(df_train[feature_cols], df_train["isvandalism"])

for fold, (train_index, test_index) in enumerate(fold_splits):
    # "tt" = training part of this fold, "ho" = hold-out part.
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    X_tt, y_tt = df_tt[feature_cols], df_tt["isvandalism"]
    X_ho, y_ho = df_ho[feature_cols], df_ho["isvandalism"]

    dummy_pipe = Pipeline([
        ("scorer", VandalismScorer(n_splits=4)),
        # strategy="uniform" draws each prediction uniformly at random from the
        # classes seen in fit; seeding with the fold index keeps every fold
        # reproducible while giving each fold a different random draw.
        ("dummy", DummyClassifier(strategy="uniform", random_state=fold)),
    ])

    # Fit on the training fold only. The uniform dummy ignores the feature
    # values, but the scorer step is fitted too, so fitting on the hold-out
    # fold would hand it the very labels it is later evaluated against.
    dummy_pipe.fit(X_tt, y_tt)
    dummy_pred = dummy_pipe.predict(X_ho)

    # Score strictly on rows the pipeline was not fitted on.
    acc = accuracy_score(y_ho, dummy_pred)
    f1 = f1_score(y_ho, dummy_pred)
    accs_dummy.append(acc)
    f1s_dummy.append(f1)
    

In [10]:
# Average the per-fold scores and report them to four decimal places.
mean_acc_dummy = sum(accs_dummy) / len(accs_dummy)
mean_f1_dummy = sum(f1s_dummy) / len(f1s_dummy)
print(f"\nDummy (random guessing) average accuracy={mean_acc_dummy:.4f}, average F1={mean_f1_dummy:.4f}")


Dummy (random guessing) average accuracy=0.4982, average F1=0.4915
