In [None]:
import sys
import os
import pandas as pd
import numpy as np

!pip install catboost
import catboost
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import shap

#sklearn
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Extend the import search path so the `feature_engineer` import below can be
# resolved: first the project checkout on the mounted Drive ...
sys.path.append('/content/drive/MyDrive/Erdos/Project/summer-2025-hoax-detection/')

# ... then the parent of the current working directory, added at most once.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from feature_engineer import (
    VandalismScorer,
    is_IP,
    account_age,
    comment_empty,
    word_count,
    preprocessor
)
#optuna
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [optuna]2m3/4[0m [optuna]
[1A[2KSuccessfully installed Mako-1.3.10 alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [5]:
# Load the training split, then hand the frame to the project's preprocessor.
# The return value of preprocessor() is not captured here.
df = pd.read_csv(filepath_or_buffer="Data/train.csv")
preprocessor(df)

In [6]:
# Display the installed Optuna version.
optuna.__version__

'4.4.0'

In [7]:
#Baseline Score

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen set of columns.

    Parameters
    ----------
    features
        Column key(s) used to index the incoming data in ``transform``.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``features``."""
        wanted = self.features
        return X[wanted]


# Columns fed to the classifier; 'vandalism_score' is selected after the
# 'scorer' step has run.
nfeatures = [
    'user_edit_count',
    'user_distinct_pages',
    'user_warns',
    'num_edits_5d_before',
    'is_person',
    'current_minor',
    'account_age',
    'comment_empty',
    'is_IP',
    'word_count_added',
    'word_count_deleted',
    'vandalism_score',
]

# Scorer -> column selection -> CatBoost with library-default hyperparameters.
pipe = Pipeline(steps=[
    ('scorer', VandalismScorer(n_splits=5, random_state=42)),
    ('select', FeatureSelector(nfeatures)),
    ('model', CatBoostClassifier(verbose=0, random_state=42)),
])

cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Mean accuracy over the 5 folds; the whole pipeline is refit inside each fold.
baseline_score = cross_val_score(
    estimator=pipe,
    X=df.copy(),
    y=df['isvandalism'].copy(),
    scoring='accuracy',
    cv=cv,
).mean()

print(f"Baseline accuracy score: {baseline_score:.4f}")

Baseline accuracy score: 0.9152


In [8]:
def get_oof_vandalism_score(predictor, target, cv, scorer_args=None):
    """Attach an out-of-fold ``vandalism_score`` column to a copy of ``predictor``.

    For every split produced by ``cv``, a fresh ``VandalismScorer`` is fitted on
    the training rows and used to transform the validation rows; the
    ``vandalism_score`` values it produces are stored for those validation rows.

    Parameters
    ----------
    predictor : pandas.DataFrame
        Input rows. Not modified; a copy is returned.
    target : pandas.Series
        Labels, positionally aligned with ``predictor``.
    cv : object with ``split(X, y)``
        Yields ``(train_idx, val_idx)`` pairs of positional indices.
    scorer_args : dict, optional
        Keyword arguments forwarded to ``VandalismScorer``. ``None`` or an
        empty dict means no arguments.

    Returns
    -------
    pandas.DataFrame
        Copy of ``predictor`` with a ``vandalism_score`` column. Rows that never
        appear in a validation fold keep ``NaN``.
    """
    scorer_args = scorer_args or {}

    # The indices from cv.split are positional, so the scores are collected in
    # a positional buffer. Writing them through label-based .loc is only
    # correct when the frame has a default 0..n-1 index; on a filtered,
    # shuffled or otherwise re-indexed frame it raises or hits the wrong rows.
    oof_scores = np.full(len(predictor), np.nan)

    for train_idx, val_idx in cv.split(predictor, target):
        X_train, X_val = predictor.iloc[train_idx], predictor.iloc[val_idx]
        y_train = target.iloc[train_idx]

        # A new scorer per fold, so validation rows are never seen during fit.
        scorer = VandalismScorer(**scorer_args)
        scorer.fit(X_train, y_train)
        X_val_transformed = scorer.transform(X_val)

        oof_scores[val_idx] = X_val_transformed["vandalism_score"].values

    df_oof = predictor.copy()
    df_oof["vandalism_score"] = oof_scores
    return df_oof

In [23]:
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import cross_val_predict

In [28]:
def train(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
) -> tuple:
    """Tune CatBoost hyperparameters with Optuna on out-of-fold vandalism scores.

    The ``vandalism_score`` feature is first computed out-of-fold with
    ``get_oof_vandalism_score``. Each Optuna trial then samples a CatBoost
    configuration, obtains cross-validated predictions with ``cv`` and reports
    accuracy and F1; the metric named by ``scoring`` is the value maximized.

    Parameters
    ----------
    predictor : pandas.DataFrame
        Rows containing the model feature columns (all except
        ``vandalism_score``, which is added here).
    target : pandas.Series
        Labels aligned with ``predictor``.
    cv : StratifiedKFold
        Splitter used both for the out-of-fold score and for each trial.
    scoring : {"accuracy", "f1"}, default "accuracy"
        Metric returned by the objective and maximized by the study.

    Returns
    -------
    tuple
        ``(best_params, best_value)`` as reported by the Optuna study.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported metric names.
    """
    metric_labels = {"accuracy": "Accuracy", "f1": "F1"}
    if scoring not in metric_labels:
        # Fail before any expensive work instead of silently optimizing
        # a different metric than the caller asked for.
        raise ValueError(
            f"Unsupported scoring {scoring!r}; expected one of {sorted(metric_labels)}"
        )

    # Step 1: Precompute vandalism_score safely (each row scored by a scorer
    # that was fitted without that row).
    predictor_with_score = get_oof_vandalism_score(
        predictor,
        target,
        cv,
        scorer_args={"n_splits": 5, "random_state": 42}
    )

    nfeatures = [
        'user_edit_count', 'user_distinct_pages', 'user_warns', 'num_edits_5d_before',
        'is_person', 'current_minor', 'account_age', 'comment_empty',
        'is_IP', 'word_count_added', 'word_count_deleted', 'vandalism_score'
    ]
    # Loop-invariant: slice the model matrix once rather than in every trial.
    features = predictor_with_score[nfeatures]

    def objective(trial):
        # NOTE(review): the CatBoost docs describe `bagging_temperature` for the
        # Bayesian bootstrap and `subsample` for Bernoulli/Poisson/MVS; with the
        # default bootstrap_type one of the two is presumably inert - confirm.
        params = {
            "depth": trial.suggest_int("depth", 4, 8),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0, log=True),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "random_strength": trial.suggest_float("random_strength", 1e-3, 10.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "verbose": 0,
            "random_state": 42,
        }

        model = CatBoostClassifier(**params)
        preds = cross_val_predict(model, features, target, cv=cv)
        acc = accuracy_score(target, preds)
        f1 = f1_score(target, preds)

        print(f"Trial {trial.number}: Accuracy={acc:.4f}, F1={f1:.4f}, Params={params}")

        return acc if scoring == "accuracy" else f1

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    # Seed the sampler so the sequence of trials is repeatable across runs,
    # matching the fixed seeds already used for the CV splitter and the model.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=25)

    print("Optuna Optimization Results")
    print(f"Best {metric_labels[scoring]}:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    return study.best_params, study.best_value

In [13]:
# 5-fold stratified, shuffled splitter with a fixed seed (same settings as the
# baseline cell), plus the label column name wrapped in a list.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)
target = ["isvandalism"]

In [29]:
# Run the Optuna search on the full training frame with the default metric.
best_params, best_score = train(predictor=df, target=df["isvandalism"], cv=cv)

Trial 0: Accuracy=0.9216, F1=0.9193, Params={'depth': 4, 'l2_leaf_reg': 4.035046715075941, 'learning_rate': 0.07152156324850309, 'iterations': 606, 'subsample': 0.6626283533430583, 'random_strength': 0.13042781462724903, 'bagging_temperature': 0.011361335367422565, 'border_count': 96, 'verbose': 0, 'random_state': 42}
Trial 1: Accuracy=0.9116, F1=0.9099, Params={'depth': 4, 'l2_leaf_reg': 0.5219428452355841, 'learning_rate': 0.024773925767558224, 'iterations': 196, 'subsample': 0.720812702679729, 'random_strength': 0.0010479197011280372, 'bagging_temperature': 0.7362828460170855, 'border_count': 57, 'verbose': 0, 'random_state': 42}
Trial 2: Accuracy=0.9138, F1=0.9114, Params={'depth': 8, 'l2_leaf_reg': 2.822993241009302, 'learning_rate': 0.22819503487735865, 'iterations': 664, 'subsample': 0.6398902509658554, 'random_strength': 0.037446758967447494, 'bagging_temperature': 0.8766658736258968, 'border_count': 63, 'verbose': 0, 'random_state': 42}
Trial 3: Accuracy=0.9108, F1=0.9095, Par