In [6]:
import sys
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import shap

#sklearn
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import cross_val_predict

# set paths for preprocessor
# NOTE(review): absolute path that looks like a Google Colab Drive mount; it
# is appended unconditionally and presumably only resolves when running on
# Colab with Drive mounted -- confirm it is still needed.
sys.path.append('/content/drive/MyDrive/Erdos/Project/summer-2025-hoax-detection/')

# Also make the parent of the current working directory importable (added only
# once), so that `feature_engineer` can be imported when the notebook is run
# from a sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import (
    VandalismScorer,
    is_IP,
    account_age,
    comment_empty,
    word_count,
    preprocessor
)
#optuna
!pip install optuna
import optuna

!pip install lightgbm
import lightgbm
from lightgbm import LGBMClassifier

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [3]:
#read the dataset
# Path is relative to the notebook's working directory.
df = pd.read_csv("../Data/train.csv")
# The return value is discarded, so `preprocessor` presumably adds its
# engineered columns to `df` in place -- TODO confirm against feature_engineer.
preprocessor(df)

In [7]:
#Baseline Score

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that keeps only a chosen set of columns.

    Parameters
    ----------
    features : column label(s) used to index the incoming frame in
        ``transform``.
    """

    def __init__(self, features):
        # Stored untouched under the same name as the constructor argument.
        self.features = features

    def fit(self, X, y=None):
        """Learn nothing; return the selector itself so it can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``features``."""
        wanted = self.features
        return X[wanted]


# Columns handed to the classifier once the scorer step has run.
nfeatures = [
    'user_edit_count',
    'user_distinct_pages',
    'user_warns',
    'num_edits_5d_before',
    'is_person',
    'current_minor',
    'account_age',
    'comment_empty',
    'is_IP',
    'word_count_added',
    'word_count_deleted',
    'vandalism_score',
]

# Scorer -> column selection -> LightGBM with library-default hyperparameters.
pipe = Pipeline(steps=[
    ('scorer', VandalismScorer(n_splits=5, random_state=42)),
    ('select', FeatureSelector(nfeatures)),
    (
        'model',
        LGBMClassifier(
            objective='binary',
            metric='binary_logloss',
            boosting_type='gbdt',
            force_col_wise=True,
            verbosity=-1,
            random_state=42,
        ),
    ),
])

# Shuffled, seeded 5-fold stratified split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Mean accuracy across the folds; copies keep `df` untouched by the pipeline.
baseline_score = cross_val_score(
    estimator=pipe,
    X=df.copy(),
    y=df['isvandalism'].copy(),
    scoring='accuracy',
    cv=cv,
).mean()

print(f"Baseline accuracy score: {baseline_score:.4f}")

Baseline accuracy score: 0.9140


In [9]:
def get_oof_vandalism_score(predictor, target, cv, scorer_args=None):
    """Attach an out-of-fold ``vandalism_score`` column to a copy of ``predictor``.

    For every split produced by ``cv`` a fresh ``VandalismScorer`` is fitted on
    the training rows only and then used to score the held-out rows, so no row
    is scored by a scorer that was fitted on it.

    Parameters
    ----------
    predictor : pd.DataFrame
        Input rows; any index is accepted (rows are addressed by position).
    target : pd.Series
        Labels aligned row-for-row with ``predictor``.
    cv : object with a ``split(X, y)`` method
        Yields ``(train_idx, val_idx)`` arrays of *positional* indices.
    scorer_args : dict, optional
        Keyword arguments forwarded to ``VandalismScorer``; defaults to none.

    Returns
    -------
    pd.DataFrame
        Copy of ``predictor`` with a ``vandalism_score`` column. Rows that never
        appear in a validation fold keep ``NaN``.
    """
    scorer_args = scorer_args or {}

    # cv.split() yields positional indices, so the scores are collected in a
    # positional buffer. Writing them back through label-based ``.loc`` would
    # fail or hit the wrong rows whenever the index is not 0..n-1.
    oof_scores = np.full(len(predictor), np.nan, dtype=float)

    for train_idx, val_idx in cv.split(predictor, target):
        X_train, X_val = predictor.iloc[train_idx], predictor.iloc[val_idx]
        y_train = target.iloc[train_idx]

        scorer = VandalismScorer(**scorer_args)
        scorer.fit(X_train, y_train)
        X_val_transformed = scorer.transform(X_val)

        oof_scores[val_idx] = np.asarray(
            X_val_transformed["vandalism_score"], dtype=float
        )

    df_oof = predictor.copy()
    df_oof["vandalism_score"] = oof_scores
    return df_oof

In [10]:
def train(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    scoring: str = "accuracy",
    n_trials: int = 25,
    seed: int = 42,
) -> tuple:
    """Tune LightGBM hyperparameters with Optuna and return the best setting.

    The ``vandalism_score`` feature is precomputed once, out-of-fold, and every
    trial then cross-validates an ``LGBMClassifier`` on the fixed feature set.

    Parameters
    ----------
    predictor : pd.DataFrame
        Frame containing the model feature columns listed below (apart from
        ``vandalism_score``, which is added here) plus whatever columns
        ``VandalismScorer`` needs.
    target : pd.Series
        Binary labels aligned with ``predictor``.
    cv : StratifiedKFold
        Splitter used both for the out-of-fold score and for each trial.
    scoring : str
        Currently unused: the objective always maximises accuracy. Kept so
        existing callers that pass it keep working.
    n_trials : int
        Number of Optuna trials to run (default 25).
    seed : int
        Seed for the Optuna sampler so repeated runs explore the same
        hyperparameters (default 42).

    Returns
    -------
    tuple
        ``(best_params, best_value)``: the best trial's suggested
        hyperparameters (dict) and its cross-validated accuracy (float).
    """
    # Step 1: Precompute vandalism_score safely (each row is scored by a
    # scorer that was not fitted on it).
    predictor_with_score = get_oof_vandalism_score(
        predictor,
        target,
        cv,
        scorer_args={"n_splits": 5, "random_state": 42}
    )

    nfeatures = [
        'user_edit_count', 'user_distinct_pages', 'user_warns', 'num_edits_5d_before',
        'is_person', 'current_minor', 'account_age', 'comment_empty',
        'is_IP', 'word_count_added', 'word_count_deleted', 'vandalism_score'
    ]
    # The feature matrix is identical for every trial, so slice it only once.
    feature_matrix = predictor_with_score[nfeatures]

    def objective(trial):
        """Cross-validate one sampled configuration and return its accuracy."""
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 15, 256),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            'random_state': 42
        }

        model = LGBMClassifier(**params)
        preds = cross_val_predict(model, feature_matrix, target, cv=cv)
        acc = accuracy_score(target, preds)
        f1 = f1_score(target, preds)  # reported for information only

        print(f"Trial {trial.number}: Accuracy={acc:.4f}, F1={f1:.4f}, Params={params}")

        return acc

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    # A seeded sampler makes the hyperparameter search itself reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Optuna Optimization Results")
    print("Best Accuracy:", study.best_value)
    print("Best hyperparameters:", study.best_params)

    return study.best_params, study.best_value

In [12]:
# Same splitter configuration as the baseline cell (5 shuffled stratified
# folds, seed 42), re-created here so the tuning cells can be run on their own.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
# Run the Optuna search; the whole frame is passed as predictor and `train`
# picks the model columns itself. Prints one line per trial.
best_params, best_score = train(df, df.isvandalism, cv)

Trial 0: Accuracy=0.9150, F1=0.9127, Params={'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.1243197447568749, 'num_leaves': 134, 'n_estimators': 671, 'max_depth': 9, 'random_state': 42}
Trial 1: Accuracy=0.9207, F1=0.9185, Params={'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.02620304020717626, 'num_leaves': 215, 'n_estimators': 863, 'max_depth': 5, 'random_state': 42}
Trial 2: Accuracy=0.9128, F1=0.9094, Params={'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.0016713397078052061, 'num_leaves': 210, 'n_estimators': 376, 'max_depth': 9, 'random_state': 42}
Trial 3: Accuracy=0.9184, F1=0.9162, Params={'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1, 'boosting_type': 'gbdt', 'learning_rate': 0.024734133899256164, 'num_leaves': 212, 'n_estimators': 530, 'max_depth': 12, 'random_s