In [1]:
#### import packages ####

# os stuff
import sys
import os

#pandas and numpy
import pandas as pd
import numpy as np

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#sklearn packages
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate, cross_val_score
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [3]:
# Mount Google Drive at /content/drive; later cells read the project code and
# the training CSV from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Make the project-local `feature_engineer` module importable.
# Two candidate locations are registered: the project checkout on the mounted
# Drive, and the parent of the current working directory.
drive_project_dir = '/content/drive/MyDrive/Erdos/Project/summer-2025-hoax-detection/'
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

for candidate_dir in (drive_project_dir, project_root):
    # Guard every entry so that re-running this cell does not keep appending
    # duplicate paths to sys.path.
    if candidate_dir not in sys.path:
        sys.path.append(candidate_dir)

from feature_engineer import (
    VandalismScorer,
    is_IP,
    account_age,
    comment_empty,
    word_count,
)

In [17]:
#LightGBM
import lightgbm
from lightgbm import LGBMClassifier

In [18]:
#optuna
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 395.9/395.9 kB 7.8 MB/s eta 0:00:00
Downloading alembic-1.16.2-py3-none-any.whl (242 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.7/242.7 kB 10.1 MB/s eta 0:00:00
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [8]:
# read the csv_file; the file I used was Zihao's train file with
# Edit counts and is_person categorization uploaded June 18 on Slack
df = pd.read_csv("/content/drive/MyDrive/Erdos/Project/summer-2025-hoax-detection/Data/train.csv")

# Removing bad requests: drop every row whose added or deleted text is the
# literal placeholder "BAD REQUEST".
is_bad_request = df["added_lines"].eq("BAD REQUEST") | df["deleted_lines"].eq("BAD REQUEST")
# .copy() makes the filtered frame an independent object, so the column
# assignments below write to it directly instead of to a slice of the raw frame.
df = df.loc[~is_bad_request].copy()

# Adding the "comment_empty" feature (computed from the whole frame at once)
df["comment_empty"] = comment_empty(df)

# Adding the "account_age" feature (computed row by row)
df["account_age"] = df.apply(account_age, axis=1)

# Adding the "is_IP" feature (computed row by row)
df["is_IP"] = df.apply(is_IP, axis=1)

# Adding the "word_count_added"  and "word_count_deleted" features.
# word_count is applied per row and its result is unpacked into two columns,
# so it is expected to return an (added, deleted) pair for each row.
df["word_count_added"], df["word_count_deleted"] = zip(*df.apply(word_count, axis=1))

# Placeholder column; it is filled fold by fold in the cross-validation loop
# further down the notebook.
df["vandalism_score"] = 0.0

In [9]:
# Preview the first rows to check that the engineered columns were added.
df.head()

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,deleted_lines,isvandalism,num_edits_5d_before,is_person,comment_empty,account_age,is_IP,word_count_added,word_count_deleted,vandalism_score
0,change,329595189,,Nryan30,66,13,0,1259891940,219.78.124.42,,...,,False,1,0,True,0,False,131,1,0.0
1,change,232199357,/* Penis */,89.242.200.212,4,2,2,20080815230001,66.75.235.255,,...,"""Falcon gained media attention after appearing...",True,4,1,False,1,True,4,202,0.0
2,change,329877752,Reverted edits by [[Special:Contributions/71.2...,Chamal N,18697,0,2,1208605428,71.208.113.72,,...,""".nmbhgsdj;kfhds;akjfhds;fkjhsf;kjdshf;sdkjhfd...",False,3,0,False,595,False,34,50,0.0
3,change,253129486,,Animaldudeyay1009,3,1,2,1227241317,J.delanoy,,...,"""A '''kaleidoscope''' is a tube of [[mirror]]s...",True,2,0,True,0,False,94,836,0.0
4,change,394520551,Adding Persondata using [[Project:AWB|AWB]] (7...,RjwilmsiBot,1602950,1309238,0,1257977968,LobãoV,,...,",",False,0,1,False,356,False,34,0,0.0


In [15]:
# Columns fed to the classifier, and the label column it predicts.
features = [
    'user_edit_count',
    'user_distinct_pages',
    'user_warns',
    'num_edits_5d_before',
    'is_person',
    'current_minor',
    'vandalism_score',
]
target = 'isvandalism'

In [11]:
# Metrics reported by cross_validate; each plain metric function is wrapped
# into a scorer object under the name used for the `test_<name>` result key.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

In [13]:
# Shared splitter: 10 stratified, shuffled folds with a fixed seed, reused by
# every cross-validated step in this notebook.
cv = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [19]:
# Fill "vandalism_score" out-of-fold: for each CV split a fresh VandalismScorer
# is fitted on the training rows only and used to score the held-out rows.
for fit_rows, held_out_rows in cv.split(df, df["isvandalism"]):
    fold_fit_df = df.iloc[fit_rows]
    fold_held_out_df = df.iloc[held_out_rows]

    fold_scorer = VandalismScorer()
    fold_scorer.fit(
        fold_fit_df["added_lines"],
        fold_fit_df["deleted_lines"],
        fold_fit_df["isvandalism"],
    )

    # Only the held-out fold receives scores from this fold's scorer.
    df.loc[fold_held_out_df.index, "vandalism_score"] = fold_scorer.score(
        fold_held_out_df["added_lines"], fold_held_out_df["deleted_lines"]
    )

In [20]:
## Baseline model
# LightGBM with untuned hyperparameters, evaluated with the shared CV splitter
# and the metric dictionary defined above.
lgb = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    force_col_wise=True,
    verbosity=-1,
)
baseline_scores = cross_validate(lgb, df[features], df[target], scoring=scoring, cv=cv)

print("Baseline LightGBM Classifier Scores:")

# cross_validate stores each metric's per-fold scores under "test_<name>".
for metric in scoring:
    print(f"{metric}: {baseline_scores[f'test_{metric}'].mean():.4f}")

Baseline LightGBM Classifier Scores:
accuracy: 0.9114
precision: 0.9114
recall: 0.9064
f1_score: 0.9088


In [23]:
# Metric optimised by the study. It is defined once and used both as the
# cross-validation scoring string and in the printed label, so the label can
# never disagree with the metric that was actually optimised.
TUNING_METRIC = 'accuracy'


def objective(trial):
    """Optuna objective: mean cross-validated score of one LightGBM configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample learning_rate (log-uniform in [1e-3, 0.3]),
        num_leaves (15-256), n_estimators (100-1000) and max_depth (5-12).

    Returns
    -------
    float
        Mean of TUNING_METRIC over the folds of the shared `cv` splitter,
        computed on `df[features]` / `df[target]`.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 256),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 12)
    }

    model = LGBMClassifier(**params)
    fold_scores = cross_val_score(model, df[features], df[target], cv=cv, scoring=TUNING_METRIC)
    return fold_scores.mean()

#optuna tuning
optuna.logging.set_verbosity(optuna.logging.WARNING)
# TPE is Optuna's default sampler; creating it explicitly with a seed makes the
# sequence of sampled hyperparameters reproducible across runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)

print(f"Best {TUNING_METRIC}:", study.best_value)
print("Best hyperparameters:", study.best_params)

Best accuracy: 0.9122232920285643
Best hyperparameters: {'learning_rate': 0.00828031828213807, 'num_leaves': 74, 'n_estimators': 341, 'max_depth': 11}


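Optimizing $F_1$ score with $n=10$ trials (a separate run of the tuning cell above, with the $F_1$ score as the objective instead of accuracy), we get the following result: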


```
Best f1: 0.9083058881728933
Best hyperparameters: {'learning_rate': 0.013126799624277498, 'num_leaves': 92, 'n_estimators': 311, 'max_depth': 5}
```



Optimizing `accuracy_score` with $n=10$ trials, we get the following result:


```
Best accuracy: 0.9122232920285643
Best hyperparameters: {'learning_rate': 0.00828031828213807, 'num_leaves': 74, 'n_estimators': 341, 'max_depth': 11}
```



In [24]:
# Train/test split
# NOTE(review): the hold-out rows are drawn from the same `df` that the Optuna
# study cross-validated on, so the tuned hyperparameters have already seen
# them and the accuracy below is likely optimistic. For an unbiased estimate,
# carve out the hold-out set before tuning.
df_tt, df_ho = train_test_split(df, test_size=0.2, stratify=df[target], random_state=42)

# Final model: same fixed settings as in the tuning objective, plus every
# hyperparameter the study tuned. Unpacking best_params keeps this cell in
# sync with the search space if parameters are added or removed there.
final_model = LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    **study.best_params,
)

final_model.fit(df_tt[features], df_tt[target])
preds = final_model.predict(df_ho[features])

# Evaluate on the 20% hold-out split.
accuracy = accuracy_score(df_ho[target], preds)
confmat = confusion_matrix(df_ho[target], preds)
print("Final Accuracy:", accuracy)
print("Confusion Matrix:\n", confmat)

Final Accuracy: 0.9089657884388518
Confusion Matrix:
 [[2376  234]
 [ 229 2247]]
