In [19]:
#### import packages ####

#pandas and numpy
import pandas as pd
import numpy as np

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#sklearn packages
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate, cross_val_score
from sklearn.metrics import confusion_matrix, make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [18]:
#LightGBM
import lightgbm
from lightgbm import LGBMClassifier

In [20]:
#optuna
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [3]:
# read the csv_file; the file I used was Zihao's train file with
# Edit counts and is_person categorization uploaded June 18 on Slack
df_train = pd.read_csv("/content/drive/MyDrive/Erdos/Project/summer-2025-hoax-detection/Data/train.csv")

In [4]:
df_train.head()

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,num_recent_edits,num_recent_reversions,current_minor,current_timestamp,added_lines,previous_timestamp,deleted_lines,isvandalism,num_edits_5d_before,is_person
0,change,329595189,,Nryan30,66,13,0,1259891940,219.78.124.42,,...,0,0,False,1259894598,",==Leadership Through Emotion==,""Leadership is...",1259856305,,False,1,0
1,change,232199357,/* Penis */,89.242.200.212,4,2,2,20080815230001,66.75.235.255,,...,0,0,False,1218841201,It's fucking big.,1218816231,"""Falcon gained media attention after appearing...",True,4,1
2,change,329877752,Reverted edits by [[Special:Contributions/71.2...,Chamal N,18697,0,2,1208605428,71.208.113.72,,...,2,1,True,1260025124,"""Japanese modern drama in the early twentieth ...",1260025104,""".nmbhgsdj;kfhds;akjfhds;fkjhsf;kjdshf;sdkjhfd...",False,3,0
3,change,253129486,,Animaldudeyay1009,3,1,2,1227241317,J.delanoy,,...,0,0,False,1227241840,I LIKE CHEESE. CHEESE IS GOOD. CHOCOLATE MILK ...,1227241120,"""A '''kaleidoscope''' is a tube of [[mirror]]s...",True,2,0
4,change,394520551,Adding Persondata using [[Project:AWB|AWB]] (7...,RjwilmsiBot,1602950,1309238,0,1257977968,LobãoV,,...,0,0,True,1288757547,{{Persondata <!-- Metadata: see [[Wikipedia:Pe...,1285262356,",",False,0,1


In [5]:
#select features and target
features = ['user_edit_count','user_distinct_pages','user_warns','num_edits_5d_before','is_person','current_minor']
target = 'isvandalism'

In [10]:
scoring = {
    'accuracy' : make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall'   : make_scorer(recall_score),
    'f1_score' : make_scorer(f1_score)
}

In [6]:
df_tt, df_ho = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['isvandalism'])

In [12]:
cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

In [33]:
## Baseline model
lgb = LGBMClassifier(objective='binary',
    metric='binary_logloss',
    verbosity = -1,
    boosting_type='gbdt',
    force_col_wise=True)
baseline_scores = cross_validate(lgb, df_tt[features], df_tt[target], cv = cv, scoring = scoring)

print("Baseline LightGBM Classifier Scores:")

for metric in scoring.keys():
    mean_score = baseline_scores[f'test_{metric}'].mean()
    print(f"{metric}: {mean_score:.4f}")

Baseline LightGBM Classifier Scores:
accuracy: 0.8820
precision: 0.8544
recall: 0.9130
f1_score: 0.8827


In [30]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 256),
        'n_estimators': trial.suggest_int('n_estimators',100,1000),
        'max_depth': trial.suggest_int('max_depth',5,12)
    }

    model = LGBMClassifier(**params)
    score = cross_val_score(model, df_tt[features], df_tt[target], cv=cv, scoring ='f1').mean()

    return score

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Best f1:", study.best_value)
print("Best hyperparameters:", study.best_params)

Best f1: 0.8830615308879025
Best hyperparameters: {'learning_rate': 0.02768759247288796, 'num_leaves': 54, 'n_estimators': 435, 'max_depth': 5}


### Previous run:

```
Best f1: 0.8828958694981319
Best hyperparameters: {'learning_rate': 0.01944746097057199, 'num_leaves': 239, 'n_estimators': 869, 'max_depth': 5}
```

This run:

```
Best f1: 0.8830615308879025
Best hyperparameters: {'learning_rate': 0.02768759247288796, 'num_leaves': 54, 'n_estimators': 435, 'max_depth': 5
```



In [32]:
# Final Model with Best Hyperparameters
final_model = LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    verbosity = -1,
    boosting_type='gbdt',
    learning_rate = study.best_params["learning_rate"],
    num_leaves = study.best_params['num_leaves'],
    n_estimators = study.best_params['n_estimators'],
    max_depth = study.best_params['max_depth']
)

final_model.fit(df_tt[features],df_tt[target])
preds = final_model.predict(df_ho[features])

accuracy = accuracy_score(df_ho[target],preds)
confmat = confusion_matrix(df_ho[target],preds)
print(accuracy)

0.8775990584542958
