In [12]:
#### import packages ####

#pandas and numpy
import pandas as pd
import numpy as np

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#sklearn packages
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

#XGBoost
import xgboost
from xgboost import XGBClassifier

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [13]:
#LightGBM
import lightgbm
from lightgbm import LGBMClassifier

#install and implement Catboost
!pip install catboost
import catboost
from catboost import CatBoostClassifier



In [32]:
# Load the training data into a DataFrame.
# Provenance: Zihao's train file (uploaded to Slack on June 18), which
# carries the edit-count columns and the is_person categorization.
df_train = pd.read_csv(
    "/content/drive/MyDrive/Erdos/data_science_origin/data-science-summer-2025 copy/Project code/Data/train_data_with_editcounts_isperson.csv"
)

In [6]:
# Predictor columns passed to every model, listed one per line so that
# adding or dropping a feature shows up as a one-line diff.
features = [
    'user_edit_count',
    'user_distinct_pages',
    'user_warns',
    'num_edits_5d_before',
    'is_person',
    'current_minor',
]

# Name of the label column.
target = 'isvandalism'

In [10]:
# Split into a train portion (df_tt, 80%) and a holdout portion (df_ho, 20%).
# Stratifying on the label keeps the class ratio the same in both parts;
# the fixed seed makes the split reproducible.
df_tt, df_ho = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    stratify=df_train['isvandalism'],
)

In [27]:
## Make Base Models
# Hyperparameters below are hand-picked or library defaults; a later pass
# could tune each of them (e.g. with Optuna / cross-validation).
# NOTE(review): knn and svc appear to be fit on the raw feature columns (no
# scaling step is visible in this notebook). Both are scale-sensitive, so
# wrapping them in a Pipeline with StandardScaler is worth trying — confirm.
knn = KNeighborsClassifier(n_neighbors=5)

svc = LinearSVC(C=2600)

# 100 shallow trees; fixed seed for reproducibility.
rf = RandomForestClassifier(100,
                               max_depth = 4,
                               random_state=203)

# AdaBoost over depth-1 trees (decision stumps).
ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, random_state=42)

xgb = XGBClassifier()

# `verbose=-1` silences LightGBM's training log. The earlier `verbose_eval`
# keyword is not an LGBMClassifier parameter: it was forwarded to LightGBM as
# an unknown parameter and did not suppress any output.
lgb = LGBMClassifier(verbose=-1)

cat = CatBoostClassifier(logging_level='Silent')

In [28]:
# Majority-vote ensemble over all seven base models. Hard voting combines
# predicted class labels; switch to voting='soft' if probabilities are wanted.
# NOTE(review): soft voting needs predict_proba on every estimator, which
# LinearSVC (svc) does not provide — svc would have to be dropped or replaced
# first; confirm before switching.
# In practice this is probably too many models; dropping svc would also save
# runtime.
voting = VotingClassifier(
    estimators=[
        ('knn', knn),
        ('svc', svc),
        ('rf', rf),
        ('ab', ab),
        ('xgb', xgb),
        ('lgb', lgb),
        ('cat', cat),
    ],
    voting='hard',
)

In [31]:
# Fit each candidate on the train split (df_tt), score it on the holdout
# split (df_ho), print the score, and remember the best performer.
# NOTE(review): the "best" model is chosen on the same holdout set that is
# reported, so the winning score is slightly optimistic — consider selecting
# via cross-validation on df_tt instead.
best_name = None
best_acc = 0

for name, clf in [("rf_clf", rf),
                  ("svc_clf", svc),
                  ("knn_clf", knn),
                  ("ab_clf", ab),
                  ("xgb_clf", xgb),
                  ("lgb_clf", lgb),
                  ("cat_clf", cat),
                  ("voting_clf", voting)]:
    # fit the model on the train split
    clf.fit(df_tt[features], df_tt[target])

    # predict on the holdout split
    y_pred = clf.predict(df_ho[features])

    # accuracy on the holdout split (NOT the training data)
    acc = accuracy_score(df_ho[target], y_pred=y_pred)

    # The label previously read "training set accuracy", which misreported
    # what was measured: the score comes from df_ho.
    print(name, "holdout set accuracy", np.round(acc, 5))

    # update best; strict '>' keeps the earliest model on ties
    if acc > best_acc:
        best_acc = acc
        best_name = name

print(f"\nBest model: {best_name} with accuracy {np.round(best_acc, 5)}")

rf_clf training set accuracy 0.86387
svc_clf training set accuracy 0.79933
knn_clf training set accuracy 0.84916
ab_clf training set accuracy 0.86485
xgb_clf training set accuracy 0.87525
lgb_clf training set accuracy 0.87838
cat_clf training set accuracy 0.87819
voting_clf training set accuracy 0.87368

Best model: lgb_clf with accuracy 0.87838
