In [None]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Put the parent of the current working directory on the import path so the
# project-local `feature_engineer` module can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import preprocessor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the training CSV. Set the TRAIN_CSV environment variable to
# point somewhere else; the fallback is `~/Downloads/train.csv`, resolved
# against the current user's home directory instead of a hard-coded
# machine-specific absolute path.
train_csv_path = os.environ.get(
    "TRAIN_CSV",
    os.path.join(os.path.expanduser("~"), "Downloads", "train.csv"),
)
df = pd.read_csv(train_csv_path)

# The return value is not captured, so `df` itself is what later cells use.
# NOTE(review): presumably `preprocessor` modifies `df` in place -- confirm
# against `feature_engineer.preprocessor`.
preprocessor(df)

In [12]:
def train(
    predictor: pd.DataFrame,
    target: pd.Series,
    cv: StratifiedKFold,
    log_transform: bool = False,
    cols_to_log: list[str] = None,
    scale: bool = False,
    remove_outliers: bool = False,
    n_trials: int = 50,
    seed: int = 42,
) -> None:
    """Tune a KNN classifier with Optuna and print the best CV accuracy.

    Each fold produced by ``cv`` is preprocessed once (the preprocessing does
    not depend on the hyper-parameters being searched), and every Optuna trial
    is then scored as the mean accuracy over those same prepared folds.

    Per-fold preprocessing, in order:

    1. ``log1p`` on ``cols_to_log`` (train and test), if ``log_transform`` is
       true and ``cols_to_log`` is non-empty; otherwise silently skipped.
    2. IQR outlier handling, if ``remove_outliers``: for every column with
       more than two distinct training values, bounds ``Q1 - 1.5*IQR`` and
       ``Q3 + 1.5*IQR`` are computed on the training fold; training rows
       outside the bounds of any such column are dropped, test values are
       clipped to the bounds.
    3. Standardisation, if ``scale``: a ``StandardScaler`` is fitted on the
       training fold and applied to both train and test.

    Parameters
    ----------
    predictor : feature matrix; all columns are used as model inputs.
    target : labels aligned row-wise with ``predictor``. It is flattened with
        ``to_numpy().ravel()``, so a single-column DataFrame works as well.
    cv : splitter exposing ``split(X, y)``. It is called exactly once, so all
        trials are evaluated on identical folds even if the splitter itself
        is not seeded.
    log_transform, cols_to_log, scale, remove_outliers : preprocessing
        switches described above.
    n_trials : number of Optuna trials to run.
    seed : seed for Optuna's TPE sampler so repeated runs give the same
        result. Pass ``None`` for Optuna's unseeded default behaviour.

    Raises
    ------
    ValueError
        If outlier removal leaves a training fold with no rows.

    Returns
    -------
    None. The best score and parameters are printed.
    """
    feature_cols = list(predictor.columns)

    # Preprocessing is independent of the KNN hyper-parameters, so build the
    # folds once here instead of repeating the work inside every trial.
    folds = []
    for train_index, test_index in cv.split(predictor, target):
        X_train = predictor.iloc[train_index].copy()
        X_test = predictor.iloc[test_index].copy()
        y_train = target.iloc[train_index]
        y_test = target.iloc[test_index].to_numpy().ravel()

        if log_transform and cols_to_log:
            for col in cols_to_log:
                X_train[col] = np.log1p(X_train[col])
                X_test[col] = np.log1p(X_test[col])

        if remove_outliers:
            keep = pd.Series(True, index=X_train.index)
            for col in feature_cols:
                # Binary / constant columns have no meaningful IQR.
                if X_train[col].nunique() <= 2:
                    continue

                q1 = X_train[col].quantile(0.25)
                q3 = X_train[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - 1.5 * iqr
                upper = q3 + 1.5 * iqr

                # Bounds come from the training fold only: test rows are kept
                # (clipped) so every test sample is still scored.
                X_test[col] = X_test[col].clip(lower=lower, upper=upper)
                keep &= (X_train[col] >= lower) & (X_train[col] <= upper)

            X_train = X_train.loc[keep]
            y_train = y_train.loc[keep]

        y_train = y_train.to_numpy().ravel()

        if scale:
            # Fit on the training fold only to avoid leaking test statistics.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        folds.append((X_train, y_train, X_test, y_test))

    # KNN cannot query more neighbours than it has training rows; this only
    # tightens the search range when a (possibly filtered) fold is small.
    smallest_train = min(len(X_tr) for X_tr, _, _, _ in folds)
    if smallest_train == 0:
        raise ValueError(
            "Outlier removal left at least one training fold with no rows."
        )
    max_neighbors = min(50, smallest_train)

    def objective(trial):
        """Mean accuracy over the prepared folds for one sampled KNN config."""
        params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, max_neighbors),
            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            # With KNeighborsClassifier's default p=2, "minkowski" is the same
            # distance as "euclidean"; kept so the search space is unchanged.
            "metric": trial.suggest_categorical(
                "metric", ["euclidean", "manhattan", "minkowski"]
            ),
        }

        scores = []
        for X_train, y_train, X_test, y_test in folds:
            model = KNeighborsClassifier(**params)
            model.fit(X_train, y_train)
            scores.append(accuracy_score(y_test, model.predict(X_test)))

        return np.mean(scores)

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    # TPESampler is Optuna's default single-objective sampler; constructing it
    # explicitly only adds the seed.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best score:", study.best_value)
    print("Best parameters:", study.best_params)

In [5]:
# Label column, kept as a one-element list so that `df[target]` selects a
# single-column DataFrame.
target = ["isvandalism"]

# Shuffled, seeded 5-fold stratified splitter shared by every `train` call.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

In [None]:
# Columns fed to the model in every experiment below.
features = [
    "user_edit_count",
    "user_distinct_pages",
    "user_warns",
    "num_recent_edits",
    "num_recent_reversions",
    "num_edits_5d_before",
    "account_age",
    "word_count_added",
    "word_count_deleted",
    "vandalism_score",
    "comment_empty",
    "is_IP",
    "current_minor",
    "is_person",
]

# Baseline experiments on the untransformed features: each entry is the label
# printed before the run and the preprocessing switches passed to `train`.
baseline_runs = [
    ("Original data:", {}),
    ("Originial Outliers Removed:", {"remove_outliers": True}),
    ("Original scaled:", {"scale": True}),
    (
        "Original Scaled and Outliers Removed:",
        {"scale": True, "remove_outliers": True},
    ),
]

for run_label, run_options in baseline_runs:
    print(run_label)
    train(df[features], df[target], cv, **run_options)

Original data:
Best score: 0.8610195995639988
Best parameters: {'n_neighbors': 13, 'weights': 'distance', 'metric': 'manhattan'}
Originial Outliers Removed:
Best score: 0.8364402483768851
Best parameters: {'n_neighbors': 10, 'weights': 'distance', 'metric': 'manhattan'}
Original scaled:
Best score: 0.8865425787564993
Best parameters: {'n_neighbors': 15, 'weights': 'distance', 'metric': 'manhattan'}
Original Scaled and Outliers Removed
Best score: 0.8996779019352872
Best parameters: {'n_neighbors': 23, 'weights': 'uniform', 'metric': 'manhattan'}


In [14]:
# Feature columns that get a log1p transform in the runs below.
cols_to_transform = [
    "user_edit_count",
    "user_distinct_pages",
    "user_warns",
    "num_recent_edits",
    "num_recent_reversions",
    "num_edits_5d_before",
    "account_age",
    "word_count_added",
    "word_count_deleted",
]

# Same four preprocessing combinations as the baseline experiments, this time
# with the log transform switched on for `cols_to_transform`.
log_runs = [
    ("Log-Transformed data:", {}),
    ("Log-Transformed Outliers Removed:", {"remove_outliers": True}),
    ("Log-Transformed scaled:", {"scale": True}),
    (
        "Log-Transformed Scaled and Outliers Removed:",
        {"scale": True, "remove_outliers": True},
    ),
]

for run_label, run_options in log_runs:
    print(run_label)
    train(
        df[features],
        df[target],
        cv,
        log_transform=True,
        cols_to_log=cols_to_transform,
        **run_options,
    )

Log-Transformed data:
Best score: 0.9094309054372947
Best parameters: {'n_neighbors': 26, 'weights': 'uniform', 'metric': 'manhattan'}
Log-Transformed Outliers Removed:
Best score: 0.9091163705020936
Best parameters: {'n_neighbors': 26, 'weights': 'distance', 'metric': 'manhattan'}
Log-Transformed scaled:
Best score: 0.9124984427145139
Best parameters: {'n_neighbors': 19, 'weights': 'distance', 'metric': 'manhattan'}
Log-Transformed Scaled and Outliers Removed:
Best score: 0.9118692568451928
Best parameters: {'n_neighbors': 21, 'weights': 'distance', 'metric': 'manhattan'}
