Import Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from xgboost import XGBClassifier

Load the Training Set

In [None]:
# Read the labelled training data from the working directory.
trainset = pd.read_csv(filepath_or_buffer="train.csv")

Remove Duplicate Rows

In [None]:
# Drop fully duplicated rows (first occurrence kept, original index labels
# preserved) and rebind the name instead of mutating in place.
trainset = trainset.drop_duplicates()

Splitting

In [None]:
# Target is the `ClassLabel` column; features are every other column.
y = trainset.loc[:, "ClassLabel"]
x = trainset.drop("ClassLabel", axis=1)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class balance identical in both splits; the fixed seed makes the split
# reproducible across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Use GPU

In [None]:

# Run on the GPU when CUDA is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)



Column Types

In [None]:

# Raw text column that is turned into sentence embeddings.
text_col = "URL"

# Numeric columns, grouped by the scaler applied to them in the preprocessor.
robust_cols = [
    "url_length",
    "url_entropy",
    "token_count",
    "path_length",
    "number_of_digits",
]
standard_cols = [
    "dot_count",
    "subdomain_count",
    "query_param_count",
    "domain_name_length",
    "percentage_numeric_chars",
]
minmax_cols = ["tld_length"]

# Columns that are passed through without scaling.
binary_cols = [
    "has_ip_address",
    "https_flag",
    "has_hyphen_in_domain",
    "tld_popularity",
    "suspicious_file_extension",
]

# Every scaled column, in robust -> standard -> min-max order.
numeric_cols = [*robust_cols, *standard_cols, *minmax_cols]


Data Preprocessing

In [None]:
# Sentence-embedding model shared by every call to `encode_urls`.
sbert_model = SentenceTransformer("all-mpnet-base-v2", device=device)


def encode_urls(urls):
    """Embed URL strings with the shared SBERT model.

    Args:
        urls: URL strings, passed straight to ``sbert_model.encode``.

    Returns:
        The encoder output cast to ``np.float16``.
    """
    vectors = sbert_model.encode(urls, show_progress_bar=True, batch_size=256)
    half_precision = vectors.astype(np.float16)
    return half_precision


# Training split: embed the URLs and wrap them in a frame that shares
# x_train's index so it can be concatenated column-wise later.
train_urls = x_train[text_col].tolist()
url_embeddings = encode_urls(train_urls)
embedding_cols = [f"url_emb_{i}" for i in range(url_embeddings.shape[1])]
df_train_embeddings = pd.DataFrame(
    data=url_embeddings,
    index=x_train.index,
    columns=embedding_cols,
)

# Validation split: same encoder, same column names, x_val's index.
val_urls = x_val[text_col].tolist()
val_embeddings = encode_urls(val_urls)
df_val_embeddings = pd.DataFrame(
    data=val_embeddings,
    index=x_val.index,
    columns=embedding_cols,
)


In [None]:
# Final feature matrices: embedding columns first, then the hand-crafted
# numeric and binary columns, aligned on the row index.
x_train_full = pd.concat(
    objs=[df_train_embeddings, x_train.loc[:, numeric_cols + binary_cols]],
    axis="columns",
)
x_val_full = pd.concat(
    objs=[df_val_embeddings, x_val.loc[:, numeric_cols + binary_cols]],
    axis="columns",
)

In [None]:
# Scale each numeric group with its own scaler. Columns not named in any group
# (the embedding and binary columns) are forwarded unchanged.
preprocessor = ColumnTransformer(
    [
        ("robust", RobustScaler(), robust_cols),
        ("standard", StandardScaler(), standard_cols),
        ("minmax", MinMaxScaler(), minmax_cols),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)


XGBoost and MLP Parameters for Modelling

In [None]:
# Pick the XGBoost tree method from `device` (set in the "Use GPU" cell) so the
# notebook still runs on CPU-only machines instead of failing inside XGBoost.
# On a CUDA machine the arguments are exactly the ones used before.
# NOTE(review): on xgboost >= 2.0 `gpu_hist` / `gpu_id` are deprecated in favour
# of `tree_method="hist", device="cuda"` — confirm the installed version before
# switching spellings.
xgb_device_params = (
    {"tree_method": "gpu_hist", "gpu_id": 0}
    if device == "cuda"
    else {"tree_method": "hist"}
)

# Base learner 1: scaled features -> gradient-boosted trees.
xgb_pipeline = Pipeline([
    ("scaler", preprocessor),
    ("xgb", XGBClassifier(
        n_estimators=300,
        learning_rate=0.015,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=0.1,
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
        **xgb_device_params,
    ))
])

# Base learner 2: scaled features -> PCA down to 128 components -> MLP.
mlp_pipeline = Pipeline([
    ("scaler", preprocessor),
    ("pca", PCA(n_components=128, random_state=42)),
    ("mlp", MLPClassifier(
        hidden_layer_sizes=(128, 64),
        max_iter=500,
        alpha=0.001,
        solver='adam',
        early_stopping=True,
        n_iter_no_change=20,
        random_state=42
    ))
])


Stacking

In [None]:
# Stack the two base pipelines; a small MLP combines their class probabilities.
# NOTE(review): with passthrough=True the final estimator also receives the
# original input columns, which are not scaled at that point (scaling happens
# inside each base pipeline) — confirm this is intended.
stack_model = StackingClassifier(
    estimators=[
        ("xgboost", xgb_pipeline),
        ("mlp_base", mlp_pipeline),
    ],
    final_estimator=MLPClassifier(
        hidden_layer_sizes=(64, 32),
        solver="adam",
        alpha=0.001,
        max_iter=500,
        early_stopping=True,
        n_iter_no_change=20,
        random_state=42,
    ),
    passthrough=True,
    stack_method="predict_proba",
    n_jobs=-1,
)

Hyperparameter Tuning

In [None]:
# Candidate values sampled by the randomized search. Keys follow sklearn's
# `<estimator>__<step>__<param>` nesting through the stack and its pipelines.
param_grid = {
    # XGBoost base learner
    "xgboost__xgb__n_estimators": [300, 400, 500],
    "xgboost__xgb__max_depth": [5, 7, 9],
    "xgboost__xgb__learning_rate": [0.01, 0.015, 0.02],
    "xgboost__xgb__subsample": [0.7, 0.8, 0.9],
    "xgboost__xgb__colsample_bytree": [0.7, 0.8, 0.9],
    "xgboost__xgb__reg_alpha": [0.0, 0.1, 0.5],
    # MLP base learner
    "mlp_base__mlp__hidden_layer_sizes": [
        (128, 64),
        (256, 128, 64),
        (512, 256, 128),
    ],
    # Meta-learner
    "final_estimator__hidden_layer_sizes": [(64, 32), (128, 64)],
}

# 15 random draws, 3-fold CV each, ranked by ROC AUC.
# NOTE(review): n_jobs=-1 here is nested with n_jobs=-1 in the stack and in
# XGBoost; on a single GPU this may oversubscribe the device — confirm.
random_search = RandomizedSearchCV(
    stack_model,
    param_grid,
    n_iter=15,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

Fit the Model and Review the AUROC Score

In [None]:
# Run the search on the training features; the best configuration is refit on
# the full training split and used by `random_search.predict_proba` below.
print("Fitting GPU-ready Stacking Model with extended hyperparameter tuning...")
random_search.fit(x_train_full, y_train)
print("Best hyperparameters:", random_search.best_params_)

# Score the held-out validation split using the second predict_proba column.
y_pred_proba = random_search.predict_proba(x_val_full)[:, 1]
auroc = roc_auc_score(y_val, y_pred_proba)
print(f" FINAL AUROC on validation set: {auroc:.5f} ")

Predict test set and save the result

In [None]:
# Load the unlabeled test data and embed its URLs with the same encoder and
# embedding column names used for training.
testset = pd.read_csv("test.csv")
test_urls = testset[text_col].tolist()
test_embeddings = encode_urls(test_urls)
df_test_embeddings = pd.DataFrame(
    data=test_embeddings,
    index=testset.index,
    columns=embedding_cols,
)

# Any binary flag absent from the test file is filled with 0.
for col in binary_cols:
    if col in testset.columns:
        continue
    testset[col] = 0

# Assemble the features, then force the exact training column order.
x_test_full = pd.concat(
    [df_test_embeddings, testset[numeric_cols + binary_cols]],
    axis=1,
).reindex(columns=x_train_full.columns, fill_value=0)

# Second predict_proba column, thresholded at 0.5 into hard labels.
probs = random_search.predict_proba(x_test_full)[:, 1]
class_labels = (probs >= 0.5).astype(int)

# Use the file's own ID column when present, otherwise number rows from 1.
submission = pd.DataFrame(
    {
        "ID": testset.get("ID", np.arange(1, len(probs) + 1)),
        "class_label": class_labels,
    }
)

submission.to_csv("Final Result.csv", index=False)
print("Submission saved.")
