In [41]:
from xgboost import XGBClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import joblib


In [42]:
# Read the CSV and discard the columns that are not used for modelling:
#   - 'id'
#   - 'UrlLength', 'HostnameLength', 'PathLength': highly correlated features
#     (the same set that is dropped for the Random Forest model)
df = (
    pd.read_csv('Models/Phishing_Legitimate_full.csv')
    .drop(columns=['id', 'UrlLength', 'HostnameLength', 'PathLength'])
)


In [43]:
# Candidate hyperparameter values for the XGBoost grid search.
# Every combination is a candidate: 3 * 3 * 3 * 3 * 2 * 2 = 324 settings.
param_grid_xgb = dict(
    # Number of boosting rounds
    n_estimators=[100, 200, 300],
    # Step size shrinkage
    learning_rate=[0.01, 0.1, 0.2],
    # Maximum tree depth
    max_depth=[3, 5, 10],
    # Minimum sum of instance weight needed in a child
    min_child_weight=[1, 3, 5],
    # Fraction of samples used for training
    subsample=[0.8, 1.0],
    # Fraction of features used per tree
    colsample_bytree=[0.8, 1.0],
)


In [44]:
# Separate the target column (y) from the remaining feature columns (X)
y = df['CLASS_LABEL']
X = df.drop(columns=['CLASS_LABEL'])

# Hold out 20% of the rows for testing; stratify on y and fix the seed so the
# split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Base XGBoost classifier whose hyperparameters the grid search will vary
xgb_estimator = XGBClassifier(eval_metric='logloss', random_state=42)

# Configure the grid search over param_grid_xgb (it is not fitted in this cell)
grid_search_xgb = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid_xgb,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',  # candidates are scored by accuracy
    n_jobs=-1,           # use all available CPU cores
    verbose=2,
)

