## 1. Preparing Environment

In [None]:
pip install lime
# This command installs the LIME (Local Interpretable Model-agnostic Explanations) library for Python, which is used for explaining the predictions of machine learning models.

In [None]:
pip install imbalanced-learn
# This command installs the imbalanced-learn library, which provides tools to handle imbalanced datasets in machine learning tasks.

In [None]:
pip install xgboost
# This command installs the XGBoost library, which is an optimized distributed gradient boosting library designed to be highly efficient, flexible, and portable for machine learning tasks.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from lime.lime_tabular import LimeTabularExplainer


### Loading Dataset

In [None]:
# Location of the phishing-URL CSV file; change this to wherever your copy lives.
file_path = r'/PhiUSIIL_Phishing_URL_Dataset.csv'

# Read the CSV into a pandas DataFrame.
df = pd.read_csv(file_path)

# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

## 2. Dataset Preprocessing

### Drop Irrelevant Columns

In [None]:
# Keep only the numeric columns, then drop exact duplicate rows.
# Every non-numeric column is discarded here; presumably this is how the
# ['FILENAME', 'URL', 'DOMAIN', 'TLD', 'TITLE'] columns are removed, since they
# are not needed for model training and cause issues with the model.
df = (
    df.select_dtypes(include=['number'])
      .copy()
      .drop_duplicates()
)

# Show the DataFrame shape after filtering and de-duplication.
df.shape

### Train-Test Split

In [None]:
# Separate target and features by position: the last column is the target,
# every other column is a feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class distribution, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions.
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

# Preview the first few training rows.
print("\nSample of X_train:\n")
print(X_train.head())

### Feature Scaling

In [None]:
# Standardize features (zero mean, unit variance).
scaler = StandardScaler()

# The scaler is fitted on the training data only; the test data is transformed
# with the statistics learned from training. The resulting arrays are wrapped
# back into DataFrames that keep the original column names and row index.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print(f"Shape after scaling: X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}")

# Preview the first few rows of the scaled training data.
print("\nSample of X_train_scaled:\n")
X_train_scaled.head()

### Compare SMOTE, ADASYN, BorderlineSMOTE

In [None]:
def _resample_and_report(sampler, heading):
    """Resample the scaled training data with `sampler` and print the class counts.

    Prints `heading` followed by the value counts of the resampled labels and
    returns the resampled (features, labels) pair.
    """
    X_res, y_res = sampler.fit_resample(X_train_scaled, y_train)
    print(heading)
    print(y_res.value_counts())
    return X_res, y_res


# Class counts in the training set before any oversampling.
print("Original class distribution in training set:")
print(y_train.value_counts())

# Run the three oversamplers on the same scaled training data so their
# resulting class counts can be compared side by side.
X_smote, y_smote = _resample_and_report(SMOTE(random_state=42), "\nAfter SMOTE:")
X_adasyn, y_adasyn = _resample_and_report(ADASYN(random_state=42), "\nAfter ADASYN:")
X_bsmote, y_bsmote = _resample_and_report(
    BorderlineSMOTE(random_state=42, kind='borderline-1'),
    "\nAfter BorderlineSMOTE:",
)

### Apply SMOTE

In [None]:
# Oversample the scaled training data with SMOTE; the test set is left untouched.
train_sampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = train_sampler.fit_resample(X_train_scaled, y_train)

# Report the new training-set size and the per-class counts after resampling.
print(f"Shape after SMOTE resampling: {X_train_resampled.shape}")
print("\nClass distribution after SMOTE:")
print(y_train_resampled.value_counts())

### Dataset Feature Selection

In [None]:
# Univariate feature selection with the ANOVA F-test (SelectKBest + f_classif).
# SelectKBest, f_classif, pd and plt all come from the notebook's import cell.

# Number of features to keep; change this to select more or fewer.
k = 20
selector = SelectKBest(score_func=f_classif, k=k)

# Fit the selector on the resampled training data only, then apply the same
# column selection to the scaled test data.
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = selector.transform(X_test_scaled)

# Column positions of the kept features and the F-score of every input feature.
selected_indices = selector.get_support(indices=True)
scores = selector.scores_

# Map the kept positions back to the original feature names. This list stays in
# column order so that it lines up with the columns of X_train_selected.
feature_names = X_train_scaled.columns
selected_feature_names = [feature_names[i] for i in selected_indices]

print(f"Top {k} selected features:\n")
print(selected_feature_names)

print("\nShape of selected training set:", X_train_selected.shape)
print("Shape of selected testing set:", X_test_selected.shape)

# Plot scores
# --------------------------------------------------
# For the chart only, order the kept features from highest to lowest F-score so
# the "top k" ranking is readable at a glance.
selected_scores = pd.Series(
    scores[selected_indices], index=selected_feature_names
).sort_values(ascending=False)

# 19.2 x 10.8 inch figure (1920x1080 pixels at 100 DPI).
fig, ax = plt.subplots(figsize=(19.2, 10.8))
ax.barh(selected_scores.index, selected_scores.values, color='skyblue', edgecolor='black')

ax.set_xlabel("F-score", fontsize=16)
ax.set_title(f"Top {k} Features via SelectKBest (ANOVA F-test)", fontsize=20)
ax.invert_yaxis()  # put the first (highest-scoring) bar at the top

# Vertical gridlines make bar lengths easier to compare.
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.tick_params(axis='both', labelsize=14)

fig.tight_layout()
plt.show()

## 3. Training

In [None]:
# Registry mapping a model's display name to its best fitted estimator;
# each training cell below adds one entry.
models = dict()

### 4.1 Decision Tree Classifier with GridSearchCV

In [None]:
# Decision Tree: hyper-parameter search with GridSearchCV

# Candidate hyper-parameters (the grid currently holds a single combination).
dt_params = {'max_depth': [10], 'criterion': ['gini']}

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dt_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# NOTE(review): the CV folds are drawn from the SMOTE-resampled training data,
# so synthetic samples can end up in validation folds and the CV F1 may be
# optimistic - confirm whether resampling should move inside an imblearn Pipeline.
dt.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['Decision Tree'] = dt.best_estimator_

print("Decision Tree training complete and model saved.")

### 4.2 Random Forest Classifier with GridSearchCV

In [None]:
# Random Forest: hyper-parameter search with GridSearchCV

# Candidate hyper-parameters (the grid currently holds a single combination).
rf_params = {
    'n_estimators': [100],
    'max_depth': [20],
}

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
rf.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['Random Forest'] = rf.best_estimator_

print("Random Forest training complete and model saved.")

### 4.3 Logistic Regression with GridSearchCV

In [None]:
# Logistic Regression: hyper-parameter search with GridSearchCV

# Regularization strengths to try (a single value at present).
lr_params = {'C': [1]}

# L2-penalized logistic regression; max_iter is raised to 1000 iterations.
lr_estimator = LogisticRegression(penalty='l2', max_iter=1000, random_state=42)

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
lr = GridSearchCV(
    estimator=lr_estimator,
    param_grid=lr_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
lr.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['Logistic Regression'] = lr.best_estimator_

print("Logistic Regression training complete and model saved.")

### 4.4 KNN with GridSearchCV

In [None]:
# K-Nearest Neighbors: hyper-parameter search with GridSearchCV

# Candidate hyper-parameters (a single combination): 5 neighbours,
# with votes weighted by distance.
knn_params = {'n_neighbors': [5], 'weights': ['distance']}

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
knn.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['KNN'] = knn.best_estimator_

print("K-Nearest Neighbors training complete and model saved.")

### 4.5 Gradient Boosting Classifier with GridSearchCV

In [None]:
# Gradient Boosting: hyper-parameter search with GridSearchCV

# Candidate hyper-parameters: only max_depth varies (3 vs 5), giving two combinations.
gbc_params = {
    'max_depth': [3, 5],
    'learning_rate': [0.1],
    'n_estimators': [100],
}

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
gbc = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gbc_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
gbc.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['Gradient Boosting'] = gbc.best_estimator_

print("Gradient Boosting training complete and model saved.")


### 4.6 Support Vector Machine (SVM) with GridSearchCV

In [None]:
# Support Vector Machine: hyper-parameter search with GridSearchCV

# Candidate hyper-parameters (a single combination): RBF kernel, C=1, gamma='scale'.
svm_params = {'kernel': ['rbf'], 'C': [1], 'gamma': ['scale']}

# probability=True enables predict_proba on the fitted SVC.
svm_estimator = SVC(probability=True, random_state=42)

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
svm = GridSearchCV(
    estimator=svm_estimator,
    param_grid=svm_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
svm.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['SVM'] = svm.best_estimator_

print("Support Vector Machine training complete and model saved.")


### 4.7 XGBoost with GridSearchCV

In [None]:
# XGBoost: hyper-parameter search with GridSearchCV
# (GridSearchCV and XGBClassifier come from the notebook's import cell.)

# Candidate hyper-parameters: only subsample varies (0.8 vs 1), giving two combinations.
xgb_params = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'subsample': [0.8, 1]
}

# 3-fold cross-validation scored on F1, run in parallel (n_jobs=-1).
xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Train on the selected features of the resampled training set.
xgb.fit(X_train_selected, y_train_resampled)

# Keep the refitted best estimator for later use.
models['XGBoost'] = xgb.best_estimator_

print("XGBoost training complete and model saved.")

### 4.8 Stacking Classifier

In [None]:
# Stacking ensemble built from three of the previously tuned models.

# Meta-learner that combines the base learners' outputs.
meta_learner = LogisticRegression(random_state=42, max_iter=5000)

# Base learners, taken from the `models` registry (simpler / diverse models).
base_learners = [
    (label, models[registry_key])
    for label, registry_key in (
        ('decision_tree', 'Decision Tree'),
        ('knn', 'KNN'),
        ('svm', 'SVM'),
    )
]

# passthrough=True also feeds the original features to the meta-learner,
# alongside the base learners' predictions; cv=3 sets the internal folds.
stacking = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=True,
    cv=3,
    n_jobs=-1,
)

# Train on the selected features of the resampled training set.
stacking.fit(X_train_selected, y_train_resampled)

# Register the fitted ensemble alongside the individual models.
models['Stacking'] = stacking

print("Stacking Classifier training complete and model saved.")