## 1. Loading Dataset and Dependencies

In [None]:
pip install lime

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from lime.lime_tabular import LimeTabularExplainer


In [None]:
# Read the phishing-URL CSV (path is relative to the notebook's working
# directory) into a DataFrame.
file_path = 'PhiUSIIL_Phishing_URL_Dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the column dtypes, non-null counts and memory usage.
df.info()

## 2. Data Preprocessing

### 2.1 Drop Irrelevant Columns

In [None]:
# Keep only the numeric columns, then discard exact duplicate rows.
# Per the original note, the needed categorical information is already
# encoded numerically, and the text columns
# ['FILENAME', 'URL', 'DOMAIN', 'TLD', 'TITLE'] are not used for training.
df = (
    df.select_dtypes(include=['number'])
    .copy()
    .drop_duplicates()
)

# (rows, columns) remaining after the clean-up
df.shape

### 2.2 Train-Test Split

In [None]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 split, stratified on the target so both parts keep the same class
# ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the resulting shapes and preview the training features.
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

print("\nSample of X_train:\n")
print(X_train.head())

### 2.3 Feature Scaling

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames that keep the original
# column labels and row index.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

print(f"Shape after scaling: X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}")

print("\nSample of X_train_scaled:\n")
X_train_scaled.head()

### 2.4 Compare SMOTE, ADASYN, BorderlineSMOTE

### 2.5 Apply SMOTE

## 3. Feature Selection

In [None]:
# Build the SMOTE-resampled training set used by this cell and by every
# model-training cell below. No earlier cell in this notebook defines
# X_train_resampled / y_train_resampled, so without this step a fresh
# "Restart & Run All" fails here with NameError. Only the training split is
# resampled; the test split is left untouched.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(
    X_train_scaled, y_train
)

# Number of features to keep (change to select more or fewer)
k = 20
selector = SelectKBest(score_func=f_classif, k=k)

# Fit the selector on the resampled training data ...
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)

# ... and apply the same column selection to the test data
X_test_selected = selector.transform(X_test_scaled)

# Positions and ANOVA F-scores of the retained features
selected_indices = selector.get_support(indices=True)
scores = selector.scores_

# Map the retained positions back to the original column names
feature_names = X_train_scaled.columns
selected_feature_names = [feature_names[i] for i in selected_indices]

print(f"Top {k} selected features:\n")
print(selected_feature_names)

print("\nShape of selected training set:", X_train_selected.shape)
print("Shape of selected testing set:", X_test_selected.shape)

# Plot the F-scores of the selected features.
# 19.2 x 10.8 inches is 1920x1080 pixels at 100 DPI.
fig, ax = plt.subplots(figsize=(19.2, 10.8))
ax.barh(
    selected_feature_names,
    scores[selected_indices],
    color='skyblue',
    edgecolor='black',
)

ax.set_xlabel("F-score", fontsize=16)
ax.set_title(f"Top {k} Features via SelectKBest (ANOVA F-test)", fontsize=20)
# First selected feature at the top of the chart
ax.invert_yaxis()

# Vertical gridlines for easier reading of bar lengths
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.tick_params(axis='both', labelsize=14)

fig.tight_layout()
plt.show()

## 4. Model Training

In [None]:
# Registry mapping a display name to the best fitted estimator of each model
models = dict()

### 4.1 Decision Tree Classifier with GridSearchCV

In [None]:
# Decision Tree classifier tuned with GridSearchCV.

# Hyper-parameter grid (currently a single candidate combination)
dt_params = {
    'max_depth': [10],
    'criterion': ['gini']
}

# 3-fold cross-validated search scored on F1, using all available cores
dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
dt.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['Decision Tree'] = dt.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Decision Tree training complete and model saved.")

### 4.2 Random Forest Classifier with GridSearchCV

In [None]:
# Random Forest classifier tuned with GridSearchCV.

# Hyper-parameter grid (currently a single candidate combination)
rf_params = {'n_estimators': [100], 'max_depth': [20]}

# 3-fold cross-validated search scored on F1, using all available cores
rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
rf.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['Random Forest'] = rf.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Random Forest training complete and model saved.")

### 4.3 Logistic Regression with GridSearchCV

In [None]:
# Logistic Regression tuned with GridSearchCV.

# Hyper-parameter grid (currently a single regularisation strength)
lr_params = {'C': [1]}

# L2-penalised model; 3-fold cross-validated search scored on F1
lr = GridSearchCV(
    LogisticRegression(penalty='l2', max_iter=1000, random_state=42),
    lr_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
lr.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['Logistic Regression'] = lr.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Logistic Regression training complete and model saved.")

### 4.4 KNN with GridSearchCV

In [None]:
# K-Nearest Neighbors classifier tuned with GridSearchCV.

# Hyper-parameter grid (currently a single candidate combination)
knn_params = {
    'n_neighbors': [5],
    'weights': ['distance']
}

# 3-fold cross-validated search scored on F1, using all available cores
knn = GridSearchCV(
    KNeighborsClassifier(),
    knn_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
knn.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['KNN'] = knn.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ K-Nearest Neighbors training complete and model saved.")

### 4.5 Gradient Boosting Classifier with GridSearchCV

In [None]:
# Gradient Boosting classifier tuned with GridSearchCV.

# Hyper-parameter grid: two candidates that differ only in tree depth
gbc_params = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [3, 5]
}

# 3-fold cross-validated search scored on F1, using all available cores
gbc = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gbc_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
gbc.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['Gradient Boosting'] = gbc.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Gradient Boosting training complete and model saved.")


### 4.6 Support Vector Machine (SVM) with GridSearchCV

In [None]:
# Support Vector Machine tuned with GridSearchCV.

# Hyper-parameter grid (currently a single candidate combination)
svm_params = {
    'C': [1],
    'kernel': ['rbf'],
    'gamma': ['scale']
}

# probability=True makes predict_proba available on the fitted model, which
# the LIME section passes as its prediction function.
svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    svm_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
svm.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['SVM'] = svm.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Support Vector Machine training complete and model saved.")


### 4.7 XGBoost with GridSearchCV

In [None]:
# XGBoost classifier tuned with GridSearchCV.
# GridSearchCV and XGBClassifier are already imported in the notebook's
# import cell, so they are not re-imported here.

# Hyper-parameter grid: two candidates that differ only in row subsampling
xgb_params = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'subsample': [0.8, 1]
}

# 3-fold cross-validated search scored on F1, using all available cores
xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid=xgb_params,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit on the selected features of the resampled training set
xgb.fit(X_train_selected, y_train_resampled)

# Store the best estimator found by the search
models['XGBoost'] = xgb.best_estimator_

# The status glyph was mis-encoded in the original message; restored here.
print("✅ XGBoost training complete and model saved.")

### 4.8 Stacking Classifier

In [None]:
# Stacking ensemble built from three of the previously tuned models.

# Base learners: the tuned Decision Tree, KNN and SVM estimators
base_learners = [
    ('decision_tree', models['Decision Tree']),
    ('knn', models['KNN']),
    ('svm', models['SVM'])
]

# Meta-learner that combines the base learners' outputs
meta_learner = LogisticRegression(random_state=42, max_iter=5000)

# passthrough=True also feeds the original features to the meta-learner,
# alongside the base learners' predictions.
stacking = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=-1,
    passthrough=True
)

# Fit on the selected features of the resampled training set
stacking.fit(X_train_selected, y_train_resampled)

# Register the fitted ensemble next to the individual models
models['Stacking'] = stacking

# The status glyph was mis-encoded in the original message; restored here.
print("✅ Stacking Classifier training complete and model saved.")

## 5. Model Results and Performance Evaluation


In [None]:
# Evaluate every stored model: accuracy on the (resampled) training set and
# the test set, a classification report, AUC-ROC and a confusion matrix.
# The emoji in the printed labels were mis-encoded in the original and are
# restored here.
for name, model in models.items():
    print(f"\n🔍 Model: {name}")

    # Hard class predictions on both sets
    y_train_pred = model.predict(X_train_selected)
    y_test_pred = model.predict(X_test_selected)

    # Continuous scores for AUC: positive-class probability when available,
    # otherwise the decision-function margin
    if hasattr(model, "predict_proba"):
        y_test_proba = model.predict_proba(X_test_selected)[:, 1]
    else:
        y_test_proba = model.decision_function(X_test_selected)

    # Accuracy
    print(f"✅ Training Accuracy: {accuracy_score(y_train_resampled, y_train_pred):.4f}")
    print(f"✅ Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")

    # Classification report
    print("\n📋 Classification Report:")
    print(classification_report(y_test, y_test_pred))

    # AUC-ROC
    roc_auc = roc_auc_score(y_test, y_test_proba)
    print(f"📈 AUC-ROC Score: {roc_auc:.4f}")

    # Confusion matrix heatmap on an explicit figure/axes pair
    cm = confusion_matrix(y_test, y_test_pred)
    fig, ax = plt.subplots(figsize=(8, 6), dpi=200)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"Confusion Matrix: {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()
    # Release the figure so one per model does not accumulate in memory
    plt.close(fig)

    print(" ")


## 6. LIME

In [None]:
# pandas, numpy, matplotlib and LimeTabularExplainer are already imported in
# the notebook's import cell; only the display helpers are new here.
from IPython.display import display, HTML

# Wrap the selected-feature matrices in DataFrames labelled with the real
# column names chosen by SelectKBest, so the LIME output refers to actual
# dataset features instead of positional "feature_i" placeholders.
# np.asarray keeps this cell safe to re-run after the variables have already
# been converted to DataFrames.
X_train_selected = pd.DataFrame(np.asarray(X_train_selected), columns=selected_feature_names)
X_test_selected = pd.DataFrame(np.asarray(X_test_selected), columns=selected_feature_names)

# 1. Initialize the LIME explainer (seeded so its sampling is repeatable)
explainer = LimeTabularExplainer(
    training_data=X_train_selected.values,
    feature_names=X_train_selected.columns.tolist(),
    class_names=['Non-Phishing', 'Phishing'],
    mode='classification',
    random_state=42
)

# 2. Pick a test sample with a seeded generator so a re-run explains the
#    same row. Rows of X_test_selected are in the same order as y_test, so
#    one positional index addresses both.
rng = np.random.default_rng(42)
sample_idx = int(rng.integers(0, X_test_selected.shape[0]))
sample = X_test_selected.iloc[sample_idx]
true_label = y_test.iloc[sample_idx]

print(f"🔎 Sample index: {sample_idx}  |  True Label: {true_label}")

# Per-model explanation tables, combined into one table at the end
all_explanations = []

# 3. Explain the same sample with every trained model
for model_name, model in models.items():
    # LIME needs class probabilities; skip any model that cannot provide them
    if not hasattr(model, "predict_proba"):
        print(f"\nSkipping {model_name}: model has no predict_proba.")
        continue

    print(f"\n🔵 Explaining model: {model_name}")

    exp = explainer.explain_instance(
        data_row=sample.values,
        predict_fn=model.predict_proba
    )

    # Render the explanation at 19.2 x 10.8 inches
    fig = exp.as_pyplot_figure()
    fig.set_size_inches(19.2, 10.8)
    fig.gca().set_title(f"LIME Explanation for {model_name}", fontsize=20)
    fig.tight_layout()
    plt.show()
    # Release the figure so one per model does not accumulate in memory
    plt.close(fig)

    # Tabulate the (feature condition, weight) pairs for this model
    explanation_df = pd.DataFrame(exp.as_list(), columns=["Feature", "Weight"])
    explanation_df["Model"] = model_name
    explanation_df["Sample Index"] = sample_idx
    explanation_df["True Label"] = true_label

    # Display the explanation table
    display(HTML(f"<h3>LIME Explanation Table for <em>{model_name}</em></h3>"))
    display(explanation_df)

    # Store for the combined view
    all_explanations.append(explanation_df)

# Combine all per-model explanations into one table
combined_explanations_df = pd.concat(all_explanations, ignore_index=True)

# Display combined table
display(HTML("<h2>Combined LIME Explanation Table</h2>"))
display(combined_explanations_df)

# Optional: Export to CSV
# combined_explanations_df.to_csv("lime_explanations.csv", index=False)