In [None]:
import re

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Location of the phishing dataset (replace with your actual dataset)
DATA_PATH = r'/Phishing2.csv'

# Read the CSV into a DataFrame; later steps read its 'url' and 'label' columns
data = pd.read_csv(DATA_PATH)

# Feature extraction from URLs
# One IPv4 octet in the range 0-255.
_IPV4_OCTET = r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
# Dotted-quad IPv4 literal that is not glued to surrounding word characters or
# dots, so strings such as "v1.2.3.4.5" or "a1.2.3.4" are not treated as IPs.
_IPV4_RE = re.compile(
    r'(?<![\w.])(?:' + _IPV4_OCTET + r'\.){3}' + _IPV4_OCTET + r'(?![\w.])'
)


def extract_features(url):
    """Derive simple lexical features from a URL string.

    Args:
        url: The URL text to analyse.

    Returns:
        dict with four integer entries, in this order:
            'url_length'        - number of characters in ``url``.
            'num_special_chars' - how many characters are one of ``? & = - _ .``.
            'has_ip'            - 1 if ``url`` contains a dotted-quad IPv4
                                  literal (each octet 0-255), otherwise 0.
            'num_subdomains'    - number of '.' characters in the whole string
                                  minus one (so it is -1 when there is no dot).

    Note:
        ``has_ip`` previously tested whether any '/'-separated segment consisted
        solely of digits, which missed real IP hosts (e.g. "192.168.1.1") and
        fired on numeric path segments (e.g. ".../123").  It now looks for an
        actual IPv4 literal anywhere in the string.
    """
    features = {}
    features['url_length'] = len(url)
    # Count each tracked punctuation character across the whole URL.
    features['num_special_chars'] = sum(url.count(symbol) for symbol in '?&=-_.')
    features['has_ip'] = int(_IPV4_RE.search(url) is not None)
    # Dots are counted over the full URL, not just the host portion.
    features['num_subdomains'] = url.count('.') - 1
    return features

# Apply feature extraction.
# The per-row dicts are collected into a list and turned into a DataFrame in a
# single call; wrapping every row in its own pd.Series inside .apply() yields
# the same columns but is much slower on large frames.
url_features = pd.DataFrame(
    data['url'].map(extract_features).tolist(),
    index=data.index,  # keep row alignment with `data` for the concat below
)

# Combine the extracted features with the original data (the raw 'url' text is dropped)
data = pd.concat([data.drop(columns='url'), url_features], axis=1)

# Map the 'label' column to integer class ids
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Target vector and feature matrix (every column except 'label')
y = data['label']
X = data.drop(columns='label')

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize: statistics are learned from the training rows only and then
# applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Base learners
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
nb = GaussianNB()

# Every ensemble shares the Random Forest + KNN pair and uses 'soft' voting
# (averaging of predicted class probabilities).
core_estimators = [('Random Forest', rf), ('KNN', knn)]

# Random Forest + KNN
voting_clf_2 = VotingClassifier(estimators=list(core_estimators), voting='soft')

# Random Forest + KNN + Decision Tree
voting_clf_3 = VotingClassifier(
    estimators=core_estimators + [('Decision Tree', dt)],
    voting='soft',
)

# Random Forest + KNN + Naive Bayes (three estimators, despite the "_4" suffix)
voting_clf_4 = VotingClassifier(
    estimators=core_estimators + [('Naive Bayes', nb)],
    voting='soft',
)

# Train the combined models, in the same order they were defined
for ensemble in (voting_clf_2, voting_clf_3, voting_clf_4):
    ensemble.fit(X_train, y_train)

# Predict on the held-out rows for each combination
y_pred_voting_2, y_pred_voting_3, y_pred_voting_4 = [
    ensemble.predict(X_test)
    for ensemble in (voting_clf_2, voting_clf_3, voting_clf_4)
]

# Function to evaluate model performance
def evaluate_model(model_name, y_test, y_pred):
    """Print evaluation metrics for one set of predictions.

    Args:
        model_name: Text shown on the "Model: ..." header line.
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.

    Returns:
        None. The header, confusion matrix, accuracy / precision / recall / F1
        (four decimals each) and the classification report are written to
        stdout.
    """
    # All scores are computed before anything is printed. The scorers are
    # called with their default arguments.
    # NOTE(review): the default averaging presumably expects binary labels - confirm.
    scores = [
        ('Accuracy', accuracy_score(y_test, y_pred)),
        ('Precision', precision_score(y_test, y_pred)),
        ('Recall', recall_score(y_test, y_pred)),
        ('F1 Score', f1_score(y_test, y_pred)),
    ]

    print(f"Model: {model_name}")
    print(confusion_matrix(y_test, y_pred))
    for metric_label, metric_value in scores:
        print(f"{metric_label}: {metric_value:.4f}")
    print(classification_report(y_test, y_pred))

# Report metrics for each voting ensemble: one two-model and two three-model combinations
for ensemble_title, ensemble_predictions in (
    ('Ensemble (Random Forest + KNN)', y_pred_voting_2),
    ('Ensemble (Random Forest + KNN + Decision Tree)', y_pred_voting_3),
    ('Ensemble (Random Forest + KNN + Naive Bayes)', y_pred_voting_4),
):
    evaluate_model(ensemble_title, y_test, ensemble_predictions)


Model: Ensemble (Random Forest + KNN)
[[988  26]
 [ 73 913]]
Accuracy: 0.9505
Precision: 0.9723
Recall: 0.9260
F1 Score: 0.9486
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1014
           1       0.97      0.93      0.95       986

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

Model: Ensemble (Random Forest + KNN + Decision Tree)
[[994  20]
 [ 77 909]]
Accuracy: 0.9515
Precision: 0.9785
Recall: 0.9219
F1 Score: 0.9493
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1014
           1       0.98      0.92      0.95       986

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

Model: Ensemble (Random Forest + KNN + Naive Bayes)
[[1006    8]
 [ 111  8

In [None]:
import pandas as pd

# Create a table with the provided results
def display_results():
    # Results for Stacking and XGBoost models
    results_data = {
        'Model': ['Stacking (Random Forest + XGBoost + KNN)', 'XGBoost'],
        'Accuracy': [0.9565, 0.9505],
        'Precision': [0.9668, 0.9733],
        'Recall': [0.9442, 0.9249],
        'F1 Score': [0.9554, 0.9485]
    }

    # Create a DataFrame
    results_df = pd.DataFrame(results_data)

    # Print the results in tabular form
    print(results_df)

# Print the comparison table; the figures are literals inside display_results(),
# not values computed by this notebook
display_results()


                                      Model  Accuracy  Precision  Recall  \
0  Stacking (Random Forest + XGBoost + KNN)    0.9565     0.9668  0.9442   
1                                   XGBoost    0.9505     0.9733  0.9249   

   F1 Score  
0    0.9554  
1    0.9485  
