In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
df = pd.read_csv("Dataset.csv")
print(df.head())

   Type  url_length  number_of_dots_in_url  having_repeated_digits_in_url  \
0     0          37                      2                              0   
1     1          70                      5                              0   
2     0          42                      2                              0   
3     0          46                      2                              0   
4     0          51                      3                              0   

   number_of_digits_in_url  number_of_special_char_in_url  \
0                        0                              8   
1                        0                             12   
2                        6                              8   
3                        0                              7   
4                        0                              9   

   number_of_hyphens_in_url  number_of_underline_in_url  \
0                         0                           0   
1                         0                         

In [4]:
target_column = 'Type'
features = df.columns.drop(target_column)

y = df[target_column]
X = df[features]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
def run_algos(X_train, y_train, X_test, y_test):
    accuracies = {}
    models = {
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42),
        'CATBoost': CatBoostClassifier(random_state=42, verbose=False),
    }

    best_model = None
    best_accuracy = 0

    # Loop through models
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_test_pred)
        accuracies[name] = accuracy
        print(f"{name} Test Accuracy: {accuracy}")
        class_report = classification_report(y_test, y_test_pred, output_dict=True)
        # Print the accuracy for each class
        for label, metrics in class_report.items():
            print(f'Accuracy for class {label}: {metrics}')
        print("")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    return accuracies, best_model

accuracies, best_model = run_algos(X_train, y_train, X_test, y_test)



Decision Tree Test Accuracy: 0.9537810042347248
Accuracy for class 0: {'precision': 0.9538917254550415, 'recall': 0.956650900988938, 'f1-score': 0.9552693208430914, 'support': 25583.0}
Accuracy for class 1: {'precision': 0.9536623072744745, 'recall': 0.9507227058774524, 'f1-score': 0.9521902377972465, 'support': 24007.0}
Accuracy for class accuracy: 0.9537810042347248
Accuracy for class macro avg: {'precision': 0.9537770163647581, 'recall': 0.9536868034331952, 'f1-score': 0.9537297793201689, 'support': 49590.0}
Accuracy for class weighted avg: {'precision': 0.953780661888579, 'recall': 0.9537810042347248, 'f1-score': 0.9537787068749205, 'support': 49590.0}

Random Forest Test Accuracy: 0.9672312966323856
Accuracy for class 0: {'precision': 0.9635477130253076, 'recall': 0.9733025837470195, 'f1-score': 0.9684005833738454, 'support': 25583.0}
Accuracy for class 1: {'precision': 0.9712396833417551, 'recall': 0.9607614445786645, 'f1-score': 0.96597214951314, 'support': 24007.0}
Accuracy for

In [5]:
model_comparison = pd.DataFrame({
    'MODEL': list(accuracies.keys()),
    'ACCURACY': list(accuracies.values())
})

# Sorting the DataFrame by accuracy in descending order
sorted_model = model_comparison.sort_values(by='ACCURACY', ascending=False)
print(sorted_model)



           MODEL  ACCURACY
1  Random Forest  0.967231
0  Decision Tree  0.953781
3       CATBoost  0.917463
2        XGBoost  0.911172


In [6]:
#Save the sorted DataFrame to a file named Phishing_website.pkl using joblib.
import joblib
joblib.dump(best_model, 'Phishing_best.pkl')

['Phishing_best.pkl']

In [7]:
print(best_model)

RandomForestClassifier(random_state=42)


In [8]:
!cd

F:\project\phising detection
