<a href="https://colab.research.google.com/github/Eshaniqbal/PhishingDetection/blob/main/PhishingTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import os

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [39]:
# Read the URL dataset from disk
data = pd.read_csv('urldata.csv')  # Replace with your actual file path

# Swap the 'label' column for integer codes. The fitted encoder is kept around
# so that the original class names (label_encoder.classes_) remain available
# for reporting later on.
label_encoder = LabelEncoder()
data = data.assign(label=label_encoder.fit_transform(data['label']))

# Model input is the raw URL text; the target is the encoded label
X, y = data['url'], data['label']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [40]:
# TF-IDF features over the URL strings, with the vocabulary capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000)

# Tuple elements are evaluated left to right, so the vectorizer is fitted on the
# training URLs before the test URLs are transformed with that same vocabulary.
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),  # learn vocabulary + IDF weights from the training split only
    vectorizer.transform(X_test),       # reuse the fitted vocabulary; no refitting on test data
)


In [41]:
# Build the three classifiers up front; all share the same seed for repeatability.
# Random Forest: 50 trees, n_jobs=-1 so fitting uses every available core.
rf_model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)

# Support Vector Machine with a linear kernel (chosen for speed).
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples; LinearSVC may be a faster option for a dataset this size — confirm
# the effect on results before swapping.
svm_model = SVC(kernel='linear', random_state=42)

# Gradient Boosting limited to 50 boosting stages.
gb_model = GradientBoostingClassifier(n_estimators=50, random_state=42)

# Fit each model on the TF-IDF training matrix, in the same order as before,
# announcing completion after each one.
for done_message, estimator in (
    ("Random Forest model training complete.", rf_model),
    ("SVM model training complete.", svm_model),
    ("Gradient Boosting model training complete.", gb_model),
):
    estimator.fit(X_train_tfidf, y_train)
    print(done_message)


Random Forest model training complete.
SVM model training complete.
Gradient Boosting model training complete.


In [42]:
# Predict on the held-out TF-IDF matrix with each fitted model
rf_pred = rf_model.predict(X_test_tfidf)
svm_pred = svm_model.predict(X_test_tfidf)
gb_pred = gb_model.predict(X_test_tfidf)

# Overall accuracy of each model against the true test labels
rf_accuracy = accuracy_score(y_test, rf_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)
gb_accuracy = accuracy_score(y_test, gb_pred)

# Accuracy summary: one line per model
for accuracy_heading, accuracy_value in (
    ("Random Forest Accuracy:", rf_accuracy),
    ("SVM Accuracy:", svm_accuracy),
    ("Gradient Boosting Accuracy:", gb_accuracy),
):
    print(accuracy_heading, accuracy_value)

# Per-class precision/recall/F1, labelled with the original class names
# recovered from the label encoder
for report_heading, predictions in (
    ("Random Forest Classification Report:", rf_pred),
    ("SVM Classification Report:", svm_pred),
    ("Gradient Boosting Classification Report:", gb_pred),
):
    print(report_heading)
    print(classification_report(y_test, predictions, target_names=label_encoder.classes_))


Random Forest Accuracy: 0.9978230929850282
SVM Accuracy: 0.9976898129637034
Gradient Boosting Accuracy: 0.9971122662046292
Random Forest Classification Report:
              precision    recall  f1-score   support

      benign       1.00      1.00      1.00     69148
   malicious       1.00      1.00      1.00     20888

    accuracy                           1.00     90036
   macro avg       1.00      1.00      1.00     90036
weighted avg       1.00      1.00      1.00     90036

SVM Classification Report:
              precision    recall  f1-score   support

      benign       1.00      1.00      1.00     69148
   malicious       1.00      0.99      1.00     20888

    accuracy                           1.00     90036
   macro avg       1.00      1.00      1.00     90036
weighted avg       1.00      1.00      1.00     90036

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

      benign       1.00      1.00      1.00     69148
   malici

In [43]:
# Save the models and TF-IDF vectorizer.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; without this the cell raises FileNotFoundError
# on a fresh runtime (e.g. a newly started Colab session).
os.makedirs('model', exist_ok=True)

# The fitted vectorizer is persisted next to the models: the classifiers were
# trained on its TF-IDF output, so new URLs must go through the same fitted
# vectorizer before they can be scored.
joblib.dump(rf_model, 'model/random_forest_model.pkl')
joblib.dump(svm_model, 'model/svm_model.pkl')
joblib.dump(gb_model, 'model/gradient_boosting_model.pkl')
joblib.dump(vectorizer, 'model/tfidf_vectorizer.pkl')
print("Models and vectorizer saved to disk.")


Models and vectorizer saved to disk.
