In [1]:
# ===============================
# Imports
# ===============================
import pandas as pd
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ===============================
# Load Datasets
# ===============================
legitimate_urls = pd.read_csv('legitimate-urls.csv')
phishing_urls = pd.read_csv('phishing-urls.csv')

legitimate_urls['label'] = 0
phishing_urls['label'] = 1

df = pd.concat([legitimate_urls, phishing_urls], ignore_index=True)

# Ensure URL column
df.rename(columns={df.columns[0]: 'url'}, inplace=True)

print("Dataset loaded successfully")
print(df.head())


# ===============================
# Preprocessing Function
# ===============================
def preprocess_url(url):
    url = str(url).lower()
    url = re.sub(r'https?://', '', url)
    url = re.sub(r'www\.', '', url)
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    return url

df['processed_url'] = df['url'].apply(preprocess_url)


# ===============================
# Feature Extraction
# ===============================
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_url'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


# ===============================
# MODEL 1: LOGISTIC REGRESSION
# ===============================
print("\n===== Logistic Regression =====")

lr_model = LogisticRegression(max_iter=1000, n_jobs=-1)
lr_model.fit(X_train, y_train)

lr_preds = lr_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, lr_preds))
print("Classification Report:\n", classification_report(y_test, lr_preds))

cm_lr = confusion_matrix(y_test, lr_preds)
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Greens')
plt.title("Logistic Regression Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

joblib.dump(lr_model, 'logistic_regression_model.pkl')


# ===============================
# MODEL 2: SUPPORT VECTOR MACHINE
# ===============================
print("\n===== Support Vector Machine =====")

svm_model = LinearSVC(class_weight='balanced', max_iter=5000)
svm_model.fit(X_train, y_train)

svm_preds = svm_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, svm_preds))
print("Classification Report:\n", classification_report(y_test, svm_preds))

cm_svm = confusion_matrix(y_test, svm_preds)
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Oranges')
plt.title("SVM Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

joblib.dump(svm_model, 'svm_model.pkl')


# ===============================
# MODEL 3: XGBOOST (GPU)
# ===============================
print("\n===== XGBoost (GPU) =====")

xgb_model = xgb.XGBClassifier(
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, xgb_preds))
print("Classification Report:\n", classification_report(y_test, xgb_preds))

cm_xgb = confusion_matrix(y_test, xgb_preds)
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Purples')
plt.title("XGBoost Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

joblib.dump(xgb_model, 'xgboost_gpu_model.pkl')


# ===============================
# TEST ON NEW URLs (ALL MODELS)
# ===============================
print("\n===== Real-Time Test URLs =====")

test_urls = [
    "https://www.google.com",
    "https://www.facebook.com",
    "https://secure-login-paypal-account.com",
    "http://free-gift-card-amazon.verify-now.net"
]

test_processed = [preprocess_url(url) for url in test_urls]
test_features = vectorizer.transform(test_processed)

lr_test = lr_model.predict(test_features)
svm_test = svm_model.predict(test_features)
xgb_test = xgb_model.predict(test_features)

for i, url in enumerate(test_urls):
    print(f"\nURL: {url}")
    print("Logistic Regression:", "Phishing" if lr_test[i] == 1 else "Legitimate")
    print("SVM:", "Phishing" if svm_test[i] == 1 else "Legitimate")
    print("XGBoost:", "Phishing" if xgb_test[i] == 1 else "Legitimate")


ModuleNotFoundError: No module named 'xgboost'

In [2]:
import xgboost
print(xgboost.__version__)


ModuleNotFoundError: No module named 'xgboost'

In [3]:
import xgboost as xgb
print("XGBoost loaded successfully")


ModuleNotFoundError: No module named 'xgboost'