<a href="https://colab.research.google.com/github/DyanielCX/DLI-Assm/blob/main/HoFengSheng_SVM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Dependencies

In [None]:
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import colormaps

from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, precision_recall_curve, average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# 2. Load and Preprocess Dataset

In [None]:
# Fixed length every URL is padded/truncated to before it is fed to the CNN.
# The CNN input layer must use the same length.
MAX_URL_LEN = 200

github_csv_url = 'https://raw.githubusercontent.com/DyanielCX/DLI-Assm/refs/heads/main/dataset_B_05_2020_1.csv'
df = pd.read_csv(github_csv_url)
df['url'] = df['url'].str.lower()

# The Keras binary cross-entropy loss and the sklearn precision/recall/F1
# calls (default pos_label=1) used later need numeric 0/1 labels. Numeric
# labels pass through untouched; textual labels are encoded here so a bad
# label column fails immediately instead of deep inside model training.
# NOTE(review): the text mapping assumes the 'legitimate' / 'phishing'
# spelling of the public dataset_B_05_2020 release -- confirm for this CSV.
status = df['status']
if not pd.api.types.is_numeric_dtype(status):
    label_map = {'legitimate': 0, 'phishing': 1}
    status = status.str.strip().str.lower().map(label_map)
    if status.isna().any():
        raise ValueError(
            "Column 'status' contains labels other than "
            f"{sorted(label_map)}; cannot encode them as 0/1."
        )
    status = status.astype(int)
y = status.values

# Every column except the raw URL and the label is treated as a tabular feature.
ml_features = df.drop(columns=['url', 'status'])
urls = df['url'].values

# Tokenize URLs (char level): each character becomes one integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(urls)
url_seqs = tokenizer.texts_to_sequences(urls)
# pad_sequences defaults pad and truncate at the FRONT of the sequence, so
# URLs longer than MAX_URL_LEN keep their last MAX_URL_LEN characters.
X_url = pad_sequences(url_seqs, maxlen=MAX_URL_LEN)
# +1 because index 0 is reserved for padding and never assigned to a character.
vocab_size = len(tokenizer.word_index) + 1

# 3. 🧠 Train CNN to Learn from URL

In [None]:
# Start of the wall-clock window that the evaluation cell reports as training time.
train_start_time = time.time()

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# batch shuffling and dropout masks are repeatable across Restart & Run All.
# (Some GPU kernels may still be nondeterministic.)
set_random_seed(42)

# Take the sequence length from the padded data instead of repeating the
# literal, so the input layer cannot drift out of sync with pad_sequences.
seq_len = X_url.shape[1]

# CNN architecture: char embedding -> 1D conv -> dropout -> global max pool.
input_layer = Input(shape=(seq_len,))
embedding = Embedding(input_dim=vocab_size, output_dim=128)(input_layer)
conv = Conv1D(filters=256, kernel_size=5, activation='relu')(embedding)
drop = Dropout(0.3)(conv)
# Named so the feature-extraction cell can look this layer up by name.
pool = GlobalMaxPooling1D(name="cnn_features")(drop)
output_layer = Dense(1, activation='sigmoid')(pool)

# Compile model
CNN_model = Model(inputs=input_layer, outputs=output_layer)
CNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Split for DL training only. The later SVM split uses the same test_size,
# stratify and random_state, which selects the same rows.
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X_url, y, test_size=0.2, stratify=y, random_state=42
)

# Train CNN model (10% of the training rows are held back for validation).
CNN_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=128, validation_split=0.1, verbose=1)
loss, acc = CNN_model.evaluate(X_test_dl, y_test_dl)
print(f"✅ CNN Accuracy: {round(acc * 100, 2)}%")


# 4. 📤 Extract CNN Features & Merge with ML Features

In [None]:
# Create a feature extractor model that outputs from the 'cnn_features' layer
# (the pooled convolution activations) instead of the final sigmoid.
feature_extractor = Model(inputs=CNN_model.input,
                          outputs=CNN_model.get_layer("cnn_features").output)

# Apply on all padded URL sequences to extract CNN features
cnn_features = feature_extractor.predict(X_url)

# Recover the training-row indices. train_test_split with identical
# test_size / stratify / random_state over the same number of rows yields
# the same partition, so these are the rows the SVM cell will train on.
train_idx, _ = train_test_split(
    np.arange(len(y)), test_size=0.2, stratify=y, random_state=42
)

# Standardize ML numerical features. The scaler is fit on the training rows
# ONLY and then applied to every row; fitting on the full table would leak
# test-set means/variances into training and inflate the reported metrics.
scaler = StandardScaler()
scaler.fit(ml_features.iloc[train_idx])
ml_scaled = scaler.transform(ml_features)

# Combine CNN features with numeric ML features (column-wise, row order preserved)
X_combined = np.hstack([cnn_features, ml_scaled])


# 5. 🤖 Train Final SVM Classifier

In [None]:
# Hold out 20% of the combined feature matrix, stratified on the label.
# Same test_size / stratify / random_state as the CNN split above.
split = train_test_split(X_combined, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

# Linear SVM hyper-parameters, gathered in one place for easy tuning.
# NOTE(review): scikit-learn suggests dual=False when n_samples > n_features;
# left unchanged here so the model is not altered.
svm_params = dict(
    loss="squared_hinge",
    penalty='l2',
    dual=True,
    max_iter=100000,
    tol=1e-3,
    class_weight='balanced',  # reweight classes inversely to their frequency
    C=1.0,
    random_state=42,
)
svm_model = LinearSVC(**svm_params)

svm_model.fit(X_train, y_train)

# 6. 📊 Evaluate Model

In [None]:
# Wall-clock time since the CNN cell set `train_start_time`. Captured BEFORE
# prediction starts so inference time is not counted as training time.
# It spans CNN training, feature extraction and the SVM fit, plus any idle
# time between cell runs when the notebook is executed interactively.
train_time = time.time() - train_start_time

# Time only the hard-label prediction on the held-out set.
predict_start_time = time.time()
y_pred = svm_model.predict(X_test)
predict_time = time.time() - predict_start_time

# Continuous decision scores, needed for the threshold-free metrics (ROC AUC, PR).
y_scores = svm_model.decision_function(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_scores)

print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
print(f"✅ Accuracy: {round(accuracy * 100, 2)}%")
print(f"✅ Precision: {round(precision * 100, 2)}%")
print(f"✅ Recall: {round(recall * 100, 2)}%")
print(f"✅ F1-Score: {round(f1 * 100, 2)}%")
print(f"✅ ROC AUC: {round(roc_auc * 100, 2)}%")
print(f"⏱️ Training Time: {round(train_time / 60, 2)} minutes")
print(f"⚡ Prediction Time: {round(predict_time * 1000, 2)} milliseconds")

# 🔷 Confusion Matrix (rows = actual class, columns = predicted class)
# NOTE(review): the tick labels assume class 0 is benign and class 1 is
# phishing -- confirm against the label encoding of the CSV.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Benign", "Phishing"],
            yticklabels=["Benign", "Phishing"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


## 6.1 📈 ROC & PR Curve

In [None]:
# --- ROC curve: true-positive rate against false-positive rate ---
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f'ROC AUC = {roc_auc:.2f}')
# Dashed diagonal marks the performance of a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.grid(alpha=0.4)
roc_ax.legend()
roc_fig.tight_layout()
plt.show()

# --- Precision-recall curve, summarised by average precision ---
prec_vals, rec_vals, _ = precision_recall_curve(y_test, y_scores)
avg_prec = average_precision_score(y_test, y_scores)
pr_fig, pr_ax = plt.subplots(figsize=(6, 4))
pr_ax.plot(rec_vals, prec_vals, label=f'Avg Precision = {avg_prec:.2f}')
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve")
pr_ax.grid(alpha=0.4)
pr_ax.legend()
pr_fig.tight_layout()
plt.show()