In [None]:
# Basic setup
!pip install scikit-learn tensorflow keras opencv-python matplotlib seaborn --quiet
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.applications import MobileNetV2, VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam


In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# download dataset directly from Kaggle
!kaggle datasets download -d zackyzac/phishing-sites-screenshot

# unzip it
!unzip phishing-sites-screenshot.zip -d /content/dataset


In [None]:
import os
# Print one line per entry under the dataset root with the number of files it holds.
base_dir = "/content/dataset/screenshots"
for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    file_count = len(os.listdir(folder_path))
    print(folder, "â†’", file_count, "files")


In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Load every screenshot of both classes into memory as a (N, img_size, img_size, 3)
# float32 array scaled to [0, 1], plus a parallel array of integer labels.
base_dir = "/content/dataset/screenshots"
img_size = 128  # resize all images to 128x128
data, labels = [], []
skipped = []  # paths that could not be decoded or resized

# The position in this list is the class label: 0 = genuine, 1 = phishing.
categories = ["genuine_site_0", "phishing_site_1"]

for label, category in enumerate(categories):
    path = os.path.join(base_dir, category)
    # sorted() makes the sample order (and therefore the seeded train/test
    # split downstream) independent of the filesystem's listing order.
    for img_file in tqdm(sorted(os.listdir(path)), desc=f"Loading {category}"):
        img_path = os.path.join(path, img_file)
        # cv2.imread signals an unreadable/non-image file by returning None
        # instead of raising, so check for it explicitly.
        img = cv2.imread(img_path)
        if img is None:
            skipped.append(img_path)
            continue
        try:
            img = cv2.resize(img, (img_size, img_size))
        except cv2.error:
            # Keep the original best-effort behaviour, but remember the file.
            skipped.append(img_path)
            continue
        data.append(img)
        labels.append(label)

if not data:
    raise RuntimeError(f"No readable images found under {base_dir}")

# float32 is what Keras trains in; the default float64 would double the memory
# footprint of the whole dataset for no benefit.
data = np.asarray(data, dtype=np.float32) / 255.0  # normalize
labels = np.array(labels)

# NOTE(review): cv2.imread yields channels in BGR order and the arrays are kept
# that way here. ImageNet-pretrained Keras models are documented for RGB input;
# confirm whether a BGR->RGB conversion is wanted before relying on them.

if skipped:
    print(f"Skipped {len(skipped)} unreadable file(s), e.g. {skipped[:3]}")
print("âœ… Data shape:", data.shape)
print("âœ… Labels shape:", labels.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The fixed seed makes the split
# repeatable and stratify=labels keeps the class ratio the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(data, labels, **split_options)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Baseline CNN trained from scratch: three conv + max-pool stages (32, 64, 128
# filters) followed by a 128-unit dense layer, dropout and a sigmoid output.
cnn_model = models.Sequential()
cnn_model.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=(128,128,3)))
cnn_model.add(layers.MaxPooling2D(2,2))
for n_filters in (64, 128):
    cnn_model.add(layers.Conv2D(n_filters, (3,3), activation='relu'))
    cnn_model.add(layers.MaxPooling2D(2,2))
cnn_model.add(layers.Flatten())
cnn_model.add(layers.Dense(128, activation='relu'))
cnn_model.add(layers.Dropout(0.5))
cnn_model.add(layers.Dense(1, activation='sigmoid'))

# Single sigmoid unit + binary cross-entropy: the model outputs P(class 1).
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn_model.summary()


In [None]:
# Train the baseline CNN; validation_split reserves 20% of the training
# arrays for per-epoch validation metrics.
cnn_fit_options = dict(epochs=10, batch_size=32, validation_split=0.2, verbose=1)
history_cnn = cnn_model.fit(x=X_train, y=y_train, **cnn_fit_options)


In [None]:
# Score the baseline CNN on the held-out test split. The model was compiled
# with a single 'accuracy' metric, so the result is [loss, accuracy].
cnn_eval = cnn_model.evaluate(x=X_test, y=y_test)
cnn_test_accuracy = cnn_eval[1]
print("âœ… CNN Test Accuracy:", cnn_test_accuracy)


In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.layers import Input, Rescaling

# Load the ImageNet-pretrained backbone without its classification top.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128,128,3))
base_model.trainable = False  # freeze base layers

# The notebook feeds images scaled to [0, 1], but the MobileNetV2 ImageNet
# weights were trained on inputs scaled to [-1, 1]. Doing the remapping inside
# the model keeps the external contract ("pass [0, 1] images") unchanged while
# giving the frozen backbone the input range it expects.
inputs = Input(shape=(128, 128, 3))
x = Rescaling(scale=2.0, offset=-1.0)(inputs)

# training=False keeps the backbone's BatchNormalization layers in inference
# mode, so their ImageNet statistics are used as-is.
x = base_model(x, training=False)

# Add custom classification head
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.4)(x)
predictions = Dense(1, activation='sigmoid')(x)

mobilenet_model = Model(inputs=inputs, outputs=predictions)
mobilenet_model.compile(optimizer=Adam(learning_rate=0.0001),
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

mobilenet_model.summary()


In [None]:
# Train only the new head of the frozen-backbone MobileNetV2 model;
# 20% of the training arrays are reserved for validation.
mobilenet_fit_options = dict(epochs=10, batch_size=32, validation_split=0.2, verbose=1)
history_mobilenet = mobilenet_model.fit(x=X_train, y=y_train, **mobilenet_fit_options)


In [None]:
# Score the MobileNetV2 model on the test split; with the single 'accuracy'
# metric used at compile time the result is [loss, accuracy].
mobilenet_eval = mobilenet_model.evaluate(x=X_test, y=y_test)
mobilenet_test_accuracy = mobilenet_eval[1]
print("âœ… MobileNetV2 Test Accuracy:", mobilenet_test_accuracy)


In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam

# ImageNet-pretrained VGG16 without its dense top, frozen so it acts purely as
# a feature extractor.
# NOTE(review): the images reach this model exactly as prepared upstream
# (scaled by 1/255); confirm that matches what the ImageNet VGG16 weights
# expect (see keras.applications.vgg16.preprocess_input).
base_vgg = VGG16(include_top=False, weights='imagenet', input_shape=(128,128,3))
base_vgg.trainable = False

# Classifier head: flatten the conv features, one hidden layer with dropout,
# then a single sigmoid unit.
x = base_vgg.output
for head_layer in (Flatten(), Dense(256, activation='relu'), Dropout(0.5)):
    x = head_layer(x)
output = Dense(1, activation='sigmoid')(x)

vgg_model = Model(inputs=base_vgg.input, outputs=output)
vgg_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

vgg_model.summary()


In [None]:
# Train the VGG16 classifier head; 20% of the training arrays are reserved
# for validation.
vgg_fit_options = dict(epochs=10, batch_size=32, validation_split=0.2, verbose=1)
history_vgg = vgg_model.fit(x=X_train, y=y_train, **vgg_fit_options)


In [None]:
# Score the VGG16 model on the test split; the result is [loss, accuracy].
vgg_eval = vgg_model.evaluate(x=X_test, y=y_test)
vgg_test_accuracy = vgg_eval[1]
print("âœ… VGG16 Test Accuracy:", vgg_test_accuracy)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Turn the sigmoid outputs of the first MobileNetV2 model into hard 0/1
# predictions by thresholding at 0.5.
class_names = ["Genuine", "Phishing"]
test_probabilities = mobilenet_model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype("int32")

# Per-class precision / recall / F1.
print("ðŸ“‹ Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix as an annotated heatmap (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix â€” MobileNetV2")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# ----------------------------------------------------------
# Step 1: class weights to counter class imbalance
# (the original note states genuine screenshots outnumber phishing ones)
# ----------------------------------------------------------
# 'balanced' returns one weight per entry of `classes`, in that order;
# the dict maps the positional index of each class to its weight.
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weights = {class_index: weight for class_index, weight in enumerate(balanced_weights)}
print("âœ… Class Weights:", class_weights)

# ==========================================================
# Step 2: Data augmentation for phishing class generalization
# ==========================================================
def _random_brightness(image, low=0.8, high=1.2):
    """Multiply a [0, 1]-scaled image by a random factor in [low, high] and clip.

    Replaces ImageDataGenerator's ``brightness_range``: that option converts
    each image to an 8-bit PIL image and back, which does not preserve float
    images already scaled to [0, 1] (as X_train is), so augmented training
    batches would no longer look like the un-augmented validation/test data.

    Args:
        image: One image as a rank-3 NumPy array with values in [0, 1].
        low: Smallest brightness factor.
        high: Largest brightness factor.

    Returns:
        Array of the same shape with values still inside [0, 1].
    """
    factor = np.random.uniform(low, high)
    return np.clip(image * factor, 0.0, 1.0)


datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    # Runs on every image after the geometric transforms above.
    preprocessing_function=_random_brightness,
)
# datagen.fit(X_train) is intentionally not called: fitting is only needed for
# the featurewise_* / zca_whitening options, none of which is enabled here.

# ==========================================================
# Step 3: Build improved MobileNetV2
# ==========================================================
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))

# Unfreeze last few layers for fine-tuning
for layer in base_model.layers[:-25]:
    layer.trainable = False
for layer in base_model.layers[-25:]:
    layer.trainable = True

# The data pipeline delivers images in [0, 1]; the ImageNet MobileNetV2 weights
# expect [-1, 1]. Remap inside the model so callers keep passing [0, 1] images.
inputs = tf.keras.Input(shape=(128, 128, 3))
x = tf.keras.layers.Rescaling(scale=2.0, offset=-1.0)(inputs)

# training=False keeps BatchNormalization in inference mode even for the
# unfrozen layers, so fine-tuning on small batches does not overwrite the
# pretrained moving statistics (the layers' weights themselves still train).
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dropout(0.4)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
preds = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=preds)

# ----------------------------------------------------------
# Step 4: compile with Adam (lr 1e-4) and track precision, recall and AUC
# alongside accuracy
# ----------------------------------------------------------
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=tracked_metrics,
)

# ----------------------------------------------------------
# Step 5: train on augmented batches with class weights
# ----------------------------------------------------------
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are computed on the same samples as the final
# test score reported afterwards.
augmented_batches = datagen.flow(X_train, y_train, batch_size=32)
history = model.fit(
    augmented_batches,
    epochs=25,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    verbose=1,
)

# ----------------------------------------------------------
# Step 6: evaluate on the test set
# ----------------------------------------------------------
# evaluate() returns the loss followed by the metrics in compile order:
# accuracy, precision, recall, AUC.
test_loss, test_acc, test_prec, test_rec, test_auc = model.evaluate(X_test, y_test)
print(f"\nâœ… Improved MobileNetV2 Results:")
for metric_label, metric_value in (
    ("Accuracy", test_acc),
    ("Precision", test_prec),
    ("Recall", test_rec),
    ("AUC", test_auc),
):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.model_selection import train_test_split

# Unfreeze more layers
for layer in base_model.layers[:-50]:
    layer.trainable = False
for layer in base_model.layers[-50:]:
    layer.trainable = True

# Compile again with higher learning rate
# (trainable flags only take effect after a re-compile)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss='binary_crossentropy',
    metrics=['accuracy',
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall'),
             tf.keras.metrics.AUC(name='auc')]
)

# EarlyStopping (with restore_best_weights) and ReduceLROnPlateau both pick
# epochs / learning rates from the validation loss. Monitoring the test split
# would select the model on the very data used for the final score, making
# that score optimistic. Carve a validation split out of the training data
# instead so X_test stays untouched until the evaluation below.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)
]

# Retrain
history = model.fit(
    datagen.flow(X_fit, y_fit, batch_size=32),
    validation_data=(X_val, y_val),
    epochs=20,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)

# Evaluate again (loss, then metrics in compile order)
test_loss, test_acc, test_prec, test_rec, test_auc = model.evaluate(X_test, y_test)
print(f"\nâœ… Fine-Tuned MobileNetV2 Results:")
print(f"Accuracy: {test_acc:.4f}")
print(f"Precision: {test_prec:.4f}")
print(f"Recall: {test_rec:.4f}")
print(f"AUC: {test_auc:.4f}")


In [None]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Input

# Fresh ImageNet-pretrained backbone; only its last 25 layers are fine-tuned.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
for layer in base_model.layers[:-25]:
    layer.trainable = False
for layer in base_model.layers[-25:]:
    layer.trainable = True

# The notebook's images are scaled to [0, 1] while the ImageNet MobileNetV2
# weights expect [-1, 1]; remap inside the model so callers are unaffected.
inputs = Input(shape=(128, 128, 3))
x = tf.keras.layers.Rescaling(scale=2.0, offset=-1.0)(inputs)

# training=False keeps BatchNormalization in inference mode during
# fine-tuning, preserving the pretrained moving statistics.
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dropout(0.3)(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs, output)

# Very small learning rate so the unfrozen pretrained layers change slowly.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=BinaryCrossentropy(),
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
)


In [None]:
# Replacement augmentation pipeline: rotation, shifts, shear, zoom and
# horizontal flips, with border pixels filled from the nearest valid pixel.
augmentation_options = {
    "rotation_range": 25,
    "width_shift_range": 0.15,
    "height_shift_range": 0.15,
    "shear_range": 0.15,
    "zoom_range": 0.25,
    "horizontal_flip": True,
    "fill_mode": 'nearest',
}
datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)


In [None]:
# Train on augmented batches with class weights.
# NOTE(review): validation_data is the test split, so the val_* curves and the
# final test score below come from the same samples.
augmented_batches = datagen.flow(X_train, y_train, batch_size=32)
history = model.fit(
    augmented_batches,
    epochs=15,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    verbose=1,
)

# evaluate() returns the loss followed by the metrics in compile order.
loss, acc, prec, rec, auc = model.evaluate(X_test, y_test)
print(f"\nâœ… Stabilized Fine-Tuned MobileNetV2 Results:")
for metric_label, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("AUC", auc),
):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
import tensorflow as tf
from tensorflow.keras import backend as K

# Define Focal Loss
def focal_loss(gamma=2., alpha=.25):
    """Build a binary focal-loss function suitable for ``model.compile``.

    The returned callable computes binary cross-entropy (BCE) and rescales it
    by ``alpha * (1 - exp(-BCE)) ** gamma``, so samples with a small BCE
    (already well classified) contribute less to the total loss.

    Args:
        gamma: Exponent of the modulating factor; larger values shrink the
            contribution of low-BCE samples more strongly.
        alpha: Constant multiplier applied to every sample, independent of
            its class.

    Returns:
        A function ``(y_true, y_pred) -> loss tensor``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Labels may arrive as integers; BCE needs floats.
        targets = tf.cast(y_true, tf.float32)
        cross_entropy = tf.keras.losses.binary_crossentropy(targets, y_pred)
        # exp(-BCE) is close to 1 for confident, correct predictions.
        confidence = K.exp(-cross_entropy)
        modulating_factor = (1 - confidence) ** gamma
        return alpha * modulating_factor * cross_entropy
    return focal_loss_fixed

# Swap the loss for focal loss (optimizer and metrics as before) and continue
# training the current model for a few more epochs.
focal_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
    tf.keras.metrics.AUC(),
]
model.compile(
    loss=focal_loss(gamma=2.0, alpha=0.35),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=focal_metrics,
)

# Short fine-tuning run on augmented batches.
# NOTE(review): validation_data is the test split, so the val_* curves and the
# final test score below come from the same samples.
augmented_batches = datagen.flow(X_train, y_train, batch_size=32)
history = model.fit(
    augmented_batches,
    epochs=8,
    validation_data=(X_test, y_test),
    class_weight=class_weights,
    verbose=1,
)

# Final scores: loss first, then the metrics in compile order.
loss, acc, prec, rec, auc = model.evaluate(X_test, y_test)
print(f"\nðŸŽ¯ Focal-Loss Fine-Tuned MobileNetV2 Results:")
for metric_label, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("AUC", auc),
):
    print(f"{metric_label}: {metric_value:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves from the most recent fit (the focal-loss run):
# accuracy on the left, loss on the right, each with its validation series.
curve_panels = (
    ('accuracy', 'Train Acc', 'Val Acc', "Model Accuracy (Focal Loss)"),
    ('loss', 'Train Loss', 'Val Loss', "Model Loss (Focal Loss)"),
)
fig, axes = plt.subplots(1, 2, figsize=(10,4))
for ax, (metric, train_label, val_label, panel_title) in zip(axes, curve_panels):
    ax.plot(history.history[metric], label=train_label)
    ax.plot(history.history['val_' + metric], label=val_label)
    ax.set_title(panel_title)
    ax.legend()
plt.show()

# Persist the final model in HDF5 format.
h5_model_path = "phishing_screenshot_mobilenetv2_focal.h5"
model.save(h5_model_path)
print("âœ… Model saved as phishing_screenshot_mobilenetv2_focal.h5")


In [None]:
# Also persist the model in the .keras format.
keras_model_path = "phishing_screenshot_mobilenetv2_focal.keras"
model.save(keras_model_path)
print("âœ… Model saved as phishing_screenshot_mobilenetv2_focal.keras (recommended format)")
