In [1]:
%pip install pandas numpy tensorflow scikit-learn imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, LeakyReLU, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler

In [None]:
df = pd.read_csv("/content/url_data.csv")

In [None]:
df["url_length"] = df["url"].apply(len)
df["num_digits"] = df["url"].apply(lambda x: sum(c.isdigit() for c in x))
df["num_special_chars"] = df["url"].apply(lambda x: sum(c in "!@#$%^&*()_+=" for c in x))
df["num_subdomains"] = df["url"].apply(lambda x: x.count("."))
df["has_https"] = df["url"].apply(lambda x: 1 if "https" in x else 0)
df["contains_phishing_keyword"] = df["url"].apply(lambda x: 1 if any(word in x.lower() for word in ["login", "secure", "bank", "verify", "update"]) else 0)
df["digit_ratio"] = df["num_digits"] / df["url_length"]

In [None]:
features = df[["url_length", "num_digits", "num_special_chars","num_subdomains","has_https","contains_phishing_keyword","digit_ratio"]]
labels = df["label"]

In [None]:
labels = LabelEncoder().fit_transform(labels)

undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(features, labels)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def build_discriminator():
    model = Sequential([
        Dense(64, input_dim=7, activation="relu"),
        Dropout(0.3),
        BatchNormalization(momentum=0.8),
        Dense(128, activation=LeakyReLU(0.2)),
        Dropout(0.3),
        BatchNormalization(momentum=0.8),
        Dense(256, activation=LeakyReLU(0.2)),
        Dropout(0.3),
        BatchNormalization(momentum=0.8),
        Dense(1, activation="sigmoid")
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

discriminator = build_discriminator()
discriminator.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2ms/step - accuracy: 0.7547 - loss: 0.4891
Epoch 2/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2ms/step - accuracy: 0.7744 - loss: 0.4594
Epoch 3/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.7760 - loss: 0.4567
Epoch 4/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.7780 - loss: 0.4538
Epoch 5/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.7786 - loss: 0.4541
Epoch 6/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.7785 - loss: 0.4532
Epoch 7/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.7788 - loss: 0.4528
Epoch 8/20
[1m19750/19750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.7787 - loss: 0.4518


<keras.src.callbacks.history.History at 0x7ddd25c6b7d0>

In [None]:
def build_generator():
    model = Sequential([
        Dense(64, input_dim=3, activation=LeakyReLU(0.2)),
        BatchNormalization(),
        Dense(128, activation=LeakyReLU(0.2)),
        BatchNormalization(),
        Dense(256, activation=LeakyReLU(0.2)),
        BatchNormalization(),
        Dense(X_train.shape[1], activation="linear")
    ])
    return model

generator = build_generator()

In [None]:
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = Sequential([generator, discriminator])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

gan = build_gan(generator, discriminator)

In [None]:
epochs = 10000
batch_size = 32

for epoch in range(epochs):
    for _ in range(2):  
        noise = np.random.normal(0, 1, (batch_size, 3))
        g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

    idx = np.random.randint(0, X_train.shape[0], batch_size)
    real_samples = X_train[idx] + 0.1 * np.random.normal(size=X_train[idx].shape) 
    fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, 3)))

    d_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)) * 0.9)
    d_loss_fake = discriminator.train_on_batch(fake_samples, np.zeros((batch_size, 1)) + 0.1)
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}: D Loss: {d_loss[0]:.4f}, G Loss: {g_loss:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step




Epoch 0: D Loss: 0.4493, G Loss: 1.9776
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━

In [None]:
y_pred = discriminator.predict(X_test)
y_pred = (y_pred > 0.6).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

discriminator.save("discriminator_model.h5")
generator.save("generator_model.h5")
gan.save("gan_model.h5")

[1m4938/4938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step




Accuracy: 0.7850094624445387
Confusion Matrix:
 [[57633 21356]
 [12611 66393]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.73      0.77     78989
           1       0.76      0.84      0.80     79004

    accuracy                           0.79    157993
   macro avg       0.79      0.79      0.78    157993
weighted avg       0.79      0.79      0.78    157993



In [None]:
from sklearn.preprocessing import StandardScaler
import joblib
import pandas as pd

# Load dataset again
df = pd.read_csv("/content/url_data.csv")

# Re-extract features
df["url_length"] = df["url"].apply(len)
df["num_digits"] = df["url"].apply(lambda x: sum(c.isdigit() for c in x))
df["num_special_chars"] = df["url"].apply(lambda x: sum(c in "!@#$%^&*()_+=" for c in x))
df["num_subdomains"] = df["url"].apply(lambda x: x.count("."))
df["has_https"] = df["url"].apply(lambda x: 1 if "https" in x else 0)
df["contains_phishing_keyword"] = df["url"].apply(lambda x: 1 if any(word in x.lower() for word in ["login", "secure", "bank", "verify", "update"]) else 0)
df["digit_ratio"] = df["num_digits"] / df["url_length"]

# Select features
features = df[["url_length", "num_digits", "num_special_chars", "num_subdomains", "has_https", "contains_phishing_keyword", "digit_ratio"]]

# Fit and save the scaler
scaler = StandardScaler()
scaler.fit(features)
joblib.dump(scaler, "scaler.pkl")

print("Scaler saved successfully as scaler.pkl")


Scaler saved successfully as scaler.pkl


In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import joblib
from tensorflow.keras.layers import LeakyReLU

discriminator = tf.keras.models.load_model(
    "discriminator_model.h5", custom_objects={"LeakyReLU": LeakyReLU})
discriminator.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
scaler = joblib.load("scaler.pkl")  


def extract_features(url):
    features = {
        "url_length": len(url),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": sum(c in "!@#$%^&*()_+=" for c in url),
        "num_subdomains": url.count("."),
        "has_https": 1 if "https" in url else 0,
        "contains_phishing_keyword": 1
        if any(
            word in url.lower() for word in ["login", "secure", "bank", "verify", "update"]
        )
        else 0,
    }
    features["digit_ratio"] = (
        features["num_digits"] / features["url_length"] if features["url_length"] > 0 else 0
    )

    return np.array([list(features.values())])

custom_url = input("Enter a URL: ")
features = extract_features(custom_url)
scaled_features = scaler.transform(features)
prob = discriminator.predict(scaled_features)[0][0]

print(f"Prediction Probability: {prob:.4f}")
print("Predicted Class:", "Phishing" if prob>0.6 else "Legitimate")




Enter a URL: http://www.luofertasfantasticas.site




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step
Prediction Probability: 0.7299
Predicted Class: Phishing
