## **Imports**

In [None]:
import numpy as np
import pandas as pd
import json
import joblib
import random

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

## **Utility Classes**

In [None]:
# Dataset Wrapper
class TabularDataset(Dataset):
    """In-memory dataset pairing a float32 feature matrix with float32 targets.

    Args:
        X: Array-like of shape ``(n_samples, n_features)`` (ndarray, nested
            list, DataFrame, ...). Stored as a ``float32`` NumPy array.
        y: Array-like holding one target per sample. Stored as a ``float32``
            column of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of
            samples.
    """

    def __init__(self, X, y):
        # np.asarray lets callers pass lists / pandas objects, not only
        # ndarrays; astype still produces an independent float32 copy.
        self.X = np.asarray(X).astype(np.float32)
        # Targets are kept as a column so each item has shape (1,).
        self.y = np.asarray(y).astype(np.float32).reshape(-1, 1)

        # __len__ is driven by y, so a mismatch would otherwise either hide
        # trailing rows of X or fail later with an IndexError mid-epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


# Clean DNN Model
class DNN(nn.Module):
    """Fully connected network that emits one raw logit per sample.

    Every hidden layer is the stack ``Linear -> BatchNorm1d -> LeakyReLU(0.1)
    -> Dropout``; a final ``Linear`` maps the last hidden width to a single
    output. No sigmoid is applied inside the network.

    Args:
        input_dim: Number of input features.
        hidden: Iterable with the width of each hidden layer, in order.
            ``None`` selects the default ``[256, 128, 64]``. An empty iterable
            yields a single ``Linear(input_dim, 1)``.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden=None, dropout=0.3):
        super().__init__()
        # A None sentinel avoids a shared mutable default; list() lets callers
        # pass tuples or other iterables without breaking on concatenation.
        hidden = [256, 128, 64] if hidden is None else list(hidden)

        layers = []
        in_features = input_dim
        for out_features in hidden:
            layers.extend(
                [
                    nn.Linear(in_features, out_features),
                    nn.BatchNorm1d(out_features),
                    nn.LeakyReLU(0.1),
                    nn.Dropout(dropout),
                ]
            )
            in_features = out_features

        # Output head: one logit per sample.
        layers.append(nn.Linear(in_features, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for input ``x``."""
        return self.model(x)

## **Load and Preprocess**

In [None]:
def _binarize_labels(labels):
    """Map a raw label Series to an int array of 1 (positive) / 0 (other)."""
    # Normalise so padded strings and float-parsed labels ("1.0", "-1.0")
    # are recognised the same way as their plain forms.
    tokens = labels.astype(str).str.strip().str.lower()
    tokens = tokens.str.replace(r"\.0+$", "", regex=True)

    positive = {"1", "true", "phishing", "malicious", "-1"}
    # When the column uses BOTH -1 and 1, treating both as positive would
    # collapse every row into one class. In that case only -1 is positive.
    # NOTE(review): assumes the -1 = phishing / 1 = legitimate convention
    # (as in the UCI "Phishing Websites" data) -- confirm for your dataset.
    if {"1", "-1"} <= set(tokens.unique()):
        positive.discard("1")

    return tokens.isin(positive).astype(int).values


def load_and_preprocess(csv_path, label_col=None):
    """Load a CSV and turn it into a numeric feature matrix and 0/1 labels.

    Steps, in order:
      1. Pick the label column (first match among common names, otherwise the
         last column) unless ``label_col`` is given.
      2. Convert labels to 0/1.
      3. Drop feature columns that are constant or more than 90% missing.
      4. Fill numeric gaps with the column median.
      5. One-hot encode non-numeric columns with at most 20 distinct values;
         frequency-encode the remaining non-numeric columns.
      6. Remove rows flagged as outliers by an IsolationForest
         (``contamination=0.01``) fitted on the full matrix.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the input CSV.
        label_col: Name of the label column, or ``None`` to auto-detect.

    Returns:
        Tuple ``(X, y)``: ``X`` is a ``float32`` array of the retained rows,
        ``y`` the matching integer 0/1 label array.

    Raises:
        KeyError: If ``label_col`` is given but is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    print("Loaded:", df.shape)

    # Auto-detect the label column, falling back to the last column.
    if label_col is None:
        candidates = ("label", "class", "target", "Result", "is_phishing")
        label_col = next(
            (name for name in candidates if name in df.columns),
            df.columns[-1],
        )

    y = _binarize_labels(df[label_col])
    X = df.drop(columns=[label_col]).copy()

    # Remove columns that carry no usable signal (single condition list so a
    # column is never queued for dropping twice).
    drop_cols = [
        c for c in X.columns
        if X[c].nunique() <= 1 or X[c].isna().mean() > 0.9
    ]
    X = X.drop(columns=drop_cols)

    # Fill numeric missing values with the column median.
    numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
    for c in numeric_cols:
        X[c] = X[c].fillna(X[c].median())

    # Split non-numeric columns by cardinality.
    cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()
    small_cat = [c for c in cat_cols if X[c].nunique() <= 20]
    large_cat = [c for c in cat_cols if c not in small_cat]

    # Low-cardinality categoricals -> one-hot encoding.
    if small_cat:
        # Cast to str after filling: a column mixing e.g. bools with the
        # "MISSING" marker would otherwise be rejected by the encoder.
        cat_frame = X[small_cat].fillna("MISSING").astype(str)
        try:
            # scikit-learn >= 1.2 spelling.
            ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            # Older scikit-learn releases only know `sparse`.
            ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
        ohe_values = ohe.fit_transform(cat_frame)
        ohe_df = pd.DataFrame(
            ohe_values,
            index=X.index,
            columns=ohe.get_feature_names_out(small_cat),
        )
        X = pd.concat([X.drop(columns=small_cat), ohe_df], axis=1)

    # High-cardinality categoricals -> frequency encoding (missing -> 0).
    for c in large_cat:
        freq = X[c].value_counts(normalize=True).to_dict()
        X[c] = X[c].map(freq).fillna(0)

    # Final numeric matrix.
    X = X.to_numpy(dtype=np.float32)

    # Outlier removal: fit_predict returns 1 for inliers, -1 for outliers.
    # NOTE(review): the forest is fitted on every row, i.e. before any
    # train/test split the caller performs -- confirm that is acceptable.
    iso = IsolationForest(contamination=0.01, random_state=42)
    mask = iso.fit_predict(X) == 1
    X, y = X[mask], y[mask]

    print("After Cleaning:", X.shape)
    return X, y

## **Load Data**

In [None]:
# Location of the input CSV. No `label_col` is passed below, so the label
# column is auto-detected by load_and_preprocess.
csv_path = "data/phishing.csv"   # change this to your CSV
# X: float32 feature matrix, y: 0/1 labels (rows flagged as outliers removed).
X, y = load_and_preprocess(csv_path)

## **Train/Val/Test Split**

In [None]:
# Split sizes, both expressed as a share of the WHOLE dataset.
TEST_FRACTION = 0.2
VAL_FRACTION = 0.1

# Stratified hold-out split: TEST_FRACTION of all rows becomes the test set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_FRACTION, random_state=42)
train_idx, test_idx = next(sss.split(X, y))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# The validation set is carved out of the remaining (1 - TEST_FRACTION) rows,
# so the fraction must be rescaled: 0.1 / 0.8 = 0.125 of the remainder equals
# 10% of the whole dataset (a literal 0.11 here would only give ~8.8%).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VAL_FRACTION / (1 - TEST_FRACTION),
    stratify=y_train,
    random_state=42,
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

# Scale features: statistics come from the training rows only and are then
# re-used for validation and test so no information leaks from those splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## **Train the DNN**

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
input_dim = X_train.shape[1]

model = DNN(input_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

batch_size = 256
train_ds = TabularDataset(X_train, y_train)
val_ds = TabularDataset(X_val, y_val)

# BatchNorm1d cannot compute batch statistics from a single sample while in
# train mode, so the trailing batch is dropped only when it would hold
# exactly one row.
train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=(len(train_ds) % batch_size == 1),
)
val_loader = DataLoader(val_ds, batch_size=batch_size)

best_val = float("inf")
best_state = None  # snapshot of the weights with the lowest validation loss
patience = 10      # epochs without improvement tolerated before stopping
wait = 0

for epoch in range(100):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    n_train = 0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average even with a short last batch.
        total_loss += loss.item() * xb.size(0)
        n_train += xb.size(0)
    total_loss /= max(n_train, 1)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0.0
    n_val = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            loss = criterion(logits, yb)
            val_loss += loss.item() * xb.size(0)
            n_val += xb.size(0)
    val_loss /= max(n_val, 1)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    # ---- Early stopping ----
    if val_loss < best_val:
        best_val = val_loss
        # state_dict() hands back references to the live parameter tensors,
        # which later optimizer steps overwrite in place; clone them so the
        # snapshot really preserves the best epoch.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping.")
            break

# Restore the best weights. best_state stays None only if the validation loss
# never beat +inf (e.g. it was NaN every epoch); keep the current weights then.
if best_state is not None:
    model.load_state_dict(best_state)

## **Evaluation**

In [None]:
# Score the held-out test set in a single forward pass with dropout and
# batch-norm updates disabled.
model.eval()
test_tensor = torch.from_numpy(X_test).float().to(device)
with torch.no_grad():
    logits = model(test_tensor)
    # Sigmoid turns raw logits into probabilities; 0.5 is the decision cut-off.
    probs = torch.sigmoid(logits).cpu().numpy().flatten()
    preds = (probs >= 0.5).astype(int)

# Compute the scalar metrics in a fixed order, then the confusion matrix.
acc, prec, rec, f1 = (
    score_fn(y_test, preds)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, preds)

# Report.
for title, value in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1:", f1),
):
    print(title, value)
print("Confusion Matrix:\n", cm)