In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np
import time
#seeding 
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
        torch.cuda.manual_seed(42)


# 1️⃣ Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [79]:


# 2️⃣ Load MNIST
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,),(0.5,))
])

train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root="./data", train=False, transform=transform)


train_loader_compleate = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [81]:


# 3️⃣ Define a simple CNN
class SimpleCNN(nn.Module):
    def __init__(self, num_conv_layers, num_filters, dropout, criterion):
        super().__init__()
        layers = []
        in_channels = 1
        for _ in range(num_conv_layers):
            layers.append(nn.Conv2d(in_channels, num_filters, 3, padding=1))
            layers.append(nn.ReLU())
            layers.append(nn.MaxPool2d(2))
            in_channels = num_filters
        self.conv = nn.Sequential(*layers)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(num_filters * (28 // (2 ** num_conv_layers))**2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.Dropout(dropout),
            nn.Linear(128, 10)
        )
        self.criterion = criterion
        self.optimizer = optim.AdamW(self.parameters(), lr=0.001)
        

    def forward(self, x):
        x = self.conv(x)
        return self.fc(x)
    
    def train_model(self, train_loader, epochs=3, val_loader=None, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), log=True, patience=3, min_delta=1e-4 ):
        start_time = time.time()
        best_val_acc = 0.0
        best_epoch = 0
        patience_counter = 0
        best_state = None

        for epoch in range(epochs):
            self.train()
            correct, total = 0, 0
            running_loss = 0.0

            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                self.optimizer.zero_grad()
                outputs = self(images)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                running_loss += loss.item() * labels.size(0)

            train_acc = correct / total
            avg_loss = running_loss / total

            # --------------------------
            # Validation
            # --------------------------
            val_accuracy = None
            if val_loader is not None:
                val_accuracy = self.evaluate_model(val_loader)
                if log:
                    print(f"Epoch [{epoch+1}/{epochs}] "
                        f"Train Acc: {train_acc:.4f}, "
                        f"Val Acc: {val_accuracy:.4f}, "
                        f"Loss: {avg_loss:.4f}")
                
                # Early stopping check
                if val_accuracy > best_val_acc + min_delta:
                    best_val_acc = val_accuracy
                    best_epoch = epoch
                    patience_counter = 0
                    best_state = self.state_dict()  # save best model weights
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    print(f"⏹️ Early stopping at epoch {epoch+1} "
                        f"(no improvement in {patience} epochs)")
                    if best_state is not None:
                        self.load_state_dict(best_state)
                    break
            else:
                if log:
                    print(f"Epoch [{epoch+1}/{epochs}] "
                        f"Train Acc: {train_acc:.4f}, "
                        f"Loss: {avg_loss:.4f}")

        total_time = time.time() - start_time
        if val_loader:
            return train_acc, total_time, best_val_acc
        return train_acc, total_time
    def evaluate_model(self, test_loader):
        self.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = self(images)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        acc = correct / total
        return acc





In [65]:
!pip install -U torch_optimizer
!pip install lion-pytorch
!pip install ranger21
!pip install optuna




In [66]:
from torch_optimizer import  Yogi, AdaBelief, Lamb
from ranger21 import Ranger21
from lion_pytorch import Lion  # pip install lion-pytorch
import torch.optim as optim

In [67]:
from scipy.optimize import curve_fit

def power_law(x, a, b):
    return a * np.power(x, b)

In [83]:
import optuna
# ------------------------------
# 4️⃣ Objective function for Optuna
# ------------------------------
def objective(trial):
    # Suggest hyperparameters
    num_conv_layers = trial.suggest_int("num_conv_layers", 1, 4)
    num_filters = trial.suggest_categorical("num_filters", [16, 32, 64, 128])
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch", [16,32,64,128,524,1024,2048])
    optimizer_name = trial.suggest_categorical(
    "optimizer",
        ["AdamW", "Lion", "AdaBelief", "LAMB", "Yogi"]
    )

    # Define model and loss

    
    # Learning rate suggestion
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)

    

    #============================================================================================

    fractions = [0.05]
    train_times = []
    accuracies = []

    for frac in fractions:
        subset_size = int(len(train_dataset) * frac)
        subset_indices = np.random.choice(len(train_dataset), subset_size, replace=False)
        train_subset = Subset(train_dataset, subset_indices)
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)

        criterion = nn.CrossEntropyLoss()
        model = SimpleCNN(num_conv_layers, num_filters, dropout, criterion=criterion).to(device)


        # Create optimizer based on choice
        if optimizer_name == "AdamW":
            optimizer = optim.AdamW(model.parameters(), lr=lr)

        elif optimizer_name == "Lion":
            optimizer = Lion(model.parameters(), lr=lr)

        elif optimizer_name == "AdaBelief":
            optimizer = AdaBelief(model.parameters(), lr=lr, eps=1e-8, betas=(0.9, 0.999))

        elif optimizer_name == "LAMB":
            optimizer = Lamb(model.parameters(), lr=lr)

        elif optimizer_name == "Yogi":
            optimizer = Yogi(model.parameters(), lr=lr)

        else:
            optimizer = optim.Adam(model.parameters(), lr=lr)


        model.optimizer = optimizer


        acc, train_time, val_accuracy = model.train_model(train_loader=train_loader, val_loader=test_loader, epochs=3, log=False)


        train_times.append(train_time)
        accuracies.append(val_accuracy)
        print(f"{int(frac*100)}% data -> Acc: {acc*100:.2f}%, Time: {train_time:.2f}s")


    '''
    # Fit time scaling
    params_time, _ = curve_fit(power_law, fractions, train_times)
    pred_time_100 = power_law(1.0, *params_time)

    # Fit accuracy scaling (roughly)
    params_acc, _ = curve_fit(lambda x, Amax, k, alpha: Amax - k / np.power(x, alpha),
                            fractions, accuracies, bounds=(0, [1.0, 1.0, 2.0]))
    Amax_pred = params_acc[0]
    '''

    #============================================================================================
    return sum(accuracies)/len(accuracies)


# ------------------------------
# 5️⃣ Run the study
# ------------------------------
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)  # 15 trials or 10 min

# ------------------------------
# 6️⃣ Show best results
# ------------------------------
print("\n✅ Best trial:")
best_trial = study.best_trial
print(f"Accuracy: {best_trial.value:.4f}")
print("Best hyperparameters:", best_trial.params)

[I 2025-10-17 01:59:35,720] A new study created in memory with name: no-name-deb4c05b-cf3a-4247-87d5-211e669b991a
  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[I 2025-10-17 01:59:45,097] Trial 0 finished with value: 0.9507 and parameters: {'num_conv_layers': 3, 'num_filters': 64, 'dropout': 0.20028297982081308, 'lr': 0.004204622120048832, 'batch': 16, 'optimizer': 'AdaBelief'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 95.20%, Time: 9.37s


[I 2025-10-17 01:59:53,045] Trial 1 finished with value: 0.9472 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'dropout': 0.2083501757504469, 'lr': 0.0024130437903936704, 'batch': 128, 'optimizer': 'AdaBelief'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 93.63%, Time: 7.94s


[I 2025-10-17 02:00:00,143] Trial 2 finished with value: 0.9429 and parameters: {'num_conv_layers': 1, 'num_filters': 64, 'dropout': 0.20387662691634473, 'lr': 0.007110823206807849, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 96.83%, Time: 7.09s


[I 2025-10-17 02:00:07,827] Trial 3 finished with value: 0.8902 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'dropout': 0.41762241502146186, 'lr': 0.004545956273769637, 'batch': 524, 'optimizer': 'AdaBelief'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 79.67%, Time: 7.68s


[I 2025-10-17 02:00:14,199] Trial 4 finished with value: 0.8736 and parameters: {'num_conv_layers': 1, 'num_filters': 64, 'dropout': 0.46612405680157537, 'lr': 0.00010737237274839275, 'batch': 64, 'optimizer': 'AdamW'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 79.10%, Time: 6.36s


[I 2025-10-17 02:00:24,031] Trial 5 finished with value: 0.8786 and parameters: {'num_conv_layers': 4, 'num_filters': 64, 'dropout': 0.43827672648500393, 'lr': 0.00936627613042328, 'batch': 16, 'optimizer': 'Yogi'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 90.37%, Time: 9.83s


[I 2025-10-17 02:00:30,481] Trial 6 finished with value: 0.1135 and parameters: {'num_conv_layers': 4, 'num_filters': 64, 'dropout': 0.26821836097155216, 'lr': 0.0009149558031357425, 'batch': 2048, 'optimizer': 'AdaBelief'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 11.33%, Time: 6.44s


[I 2025-10-17 02:00:36,065] Trial 7 finished with value: 0.4587 and parameters: {'num_conv_layers': 1, 'num_filters': 32, 'dropout': 0.24841477214693924, 'lr': 0.00017714425181253869, 'batch': 1024, 'optimizer': 'Lion'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 46.40%, Time: 5.58s


[I 2025-10-17 02:00:43,565] Trial 8 finished with value: 0.9387 and parameters: {'num_conv_layers': 1, 'num_filters': 32, 'dropout': 0.31400168947406104, 'lr': 0.00016821659256315382, 'batch': 16, 'optimizer': 'Lion'}. Best is trial 0 with value: 0.9507.


5% data -> Acc: 93.87%, Time: 7.49s


[I 2025-10-17 02:00:52,428] Trial 9 finished with value: 0.9649 and parameters: {'num_conv_layers': 4, 'num_filters': 128, 'dropout': 0.2777369091250338, 'lr': 0.0022497231598355917, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 93.27%, Time: 8.86s


[I 2025-10-17 02:01:00,517] Trial 10 finished with value: 0.414 and parameters: {'num_conv_layers': 3, 'num_filters': 16, 'dropout': 0.3690446955390573, 'lr': 0.0008152596860100228, 'batch': 32, 'optimizer': 'LAMB'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 34.87%, Time: 8.08s


[I 2025-10-17 02:01:10,386] Trial 11 finished with value: 0.9483 and parameters: {'num_conv_layers': 3, 'num_filters': 128, 'dropout': 0.2976924635839614, 'lr': 0.0020283706139822, 'batch': 16, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.73%, Time: 9.86s


[I 2025-10-17 02:01:18,209] Trial 12 finished with value: 0.6061 and parameters: {'num_conv_layers': 4, 'num_filters': 16, 'dropout': 0.252145527848822, 'lr': 0.002094406291943806, 'batch': 32, 'optimizer': 'LAMB'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 45.77%, Time: 7.81s


[I 2025-10-17 02:01:25,891] Trial 13 finished with value: 0.637 and parameters: {'num_conv_layers': 3, 'num_filters': 128, 'dropout': 0.3499469932589584, 'lr': 0.0004857571097627294, 'batch': 1024, 'optimizer': 'AdamW'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 47.30%, Time: 7.67s


[I 2025-10-17 02:01:32,327] Trial 14 finished with value: 0.8961 and parameters: {'num_conv_layers': 4, 'num_filters': 64, 'dropout': 0.20083474937371562, 'lr': 0.0035484081899730263, 'batch': 128, 'optimizer': 'AdaBelief'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 79.80%, Time: 6.43s


[I 2025-10-17 02:01:42,063] Trial 15 finished with value: 0.2094 and parameters: {'num_conv_layers': 3, 'num_filters': 128, 'dropout': 0.2951832849580449, 'lr': 0.0013599613742197997, 'batch': 2048, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 13.93%, Time: 9.73s


[I 2025-10-17 02:01:48,531] Trial 16 finished with value: 0.9534 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.2408021971248224, 'lr': 0.005021363585222409, 'batch': 32, 'optimizer': 'AdaBelief'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.27%, Time: 6.46s


[I 2025-10-17 02:01:55,030] Trial 17 finished with value: 0.9555 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.3487564565816833, 'lr': 0.006287623638379685, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.37%, Time: 6.49s


[I 2025-10-17 02:02:01,709] Trial 18 finished with value: 0.8717 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.3643488975252795, 'lr': 0.00038584590123277456, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 82.63%, Time: 6.67s


[I 2025-10-17 02:02:07,058] Trial 19 finished with value: 0.6589 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.3349166463800498, 'lr': 0.009293499121334611, 'batch': 524, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 52.33%, Time: 5.34s


[I 2025-10-17 02:02:13,487] Trial 20 finished with value: 0.6042 and parameters: {'num_conv_layers': 4, 'num_filters': 32, 'dropout': 0.39201801303857253, 'lr': 0.0013782426761239624, 'batch': 64, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 32.37%, Time: 6.42s


[I 2025-10-17 02:02:19,996] Trial 21 finished with value: 0.9608 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.27394146845530454, 'lr': 0.005559961549255367, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.40%, Time: 6.50s


[I 2025-10-17 02:02:26,529] Trial 22 finished with value: 0.9452 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.28298774537877536, 'lr': 0.0029782960578245005, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.63%, Time: 6.52s


[I 2025-10-17 02:02:33,898] Trial 23 finished with value: 0.949 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.32895993818855623, 'lr': 0.006081996233760989, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.93%, Time: 7.36s


[I 2025-10-17 02:02:41,370] Trial 24 finished with value: 0.9397 and parameters: {'num_conv_layers': 3, 'num_filters': 16, 'dropout': 0.31494169029330743, 'lr': 0.0064355376252324525, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 93.30%, Time: 7.46s


[I 2025-10-17 02:02:50,048] Trial 25 finished with value: 0.9584 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'dropout': 0.39758285801637744, 'lr': 0.0015920711089525046, 'batch': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 95.77%, Time: 8.66s


[I 2025-10-17 02:02:58,435] Trial 26 finished with value: 0.9402 and parameters: {'num_conv_layers': 1, 'num_filters': 128, 'dropout': 0.38879974348822816, 'lr': 0.0013546230858934058, 'batch': 32, 'optimizer': 'AdamW'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 94.63%, Time: 8.36s


[I 2025-10-17 02:03:08,096] Trial 27 finished with value: 0.5865 and parameters: {'num_conv_layers': 3, 'num_filters': 128, 'dropout': 0.47752634075788347, 'lr': 0.000587866441804388, 'batch': 32, 'optimizer': 'LAMB'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 52.10%, Time: 9.65s


[I 2025-10-17 02:03:15,653] Trial 28 finished with value: 0.9411 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'dropout': 0.23293526197014386, 'lr': 0.0028308576632468134, 'batch': 128, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 92.33%, Time: 7.55s


[I 2025-10-17 02:03:24,702] Trial 29 finished with value: 0.3893 and parameters: {'num_conv_layers': 3, 'num_filters': 128, 'dropout': 0.4214081221768456, 'lr': 0.0016868150667647475, 'batch': 2048, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.9649.


5% data -> Acc: 24.23%, Time: 9.04s

✅ Best trial:
Accuracy: 0.9649
Best hyperparameters: {'num_conv_layers': 4, 'num_filters': 128, 'dropout': 0.2777369091250338, 'lr': 0.0022497231598355917, 'batch': 32, 'optimizer': 'Yogi'}


In [84]:
from torch_optimizer import AdaBelief

# Get top 4 trials sorted by accuracy (best first)
top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:4]

print("\n🏆 Top 4 Trials:")
for rank, trial in enumerate(top_trials, start=1):
    params = trial.params
    print(f"\n==============================")
    print(f"🔹 Model #{rank}")
    print(f"Accuracy: {trial.value:.4f}")
    print(f"Parameters: {params}")
    print("==============================\n")

    # Build model
    criterion = nn.CrossEntropyLoss()
    model = SimpleCNN(
        params['num_conv_layers'],
        params['num_filters'],
        params['dropout'],
        criterion
    ).to(device)

    # Create optimizer
    optimizer = AdaBelief(model.parameters(), lr=params['lr'], eps=1e-8, betas=(0.9, 0.999))
    model.optimizer = optimizer

    # Train for 20 epochs on full dataset
    train_acc, total_time, val_acc = model.train_model(
        train_loader_compleate,
        epochs=5,
        val_loader=test_loader,
        device=device,
        log=True
    )

    # Evaluate final test accuracy
    test_accuracy = model.evaluate_model(test_loader)
    print(f"🎯 Final Test Accuracy (Model #{rank}): {test_accuracy*100:.2f}%")
    print(f"🕒 Total Training Time: {total_time:.2f}s\n")



🏆 Top 4 Trials:

🔹 Model #1
Accuracy: 0.9649
Parameters: {'num_conv_layers': 4, 'num_filters': 128, 'dropout': 0.2777369091250338, 'lr': 0.0022497231598355917, 'batch': 32, 'optimizer': 'Yogi'}

Epoch [1/5] Train Acc: 0.9109, Val Acc: 0.9860, Loss: 0.2690
Epoch [2/5] Train Acc: 0.9839, Val Acc: 0.9890, Loss: 0.0564
Epoch [3/5] Train Acc: 0.9884, Val Acc: 0.9846, Loss: 0.0413
Epoch [4/5] Train Acc: 0.9899, Val Acc: 0.9895, Loss: 0.0362
Epoch [5/5] Train Acc: 0.9924, Val Acc: 0.9901, Loss: 0.0275
🎯 Final Test Accuracy (Model #1): 99.01%
🕒 Total Training Time: 105.68s


🔹 Model #2
Accuracy: 0.9608
Parameters: {'num_conv_layers': 2, 'num_filters': 16, 'dropout': 0.27394146845530454, 'lr': 0.005559961549255367, 'batch': 32, 'optimizer': 'Yogi'}

Epoch [1/5] Train Acc: 0.9453, Val Acc: 0.9784, Loss: 0.1739
Epoch [2/5] Train Acc: 0.9780, Val Acc: 0.9810, Loss: 0.0761
Epoch [3/5] Train Acc: 0.9819, Val Acc: 0.9864, Loss: 0.0603
Epoch [4/5] Train Acc: 0.9843, Val Acc: 0.9840, Loss: 0.0532
Epoc

# urlCNN

In [92]:
from datasets import load_dataset
import pandas as pd

In [93]:
#dataset1

# Load the dataset from Hugging Face Hub
train_dataset = load_dataset("kmack/Phishing_urls", split="train")
test_dataset = load_dataset("kmack/Phishing_urls", split="test")
valid_dataset = load_dataset("kmack/Phishing_urls", split="valid")

# Convert to pandas DataFrame
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
valid_df = valid_dataset.to_pandas()


all_df = [train_df, test_df,valid_df]
for i, df in enumerate(all_df):
    df['url'] = df['text']
    df.drop('text',axis=1,inplace=True)
print(train_df.head())

   label                                                url
0      0             xenophongroup.com/montjoie/compgns.htm
1      1    www.azzali.eu/&usg=AOvVaw2phVSb_ENMrkATGNx5LQ0l
2      1                     guildmusic.edu.au/js/index.htm
3      1  memo.unexpectedrunner.com/ezxgytw4et\nholotili...
4      0  en.wikipedia.org/wiki/Category:American_televi...


In [94]:
# Special tokens
special_tokens = ['<PAD>', '<UNK>', '<START>', '<END>']

# ASCII printable characters
ascii_chars = [chr(i) for i in range(32, 127)]  # ' ' (space) to '~'

# Full vocabulary
vocab = special_tokens + ascii_chars

# Create mappings
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
idx2char = {idx: ch for ch, idx in char2idx.items()}
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)
print("Sample:", vocab[:50])


Vocabulary size: 99
Sample: ['<PAD>', '<UNK>', '<START>', '<END>', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']


In [95]:
train_df = train_df.drop_duplicates(subset =['url'])

In [96]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
tqdm.pandas()

max_url_length = 50


def encode_url(url):
    indices = torch.tensor([0]*max_url_length)
    for i,c in enumerate(url[:max_url_length]):
        indices[i] = char2idx.get(c, char2idx['<UNK>'])
    #print(indices)  


    return indices

# Apply encoding to your URL column
train_df['encode'] = train_df['url'].progress_apply(encode_url)
test_df['encode'] = test_df['url'].progress_apply(encode_url)
valid_df['encode'] = valid_df['url'].progress_apply(encode_url)



100%|██████████| 535838/535838 [01:12<00:00, 7400.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['encode'] = train_df['url'].progress_apply(encode_url)
100%|██████████| 70882/70882 [00:09<00:00, 7612.36it/s]
100%|██████████| 70882/70882 [00:09<00:00, 7429.28it/s]


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert encoded URLs to proper tensors
url_tensor = torch.tensor(np.stack(train_df['encode'].values), dtype=torch.long)
labels_tensor = torch.tensor(train_df['label'].values, dtype=torch.float)

valid_url_tensor = torch.tensor(np.stack(valid_df['encode'].values), dtype=torch.long)
valid_labels_tensor = torch.tensor(valid_df['label'].values, dtype=torch.float)

test_url_tensor = torch.tensor(np.stack(test_df['encode'].values), dtype=torch.long)
test_labels_tensor = torch.tensor(test_df['label'].values, dtype=torch.float)


train_dataset = TensorDataset(url_tensor, labels_tensor)

val_dataset = TensorDataset(valid_url_tensor, valid_labels_tensor)
val_loader = DataLoader(val_dataset, batch_size=4056, shuffle=False)

test_dataset = TensorDataset(test_url_tensor, test_labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=4056, shuffle=False)


In [127]:
class URLBinaryCNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim=128,
        conv_layers=3,
        conv_channels=[256, 128, 64],
        fc_layers=[128, 64],
        dropout=[0.5, 0.3],
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv_blocks = nn.ModuleList()

        in_channels = embed_dim
        for i in range(conv_layers):
            out_channels = conv_channels[i]
            conv = nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1)
            bn = nn.BatchNorm1d(out_channels)
            #pool = nn.MaxPool1d(2) if i < conv_layers - 1 else nn.AdaptiveMaxPool1d(1)
            self.conv_blocks.append(nn.Sequential(conv, bn, nn.ReLU()))
            in_channels = out_channels

        self.global_pool = nn.AdaptiveMaxPool1d(1)

        # fully connected layers
        fc_modules = []
        in_dim = conv_channels[-1]
        for i, out_dim in enumerate(fc_layers):
            fc_modules.append(nn.Linear(in_dim, out_dim))
            fc_modules.append(nn.ReLU())
            if i < len(dropout):
                fc_modules.append(nn.Dropout(dropout[i]))
            in_dim = out_dim
        fc_modules.append(nn.Linear(in_dim, 1))
        fc_modules.append(nn.Sigmoid())
        self.fc = nn.Sequential(*fc_modules)

    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        for block in self.conv_blocks:
            x = block(x)
        x = self.global_pool(x).squeeze(-1)
        x = self.fc(x)
        return x  # logits


# %% Training utilities
class Trainer:
    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer

    def train_model(
        self,
        train_loader,
        val_loader=None,
        epochs=5,
        patience=3,
        min_delta=1e-4,
        log=True,
    ):
        best_val_acc = 0
        patience_counter = 0
        best_state = None
        start_time = time.time()

        for epoch in range(epochs):
            self.model.train()
            running_loss, correct, total = 0, 0, 0

            for x, y in train_loader:
                x, y = x.to(device), y.to(device)
                self.optimizer.zero_grad()
                out = self.model(x).view(-1)
                loss = self.criterion(out, y)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item() * y.size(0)
                preds = (torch.sigmoid(out) > 0.5).float()
                correct += (preds == y).sum().item()
                total += y.size(0)

            train_acc = correct / total
            avg_loss = running_loss / total

            # Validation
            if val_loader:
                val_acc = self.evaluate_model(val_loader)
                if log:
                    print(
                        f"Epoch [{epoch+1}/{epochs}] | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Loss: {avg_loss:.4f}"
                    )

                if val_acc > best_val_acc + min_delta:
                    best_val_acc = val_acc
                    best_state = self.model.state_dict()
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        if best_state:
                            self.model.load_state_dict(best_state)
                        break
            else:
                if log:
                    print(f"Epoch [{epoch+1}/{epochs}] | Train Acc: {train_acc:.4f} | Loss: {avg_loss:.4f}")

        total_time = time.time() - start_time
        return train_acc, total_time, best_val_acc

    def evaluate_model(self, test_loader):
        self.model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)
                out = self.model(x).squeeze()
                preds = (torch.sigmoid(out) > 0.5).float()
                correct += (preds == y).sum().item()
                total += y.size(0)
        return correct / total

In [121]:
# %% Optuna objective
def objective(trial):
    embed_dim = trial.suggest_categorical("embed_dim", [64, 128, 256])
    conv_layers = trial.suggest_int("conv_layers", 2, 6)
    conv_channels = [trial.suggest_categorical(f"conv_ch_{i}", [16, 32, 64, 128]) for i in range(conv_layers)]
    fc_layers_count = trial.suggest_int("fc_count", 2, 3)
    fc_layers = [trial.suggest_categorical(f"fc_nodes_{i}", [128,256,512,1024]) for i in range(fc_layers_count)]
    dropout = [trial.suggest_float("drop1", 0.2, 0.5), trial.suggest_float("drop2", 0.1, 0.4)]
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    optimizer_name = trial.suggest_categorical("optimizer", ["AdamW", "Lion", "AdaBelief", "LAMB", "Yogi"])


    fractions = [0.05]
    train_times = []
    accuracies = []

    for frac in fractions:

        subset_size = int(len(train_dataset) * frac)
        subset_indices = np.random.choice(len(train_dataset), subset_size, replace=False)
        train_subset = Subset(train_dataset, subset_indices)
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=1024)
        # Model setup
        model = URLBinaryCNN(vocab_size, embed_dim, conv_layers, conv_channels,fc_layers, dropout).to(device)
        criterion = nn.BCEWithLogitsLoss()

        # Optimizer
        if optimizer_name == "AdamW":
            optimizer = optim.AdamW(model.parameters(), lr=lr)
        elif optimizer_name == "Lion":
            optimizer = Lion(model.parameters(), lr=lr)
        elif optimizer_name == "AdaBelief":
            optimizer = AdaBelief(model.parameters(), lr=lr)
        elif optimizer_name == "LAMB":
            optimizer = Lamb(model.parameters(), lr=lr)
        elif optimizer_name == "Yogi":
            optimizer = Yogi(model.parameters(), lr=lr)
        else:
            optimizer = optim.Adam(model.parameters(), lr=lr)

        

        trainer = Trainer(model, criterion, optimizer)
        acc, _, val_acc = trainer.train_model(train_loader, val_loader, epochs=3, log=False)
        accuracies.append(val_acc)
        print(f"{int(frac*100)}% data -> Acc: {acc*100:.2f}%")

    return sum(accuracies)/len(accuracies)



In [122]:

# %% Run Optuna Search
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print("\nBest Trial:", study.best_trial.params)


[I 2025-10-17 02:45:46,432] A new study created in memory with name: no-name-9641a9bb-5e44-4e49-814a-cf3db3d7a454
[I 2025-10-17 02:46:18,108] Trial 0 finished with value: 0.6964673683022488 and parameters: {'embed_dim': 64, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 64, 'conv_ch_3': 64, 'conv_ch_4': 128, 'fc_count': 2, 'fc_nodes_0': 1024, 'fc_nodes_1': 128, 'drop1': 0.4436234973247015, 'drop2': 0.25266766467686275, 'lr': 1.200176155601056e-05, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 0 with value: 0.6964673683022488.


5% data -> Acc: 69.48%


[I 2025-10-17 02:47:15,666] Trial 1 finished with value: 0.568014446544962 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 128, 'conv_ch_1': 32, 'conv_ch_2': 64, 'conv_ch_3': 128, 'conv_ch_4': 64, 'conv_ch_5': 128, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 128, 'drop1': 0.3317287001359468, 'drop2': 0.16972810204347916, 'lr': 0.00021196982208212907, 'batch_size': 32, 'optimizer': 'LAMB'}. Best is trial 0 with value: 0.6964673683022488.


5% data -> Acc: 53.14%


[I 2025-10-17 02:47:48,698] Trial 2 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 128, 'conv_ch_1': 16, 'conv_ch_2': 128, 'conv_ch_3': 16, 'fc_count': 3, 'fc_nodes_0': 256, 'fc_nodes_1': 1024, 'fc_nodes_2': 1024, 'drop1': 0.31330164614688094, 'drop2': 0.2928815692770831, 'lr': 4.5605221960844806e-05, 'batch_size': 32, 'optimizer': 'Yogi'}. Best is trial 0 with value: 0.6964673683022488.


5% data -> Acc: 49.67%


[I 2025-10-17 02:48:06,888] Trial 3 finished with value: 0.7963234671707909 and parameters: {'embed_dim': 256, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.443283015074945, 'drop2': 0.20724225552296993, 'lr': 0.0003626100895729462, 'batch_size': 64, 'optimizer': 'Lion'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 77.47%


[I 2025-10-17 02:48:25,854] Trial 4 finished with value: 0.5215005219943004 and parameters: {'embed_dim': 64, 'conv_layers': 4, 'conv_ch_0': 32, 'conv_ch_1': 64, 'conv_ch_2': 64, 'conv_ch_3': 16, 'fc_count': 3, 'fc_nodes_0': 512, 'fc_nodes_1': 128, 'fc_nodes_2': 1024, 'drop1': 0.3953707542817622, 'drop2': 0.3316664936969699, 'lr': 3.060520257399685e-05, 'batch_size': 32, 'optimizer': 'AdamW'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 51.17%


[I 2025-10-17 02:48:56,056] Trial 5 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 64, 'conv_layers': 4, 'conv_ch_0': 32, 'conv_ch_1': 64, 'conv_ch_2': 32, 'conv_ch_3': 16, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 128, 'drop1': 0.23861417660427103, 'drop2': 0.10183193389012207, 'lr': 1.969599225702152e-05, 'batch_size': 32, 'optimizer': 'AdaBelief'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 50.10%


[I 2025-10-17 02:49:19,015] Trial 6 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 2, 'conv_ch_0': 32, 'conv_ch_1': 16, 'fc_count': 3, 'fc_nodes_0': 128, 'fc_nodes_1': 256, 'fc_nodes_2': 256, 'drop1': 0.3733204234457811, 'drop2': 0.15657603180084737, 'lr': 6.133245760470359e-05, 'batch_size': 32, 'optimizer': 'AdaBelief'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 49.79%


[I 2025-10-17 02:49:30,758] Trial 7 finished with value: 0.5024829999153523 and parameters: {'embed_dim': 256, 'conv_layers': 5, 'conv_ch_0': 32, 'conv_ch_1': 128, 'conv_ch_2': 64, 'conv_ch_3': 32, 'conv_ch_4': 64, 'fc_count': 3, 'fc_nodes_0': 128, 'fc_nodes_1': 256, 'fc_nodes_2': 256, 'drop1': 0.3611271563814605, 'drop2': 0.31331367118863895, 'lr': 1.9851292204253646e-05, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 49.39%


[I 2025-10-17 02:49:48,455] Trial 8 finished with value: 0.789071978781637 and parameters: {'embed_dim': 64, 'conv_layers': 2, 'conv_ch_0': 128, 'conv_ch_1': 16, 'fc_count': 3, 'fc_nodes_0': 1024, 'fc_nodes_1': 1024, 'fc_nodes_2': 256, 'drop1': 0.28943722311847075, 'drop2': 0.2545784608099183, 'lr': 0.0006203128644075795, 'batch_size': 32, 'optimizer': 'AdamW'}. Best is trial 3 with value: 0.7963234671707909.


5% data -> Acc: 75.00%


[I 2025-10-17 02:50:19,404] Trial 9 finished with value: 0.802375779464462 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.4957223177431545, 'drop2': 0.14400069387162426, 'lr': 0.0001981648093134655, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 76.48%


[I 2025-10-17 02:50:40,044] Trial 10 finished with value: 0.5021161931096753 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 32, 'conv_ch_5': 64, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 512, 'drop1': 0.4958995811600921, 'drop2': 0.1004487608905017, 'lr': 0.00013272649661231028, 'batch_size': 64, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 49.96%


[I 2025-10-17 02:50:57,964] Trial 11 finished with value: 0.774498462233007 and parameters: {'embed_dim': 256, 'conv_layers': 5, 'conv_ch_0': 64, 'conv_ch_1': 128, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.4963730322100832, 'drop2': 0.19097993944386094, 'lr': 0.00042665588787448, 'batch_size': 64, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 76.94%


[I 2025-10-17 02:51:12,062] Trial 12 finished with value: 0.7327530261561468 and parameters: {'embed_dim': 256, 'conv_layers': 3, 'conv_ch_0': 16, 'conv_ch_1': 32, 'conv_ch_2': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 512, 'drop1': 0.4295589517017499, 'drop2': 0.19756525343444348, 'lr': 0.00027763424574154004, 'batch_size': 64, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 71.43%


[I 2025-10-17 02:51:23,064] Trial 13 finished with value: 0.7671482181654017 and parameters: {'embed_dim': 256, 'conv_layers': 5, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.4488677146856939, 'drop2': 0.2183942495051669, 'lr': 0.0008245474979276017, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 79.50%


[I 2025-10-17 02:51:53,081] Trial 14 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 256, 'conv_layers': 6, 'conv_ch_0': 16, 'conv_ch_1': 128, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 64, 'conv_ch_5': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.46822741631222686, 'drop2': 0.3759017437448372, 'lr': 0.00011938384234471593, 'batch_size': 64, 'optimizer': 'LAMB'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 49.70%


[I 2025-10-17 02:52:06,941] Trial 15 finished with value: 0.756101690132897 and parameters: {'embed_dim': 128, 'conv_layers': 3, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 32, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.40280608255050776, 'drop2': 0.15210122551354727, 'lr': 0.0002817003880521366, 'batch_size': 64, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 75.27%


[I 2025-10-17 02:52:17,954] Trial 16 finished with value: 0.7947433763155667 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.47080807221114823, 'drop2': 0.1386624418855832, 'lr': 0.0001921253922130029, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 81.39%


[I 2025-10-17 02:52:34,384] Trial 17 finished with value: 0.7053694873169493 and parameters: {'embed_dim': 256, 'conv_layers': 4, 'conv_ch_0': 64, 'conv_ch_1': 64, 'conv_ch_2': 16, 'conv_ch_3': 32, 'fc_count': 2, 'fc_nodes_0': 1024, 'fc_nodes_1': 1024, 'drop1': 0.4240054953308158, 'drop2': 0.22415088858842325, 'lr': 0.000502905478563434, 'batch_size': 64, 'optimizer': 'Lion'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 70.09%


[I 2025-10-17 02:52:48,604] Trial 18 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 3, 'conv_ch_0': 16, 'conv_ch_1': 128, 'conv_ch_2': 16, 'fc_count': 2, 'fc_nodes_0': 128, 'fc_nodes_1': 512, 'drop1': 0.20731019821627014, 'drop2': 0.12852799542673624, 'lr': 7.664680526530596e-05, 'batch_size': 64, 'optimizer': 'AdaBelief'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 49.25%


[I 2025-10-17 02:52:57,737] Trial 19 finished with value: 0.6680821647244717 and parameters: {'embed_dim': 256, 'conv_layers': 6, 'conv_ch_0': 128, 'conv_ch_1': 32, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 32, 'conv_ch_5': 32, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'drop1': 0.47621737834986577, 'drop2': 0.27773123154902485, 'lr': 0.00041784776868338035, 'batch_size': 128, 'optimizer': 'AdamW'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 66.03%


[I 2025-10-17 02:53:34,355] Trial 20 finished with value: 0.74724189498039 and parameters: {'embed_dim': 256, 'conv_layers': 5, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 32, 'conv_ch_3': 64, 'conv_ch_4': 128, 'fc_count': 3, 'fc_nodes_0': 256, 'fc_nodes_1': 128, 'fc_nodes_2': 512, 'drop1': 0.39660503306027195, 'drop2': 0.18270136392295272, 'lr': 0.0008100588949115134, 'batch_size': 32, 'optimizer': 'Yogi'}. Best is trial 9 with value: 0.802375779464462.


5% data -> Acc: 66.72%


[I 2025-10-17 02:53:45,417] Trial 21 finished with value: 0.8091052735532294 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.4582219325663578, 'drop2': 0.13788391792099114, 'lr': 0.00016402905257424704, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 81.93%


[I 2025-10-17 02:53:56,336] Trial 22 finished with value: 0.7961823876301459 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.45756232128686214, 'drop2': 0.12893208374773366, 'lr': 0.00015819042025498945, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 80.65%


[I 2025-10-17 02:54:07,553] Trial 23 finished with value: 0.792838802516859 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.49814638911233955, 'drop2': 0.2120215047415209, 'lr': 0.0002417932712965079, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 80.98%


[I 2025-10-17 02:54:19,467] Trial 24 finished with value: 0.8000761829519483 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'conv_ch_5': 32, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.4167886648858113, 'drop2': 0.16805662921926628, 'lr': 9.043473893852983e-05, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 79.87%


[I 2025-10-17 02:54:38,298] Trial 25 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'conv_ch_5': 32, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.41723493834516334, 'drop2': 0.1185595436310527, 'lr': 9.825507268081447e-05, 'batch_size': 128, 'optimizer': 'LAMB'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 49.65%


[I 2025-10-17 02:54:49,604] Trial 26 finished with value: 0.7901723991986682 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 64, 'conv_ch_1': 16, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'conv_ch_5': 32, 'fc_count': 2, 'fc_nodes_0': 1024, 'fc_nodes_1': 256, 'drop1': 0.38224112432096574, 'drop2': 0.1666501121028258, 'lr': 8.685336033243513e-05, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 80.77%


[I 2025-10-17 02:55:00,687] Trial 27 finished with value: 0.5014813351767726 and parameters: {'embed_dim': 128, 'conv_layers': 6, 'conv_ch_0': 16, 'conv_ch_1': 64, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'conv_ch_5': 16, 'fc_count': 2, 'fc_nodes_0': 128, 'fc_nodes_1': 256, 'drop1': 0.4757568114214608, 'drop2': 0.14092293667262856, 'lr': 5.339004847891486e-05, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 49.71%


[I 2025-10-17 02:55:12,439] Trial 28 finished with value: 0.8031093930758162 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4160715336042746, 'drop2': 0.17291751601893907, 'lr': 0.00015503753284701072, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 80.38%


[I 2025-10-17 02:55:23,416] Trial 29 finished with value: 0.7922039445839565 and parameters: {'embed_dim': 64, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 32, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4409821355098291, 'drop2': 0.24395836137289373, 'lr': 0.00017548601941601814, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 79.67%


[I 2025-10-17 02:56:09,972] Trial 30 finished with value: 0.5015518749470952 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 128, 'conv_ch_3': 128, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 1024, 'drop1': 0.34622649336638733, 'drop2': 0.2345806301700024, 'lr': 0.00014460944698541014, 'batch_size': 32, 'optimizer': 'LAMB'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 49.77%


[I 2025-10-17 02:56:21,354] Trial 31 finished with value: 0.8033351203408482 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4194296536735568, 'drop2': 0.17830754633605134, 'lr': 0.00010894290448098528, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 82.03%


[I 2025-10-17 02:56:32,953] Trial 32 finished with value: 0.7946869444993087 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4540344459184147, 'drop2': 0.18033114710242362, 'lr': 0.0001220840841717034, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 79.59%


[I 2025-10-17 02:56:44,219] Trial 33 finished with value: 0.8005276374820124 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4297068829204158, 'drop2': 0.151214008773626, 'lr': 0.00020954917116612176, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 79.64%


[I 2025-10-17 02:56:56,212] Trial 34 finished with value: 0.5156598290115968 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 64, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4797336562814925, 'drop2': 0.11610867560042697, 'lr': 0.0003116743192620133, 'batch_size': 128, 'optimizer': 'Yogi'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 50.06%


[I 2025-10-17 02:57:25,274] Trial 35 finished with value: 0.7785333370954545 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 512, 'drop1': 0.32119222739383374, 'drop2': 0.17406736795816874, 'lr': 6.679336100836786e-05, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 69.73%


[I 2025-10-17 02:57:33,936] Trial 36 finished with value: 0.5060664202477356 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 16, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.40375045487928657, 'drop2': 0.1924076793422155, 'lr': 3.9424567229416656e-05, 'batch_size': 128, 'optimizer': 'AdamW'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 50.29%


[I 2025-10-17 02:58:06,029] Trial 37 finished with value: 0.5184390959623035 and parameters: {'embed_dim': 64, 'conv_layers': 4, 'conv_ch_0': 32, 'conv_ch_1': 64, 'conv_ch_2': 32, 'conv_ch_3': 32, 'fc_count': 3, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'fc_nodes_2': 128, 'drop1': 0.29124424654719006, 'drop2': 0.15203069958273144, 'lr': 0.00011319982697925382, 'batch_size': 32, 'optimizer': 'AdaBelief'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 50.42%


[I 2025-10-17 02:58:35,186] Trial 38 finished with value: 0.7981575011991761 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 64, 'conv_ch_3': 16, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 1024, 'drop1': 0.3783048577750752, 'drop2': 0.11683643028102086, 'lr': 0.00022254449135547232, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 80.02%


[I 2025-10-17 02:58:50,928] Trial 39 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 32, 'conv_ch_1': 32, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 1024, 'fc_nodes_1': 256, 'drop1': 0.44092271468979444, 'drop2': 0.26818107080420145, 'lr': 0.0001642678955650859, 'batch_size': 128, 'optimizer': 'LAMB'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 49.82%


[I 2025-10-17 02:59:27,308] Trial 40 finished with value: 0.7951666149375017 and parameters: {'embed_dim': 64, 'conv_layers': 6, 'conv_ch_0': 128, 'conv_ch_1': 16, 'conv_ch_2': 128, 'conv_ch_3': 64, 'conv_ch_4': 128, 'conv_ch_5': 64, 'fc_count': 3, 'fc_nodes_0': 512, 'fc_nodes_1': 128, 'fc_nodes_2': 512, 'drop1': 0.35577347564519674, 'drop2': 0.3980907939455003, 'lr': 4.28874832948194e-05, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 79.77%


[I 2025-10-17 02:59:38,826] Trial 41 finished with value: 0.8040405180440733 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.42939268569184535, 'drop2': 0.15796305968103533, 'lr': 0.00021098702201892744, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 77.54%


[I 2025-10-17 02:59:50,121] Trial 42 finished with value: 0.7381986964250444 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.46174776989863353, 'drop2': 0.16353127942350545, 'lr': 0.00033361432987010295, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 74.42%


[I 2025-10-17 03:00:01,684] Trial 43 finished with value: 0.7768544905617787 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 16, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.4139254072214423, 'drop2': 0.13856774961994284, 'lr': 0.00022824001382136348, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 77.76%


[I 2025-10-17 03:00:13,505] Trial 44 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 64, 'conv_ch_3': 128, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.48570980193494284, 'drop2': 0.20012285661448959, 'lr': 1.2512621811018931e-05, 'batch_size': 128, 'optimizer': 'Yogi'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 50.21%


[I 2025-10-17 03:00:25,443] Trial 45 finished with value: 0.5011709601873536 and parameters: {'embed_dim': 128, 'conv_layers': 4, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'fc_count': 2, 'fc_nodes_0': 128, 'fc_nodes_1': 512, 'drop1': 0.43247431997302777, 'drop2': 0.11019065043907665, 'lr': 0.00014896165737310907, 'batch_size': 128, 'optimizer': 'AdaBelief'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 50.12%


[I 2025-10-17 03:00:32,929] Trial 46 finished with value: 0.5524110493496233 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 64, 'conv_ch_1': 32, 'conv_ch_2': 16, 'conv_ch_3': 16, 'conv_ch_4': 32, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 256, 'drop1': 0.3868928009948905, 'drop2': 0.13122833856037536, 'lr': 0.00011354938999278437, 'batch_size': 128, 'optimizer': 'AdamW'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 54.27%


[I 2025-10-17 03:01:04,097] Trial 47 finished with value: 0.49882903981264637 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 32, 'conv_ch_1': 128, 'conv_ch_2': 32, 'conv_ch_3': 64, 'conv_ch_4': 128, 'fc_count': 2, 'fc_nodes_0': 1024, 'fc_nodes_1': 128, 'drop1': 0.3695163517413234, 'drop2': 0.1799366083424312, 'lr': 0.00025909922550542243, 'batch_size': 32, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 49.54%


[I 2025-10-17 03:01:15,223] Trial 48 finished with value: 0.7606726672497954 and parameters: {'embed_dim': 64, 'conv_layers': 6, 'conv_ch_0': 16, 'conv_ch_1': 64, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 64, 'conv_ch_5': 128, 'fc_count': 2, 'fc_nodes_0': 512, 'fc_nodes_1': 1024, 'drop1': 0.44532406370321, 'drop2': 0.14901005851856572, 'lr': 0.00019123236601636945, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 72.04%


[I 2025-10-17 03:01:25,432] Trial 49 finished with value: 0.7460286109308428 and parameters: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 16, 'conv_ch_3': 64, 'conv_ch_4': 64, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.4881110236678762, 'drop2': 0.2023702740519726, 'lr': 0.0003840322741278942, 'batch_size': 128, 'optimizer': 'Lion'}. Best is trial 21 with value: 0.8091052735532294.


5% data -> Acc: 74.38%

Best Trial: {'embed_dim': 128, 'conv_layers': 5, 'conv_ch_0': 128, 'conv_ch_1': 128, 'conv_ch_2': 128, 'conv_ch_3': 128, 'conv_ch_4': 16, 'fc_count': 2, 'fc_nodes_0': 256, 'fc_nodes_1': 256, 'drop1': 0.4582219325663578, 'drop2': 0.13788391792099114, 'lr': 0.00016402905257424704, 'batch_size': 128, 'optimizer': 'Lion'}


In [128]:
# %% Evaluate final model
best_params = study.best_trial.params
final_model = URLBinaryCNN(
    vocab_size,
    best_params["embed_dim"],
    best_params["conv_layers"],
    [best_params[f"conv_ch_{i}"] for i in range(best_params["conv_layers"])],
    [best_params[f"fc_nodes_{i}"] for i in range(best_params["fc_count"])],
    [best_params["drop1"], best_params["drop2"]],
).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(final_model.parameters(), lr=best_params["lr"])
train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=256)

trainer = Trainer(final_model, criterion, optimizer)
trainer.train_model(train_loader, val_loader, epochs=10, log=True)
test_acc = trainer.evaluate_model(val_loader)
print(f"✅ Final Test Accuracy: {test_acc*100:.2f}%")

Epoch [1/10] | Train Acc: 0.6329 | Val Acc: 0.7332 | Loss: 0.5759
Epoch [2/10] | Train Acc: 0.7441 | Val Acc: 0.7661 | Loss: 0.5641
Epoch [3/10] | Train Acc: 0.7678 | Val Acc: 0.7738 | Loss: 0.5610
Epoch [4/10] | Train Acc: 0.7980 | Val Acc: 0.7882 | Loss: 0.5590
Epoch [5/10] | Train Acc: 0.8160 | Val Acc: 0.8182 | Loss: 0.5574
Epoch [6/10] | Train Acc: 0.8252 | Val Acc: 0.7971 | Loss: 0.5561
Epoch [7/10] | Train Acc: 0.8302 | Val Acc: 0.8283 | Loss: 0.5554
Epoch [8/10] | Train Acc: 0.8431 | Val Acc: 0.8362 | Loss: 0.5548
Epoch [9/10] | Train Acc: 0.8474 | Val Acc: 0.8284 | Loss: 0.5537
Epoch [10/10] | Train Acc: 0.8552 | Val Acc: 0.8384 | Loss: 0.5529
✅ Final Test Accuracy: 83.84%


In [129]:
import optuna
from optuna.visualization import plot_optimization_history,plot_parallel_coordinate, plot_param_importances, plot_slice, plot_contour, plot_edf

In [130]:
# Plot 1: How accuracy improved with each trial
plot_optimization_history(study).show()

In [132]:

# Plot 3: Which parameters were most important
plot_param_importances(study).show()

In [133]:

# Plot 4: How each parameter individually affected score
plot_slice(study).show()

In [135]:
# Plot 6: Empirical distribution of trial scores
plot_edf(study).show()