<h2>Multi Input Neural network</h2>
As bert_output can overshadow the importance of other features, we first train a branch of the neural network
on other features, then combine it with the branch trained on the dominant feature

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from urllib.parse import urlparse
import re
import unicodedata

In [2]:
def hasMisleadingChars(url):
  for char in url:
    if not (char.isascii() or char.isspace()):
      category = unicodedata.category(char)
      if category.startswith("L") and not unicodedata.combining(char):
        return True
  return  False

def extract_url_features(url,bert_output):
    url = url.strip().lower()
    protocol = 1 if urlparse(url).scheme == 'https' else 0
    url = re.sub(r"https?://","",url)
    parts = url.split("/",1)
    domain = parts[0]
    path = parts[1] if len(parts)>1 else ""

    features = {
        "domain_length" : len(domain),
        "subdomains" : domain.count('.'),
        'num_dots': url.count('.'),
         'num_equals': url.count('='),
         'protocol': protocol,
        "missing_chars": hasMisleadingChars(url),
        'bert_output': bert_output
    }

    return list(features.values())

In [3]:
nn_data = pd.read_csv("./bertDataV3.csv")
nn_data['type']  = nn_data['type'].map(lambda x: 1 if x == 'benign' else 0)
X = X = np.array(nn_data.apply(lambda row: extract_url_features(row['url'], row['bert_output']), axis=1).tolist())
y = np.array(nn_data['type'])
nn_data = pd.read_csv("./bertDataV3.csv")
nn_data['type']  = nn_data['type'].map(lambda x: 1 if x == 'benign' else 0)
X_train,X_test,y_train,y_val = train_test_split(X,y,random_state=0,test_size=0.15)

In [4]:
scaler = StandardScaler()
other_train_scaled = scaler.fit_transform(X_train[:,:-1])
other_val_scaled = scaler.transform(X_test[:,:-1])
dominant_train = X_train[:,-1]
dominant_val = X_test[:,-1]

In [5]:
class MultiInputDataset(Dataset):
    def __init__(self,X_dominant,X_other,y):
        self.X_dominant = torch.tensor(X_dominant,dtype=torch.float32) 
        self.X_other = torch.tensor(X_other,dtype=torch.float32)
        self.y = torch.tensor(y,dtype = torch.float32)

        assert self.X_dominant.shape[0] == self.X_other.shape[0] == self.y.shape[0], "Number of samples mismatch between inputs and target"
        if len(self.X_dominant.shape) == 1:
            self.X_dominant = self.X_dominant.unsqueeze(1)

    def  __len__(self): 
        return len(self.y) 

    def __getitem__(self,idx):
        return self.X_dominant[idx],self.X_other[idx],self.y[idx] 
        

In [6]:
class MultiInputNN(nn.Module): 
    def __init__(self,num_other_features):
        super(MultiInputNN, self).__init__()

        self.dominant_path = nn.Sequential(
            nn.Linear(1,1),
            nn.ReLU(),
            nn.Dropout(0.4)
        )

        self.other_path = nn.Sequential(
            nn.Linear(num_other_features,4),
            nn.ReLU(),
            nn.BatchNorm1d(4), 
            nn.Dropout(0.3),
        )
        combined_input = 1+4
        self.combined_path = nn.Sequential(
            nn.Linear(combined_input,4),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(4),
            nn.Linear(4,1)
        )

    def forward(self,input_dominant,input_other):
        out_dominant = self.dominant_path(input_dominant)

        out_other = self.other_path(input_other)

        combined = torch.cat((out_dominant,out_other),dim = 1)

        final_output = self.combined_path(combined)

        return final_output    

In [8]:
batch_size = 64
train_dataset = MultiInputDataset(dominant_train,other_train_scaled,y_train)
val_dataset = MultiInputDataset(dominant_val,other_val_scaled,y_val)

num_other_features = other_val_scaled.shape[1]

In [9]:
model = MultiInputNN(num_other_features=num_other_features)

criterion = nn.BCEWithLogitsLoss()
learning_rate = 0.00001
optimizer = optim.AdamW(model.parameters(),lr=learning_rate,weight_decay=1e-4)

In [10]:
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"using device {device}")

using device cuda


In [11]:
train_losses = []
val_losses = []
train_accuracys = []
val_accuracys = []
best_val_loss = float('inf')
best_model_path = "nnSaves/multiInput"

In [12]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [13]:
try:
    # Check the device of the weight of the first linear layer in dominant_path
    first_layer_weight_device = next(model.dominant_path.parameters()).device
    print(f"--- Model's first layer weight device: {first_layer_weight_device} ---")
except StopIteration:
    print("--- Could not find parameters in model.dominant_path (Check model definition) ---")

--- Model's first layer weight device: cuda:0 ---


In [14]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_total,train_correct = 0,0
    for dom_batch,oth_batch,labels_batch in train_loader:
        dom_batch,oth_batch,labels_batch = dom_batch.to(device),oth_batch.to(device),labels_batch.to(device)

        optimizer.zero_grad()
        outputs = model(dom_batch,oth_batch)
        loss = criterion(outputs,labels_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()

        predicted = torch.sigmoid(outputs) > 0.5
        train_loss += loss.item()
        train_total += labels_batch.size(0)
        train_correct += (predicted == labels_batch.unsqueeze(1).byte()).sum().item()
        
    avg_train_loss = train_loss//len(train_loader)
    train_accuracy = 100*(train_correct/train_total)
    train_losses.append(avg_train_loss)
    train_accuracys.append(train_accuracy)

    model.eval()
    val_loss = 0.0
    val_correct,val_total = 0,0
    with torch.no_grad():
        for dom_batch,oth_batch,labels_batch in val_loader:
            dom_batch,oth_batch,labels_batch = dom_batch.to(device),oth_batch.to(device),labels_batch.to(device) 
            outputs = model(dom_batch,oth_batch)
            loss = criterion(outputs,labels_batch.unsqueeze(1))
            val_loss += loss.item()

            predicted = torch.sigmoid(outputs) > 0.5
            val_total += labels_batch.size(0) 
            val_correct = (predicted == labels_batch.unsqueeze(1).byte()).sum().item()
    avg_val_loss = val_loss/len(val_loader)
    val_accuracy = 100*(val_correct/val_total)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)

print("Training finished.")
print(f"Best validation loss achieved: {best_val_loss:.4f}")
print(f"Best model parameters saved to: {best_model_path}")    

Epoch [1/30], Train Loss: 0.0000, Val Loss: 0.6874, Val Acc: 0.08%
Epoch [2/30], Train Loss: 0.0000, Val Loss: 0.6575, Val Acc: 0.08%


KeyboardInterrupt: 

In [129]:
train_accuracys

[78.55270521014344,
 80.38244043236237,
 80.74323329308162,
 80.82275498483197,
 80.92252525550026,
 80.99946985538833,
 81.05358878449621,
 81.09077253850911,
 81.16514004653492,
 81.17397579006274,
 69.3985067593438,
 75.72305834535976,
 77.11726504285336,
 56.08598651076488,
 61.10616145848673]