In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm

# Read the phishing dataset from disk
data = pd.read_csv('phishing.csv')

# The target is the 'Label' column; every remaining column except the raw
# 'Domain' string is kept as a feature
y = data['Label']
X = data.drop(['Domain', 'Label'], axis=1)

# Hold out 20% of the rows for evaluation (fixed seed so the split repeats)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Load pretrained BERT with a two-class classification head and move it to
# the GPU when one is available, otherwise to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model.to(device)

# Tokenize and prepare data for BERT
def tokenize_data(data, max_length=64):
    """Turn each row of a DataFrame into fixed-length BERT inputs.

    Every row is rendered as one string by joining the ``str()`` of its
    values with single spaces, then encoded with the module-level
    ``tokenizer``. Sequences are padded and explicitly truncated to
    ``max_length`` tokens so that all rows share one shape.

    Args:
        data: pandas DataFrame; one encoded example is produced per row.
        max_length: Number of tokens per encoded row, special tokens
            included. Defaults to 64.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors, each with shape
        ``(len(data), max_length)``. An empty frame yields two empty
        ``(0, max_length)`` tensors.
    """
    # One text per row: the row's values, stringified and space-separated.
    texts = [
        ' '.join(str(feature) for feature in row)
        for _, row in data.iterrows()
    ]

    # Nothing to encode: return correctly-shaped empty tensors instead of
    # failing on an empty batch.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    # padding='max_length' replaces the deprecated pad_to_max_length=True,
    # and truncation=True makes the cut at max_length explicit rather than
    # relying on the tokenizer's fallback (which emits a warning).
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoded['input_ids'], encoded['attention_mask']

# Encode both splits into BERT input ids and attention masks
X_train_tokens, X_train_masks = tokenize_data(X_train)
X_test_tokens, X_test_masks = tokenize_data(X_test)

# Labels as tensors, in the same row order as the encoded features
y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)

# Number of examples per batch for both loaders
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(X_train_tokens, X_train_masks, y_train_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Test batches keep dataset order, so predictions collected batch by batch
# stay aligned with y_test
test_data = TensorDataset(X_test_tokens, X_test_masks, y_test_tensor)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

# Fine-tune BERT model
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers class used
# by default, so the update rule stays as close as possible to the original.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)

epochs = 3
for epoch in tqdm(range(epochs)):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        # Each batch is (input_ids, attention_mask, labels); move all to device
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses over this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    print(f'Epoch: {epoch + 1}')
    print(f'Average training loss: {avg_train_loss}')

# Run the fine-tuned model over the test loader and collect the predicted
# class index for every example, in loader order
model.eval()
bert_preds = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        batch = tuple(tensor.to(device) for tensor in batch)
        # Only ids and mask are fed to the model; the label tensor in
        # batch[2] is not used during inference
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        # Highest-scoring logit per row is the predicted class
        preds = outputs.logits.argmax(dim=1).flatten().cpu().numpy()
        bert_preds.extend(preds)

def classify_url(url):
    """Classify a single URL with the fine-tuned BERT model.

    The URL is wrapped in a one-row DataFrame, encoded with
    ``tokenize_data`` and passed through the module-level ``model`` in
    evaluation mode.

    Args:
        url: The URL (or domain) string to classify.

    Returns:
        ``"Phishing"`` when the predicted class index is 1, otherwise
        ``"Legitimate"``.
    """
    # NOTE(review): this frame holds only the raw string under 'Domain',
    # whereas the training features are built from the dataset columns with
    # 'Domain' dropped -- confirm the model is meant to see raw URLs here.
    frame = pd.DataFrame({'Domain': [url]})

    # Encode the single row and place the tensors on the model's device
    ids, masks = tokenize_data(frame)
    ids = ids.to(device)
    masks = masks.to(device)

    # Inference only: evaluation mode and no gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=ids, attention_mask=masks)
        predicted = outputs.logits.argmax(dim=1).flatten().cpu().numpy()

    return "Phishing" if predicted[0] == 1 else "Legitimate"

# Score the collected test-set predictions against the true labels
bert_accuracy = accuracy_score(y_test, bert_preds)
print("BERT Accuracy:", bert_accuracy)
# NOTE(review): the recorded run's report lists only class 1 -- check that
# the dataset actually contains both classes before trusting the accuracy.
print(
    "BERT Classification Report:",
    classification_report(y_test, bert_preds),
    sep="\n",
)

# Example usage:
url = "example.com"
classification = classify_url(url)
print(f"The URL '{url}' is classified as: {classification}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 1
Average training loss: 0.03147140581347048


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 2
Average training loss: 0.0016796002397313713


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 3
Average training loss: 0.0008173306812532246


  0%|          | 0/32 [00:00<?, ?it/s]

BERT Accuracy: 1.0
BERT Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1000

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

The URL 'example.com' is classified as: Phishing


