In [None]:
import pandas as pd
import numpy as np
import torch

from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the labelled URL dataset and clean it BEFORE encoding the labels, so
# that the encoder never sees missing `type` values and only learns classes
# that actually survive the filtering.
url_data = (
    pd.read_csv('data/malicious_phish.csv')
    .drop_duplicates(subset=['url'])  # keep the first occurrence of each URL
    .dropna()                         # drop rows with any missing field
    .copy()                           # own the data so the column assignment below is safe
)

# Integer-encode the string class label; `lb_make` is kept so the codes can
# be mapped back to class names later.
lb_make = LabelEncoder()
url_data["type_code"] = lb_make.fit_transform(url_data["type"])

In [None]:
# The string label is no longer needed once `type_code` exists.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of a KeyError.
url_data = url_data.drop(columns=['type'], errors='ignore')

In [None]:
# Number of rows per encoded class, most frequent class first.
url_data['type_code'].value_counts(sort=True, ascending=False)

In [None]:
# Single source of truth for the checkpoint so the tokenizer and the model
# can never drift apart.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# output_hidden_states=True makes the forward pass return every layer's
# activations, which extract_features relies on.
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

def extract_features(url):
    """Embed a single URL with the module-level BERT `model` / `tokenizer`.

    The URL is given an ``http://`` prefix when it has no http(s) scheme,
    tokenized (truncated to at most 512 tokens), and run through the model
    without gradient tracking. For each of the last four hidden layers the
    token vectors are averaged over the sequence dimension.

    Args:
        url: The URL string to embed.

    Returns:
        A NumPy array of shape ``(4, hidden_size)``: one mean-pooled vector
        per layer, ordered from the fourth-to-last layer to the last layer.
    """
    # URL schemes are case-insensitive, so compare on a lowered copy;
    # otherwise "HTTP://..." would receive a second scheme prefix.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    # truncation=True makes the 512-token cap explicit instead of relying on
    # the tokenizer's implicit fallback (and its warning) when only
    # max_length is given.
    inputs = tokenizer.encode_plus(
        url,
        return_tensors='pt',
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )

    input_ids = inputs['input_ids']

    if 'attention_mask' in inputs:
        attention_mask = inputs['attention_mask']
    else:
        attention_mask = None

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        # Third element of the output holds the per-layer hidden states
        # (present because the model was loaded with output_hidden_states=True).
        hidden_states = outputs[2]

    # hidden_states[layer][0] selects the only sequence in the batch;
    # averaging over dim 0 pools its tokens into a single vector.
    token_vecs = [torch.mean(hidden_states[layer][0], dim=0) for layer in range(-4, 0)]
    return torch.stack(token_vecs).numpy()

In [None]:
# One embedding per URL, stacked along a new leading axis.
features = np.array(list(map(extract_features, url_data["url"])))

# Flatten each URL's per-layer vectors into a single feature row.
features_reshaped = features.reshape(len(features), -1)

# Append the encoded label as the final column of the feature matrix.
dataset = np.column_stack((features_reshaped, url_data["type_code"].to_numpy()))

In [None]:
# Shape of a single row: flattened features plus the trailing label column.
dataset.shape[1:]

In [None]:
# Every column except the last is a feature.
X = dataset[:, :-1]
# The label column shares the float dtype of the stacked feature matrix;
# cast it back to integer class codes so it can be used with
# lb_make.inverse_transform and reads as a categorical target.
y = dataset[:, -1].astype(int)

In [None]:
# Hold out 20% for evaluation. stratify=y keeps the class proportions of the
# imbalanced label distribution identical in both splits, so rare classes are
# not under-represented (or missing) in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

HistGradientBoostingClassifier

In [None]:
# Seed the estimator so its internal randomness (e.g. the validation split
# used when early stopping is active) is reproducible, matching the seed used
# for the train/test split and SMOTE.
clf = HistGradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the gradient-boosting model on the test split.
score = clf.score(X=X_test, y=y_test)
print("Accuracy:", score)

In [None]:
# Predicted class codes for the test split (gradient boosting).
y_pred = clf.predict(X=X_test)

In [None]:
# Fraction of test rows whose predicted code equals the true code.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

RandomForestClassifier

In [None]:
# Seed the forest so bootstrap sampling and feature selection are
# reproducible; n_jobs=-1 builds trees on all cores without changing results.
clf2 = RandomForestClassifier(random_state=42, n_jobs=-1)
clf2.fit(X_train, y_train)

In [None]:
# Mean accuracy of the random-forest model on the test split.
score = clf2.score(X=X_test, y=y_test)
print("Accuracy:", score)

In [None]:
# Predicted class codes for the test split (random forest).
y_pred = clf2.predict(X=X_test)

In [None]:
# Fraction of test rows whose predicted code equals the true code.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)