In [1]:
import pandas as pd 
import numpy as np
import sys
from tqdm import tqdm

from preprocessing_ml_table import *

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

# Raise NumPy's summarisation threshold to the maximum so printed arrays
# (e.g. the full column listing below) are never abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

## Reading the Dataset

In [2]:
# Load the feature table; the path is relative to the notebook's working directory.
df_surtur = pd.read_csv('super_final_dataset_surtur.csv')
# Preview the first rows as a quick sanity check of the load.
df_surtur.head()

Unnamed: 0,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,...,content_emb_12,content_emb_13,content_emb_14,content_emb_15,content_emb_16,content_emb_17,content_emb_18,content_emb_19,is_in_alexa,num_?s
0,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,00.124.324.77.00.opteamevent.hu,0,1,31.0,6.0,0.0,0.387097,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0
1,�MQTH1TDa1��fl�hF1bVOFESLCnBbRI9MRTH1PDa1SP...,0.0,0.0,00005ik.rcomhost.com,0,1,20.0,1.0,0.0,0.25,...,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.044159,0,0
2,"<!DOCTYPE html>\n<html data-adblockkey=""MFwwDQ...",0.0,0.0,000098.ihostfull.com,0,1,20.0,1.0,0.0,0.3,...,0.046553,0.048641,0.048712,0.047349,0.048684,0.047914,0.048007,0.049129,0,0
3,"<html>\n<head>\n<meta content=""noarchive"" name...",0.0,0.0,000p6vl.wcomhost.com,0,1,20.0,1.0,0.0,0.2,...,0.047654,0.04767,0.047664,0.047662,0.04767,0.047674,0.04766,0.047667,1,0
4,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,001.002.003.23.opteamevent.hu,0,1,29.0,5.0,0.0,0.37931,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0


In [3]:
# Show every column name as a NumPy array (printed in full thanks to the
# print-options threshold configured in the first cell).
print(df_surtur.columns.to_numpy())

['content' 'has_IP_in_url' 'having_@_in_url' 'hostname' 'https' 'label'
 'length_hostname' 'number_subdomains' 'number_underscores'
 'ratio_digits_hostname' 'ratio_digits_url' 'tld' 'url' 'url_len' 'who_is'
 'js' 'js_len' 'js_ref' 'js_array_len_avg' 'js_array_len_max'
 'content_len' 'num_js_func_calls' 'malicious_func_count'
 'total_url_count' 'ext_url_count' 'num_semicolons' 'num_zeros'
 'num_spaces' 'num_hyphens' 'num_@s' 'num_queries' 'num_ampersands'
 'num_equals' 'domain' 'domain_len' 'img_mean' 'img_mean_red'
 'image_mobnet_0' 'image_mobnet_1' 'image_mobnet_2' 'image_mobnet_3'
 'image_mobnet_4' 'image_mobnet_5' 'image_mobnet_6' 'image_mobnet_7'
 'image_mobnet_8' 'image_mobnet_9' 'image_mobnet_10' 'image_mobnet_11'
 'image_mobnet_12' 'image_mobnet_13' 'image_mobnet_14' 'image_mobnet_15'
 'image_mobnet_16' 'image_mobnet_17' 'image_mobnet_18' 'image_mobnet_19'
 'dcd_color_1' 'dcd_color_2' 'dcd_color_3' 'dcd_color_4' 'dcd_color_5'
 'google_is_safe' 'ip_address' 'location' 'url_emb_me

## Machine Learning

### Summary Table

| Variation                   | Accuracy alone | Accuracy with variations above |
|-----------------------------|----------------|--------------------------------|
| Baseline (no embeddings)    | N/A            | 83.11%                         |
| Lexical                     | N/A            | <mark>75.48%</mark>            |
| Host-based                  | 65.70%         | <mark>80.02%</mark>            |
| Content-based               | 76.30%         | <mark>83.11%</mark>            |
| lngfrm all embs             | 76.38%         | 83.23%                         |
| lngfrm embs PCA 10          | 75.41%         | <mark>83.73%</mark>            |
| lngfrm embs PCA 25          | 75.24%         | 82.41%                         |
| lngfrm embs PCA 50          | 74.25%         | 82.09%                         |
| lngfrm feat selection       | 76.43%         | 83.29%                         |
| img all embs                | 60.27%         | 82.24%                         |
| img embs PCA 10             | 62.57%         | <mark>83.69%</mark>            |
| img embs PCA 25             | 61.73%         | 83.46%                         |
| img embs PCA 50             | 61.16%         | 82.82%                         |
| img feat selection          | 61.29%         | 83.62%                         |
| content all embeddings      | 67.63%         | 81.89%                         |
| content embs PCA 10         | 67.71%         | <mark>83.74%</mark>            |
| content embs PCA 25         | 68.11%         | 83.58%                         |
| content embs PCA 50         | 67.76%         | 83.06%                         |
| content feat selection      | 66.57%         | 83.59%                         |
| BERT url embs all           | 76.62%         | 82.47%                         |
| BERT url embs PCA 10        | 72.97%         | <mark>83.79%</mark>            |
| BERT url embs PCA 25        | 73.24%         | 83.57%                         |
| BERT url embs PCA 50        | 74.08%         | 82.99%                         |
| BERT url feat selection     | 75.12%         | 83.64%                         |
| Best Score (Feat selection) | N/A            | <mark>84.27%</mark>            |
| Best Score (PCA)            | N/A            | <mark>83.799%</mark>           |

### All Features without embeddings

In [4]:
# Baseline feature set: keep every column whose name contains none of the
# embedding-related substrings listed below.
baseline_feats = [
    column
    for column in df_surtur.columns
    if not any(marker in column for marker in ['emb', 'mobnet', 'img_mean', 'url_emb_mean'])
]

# Prints accuracy, the classification report and feature importances (see cell output).
make_classification(baseline_feats, df_surtur)



Accuracy_score = 0.8310740354535975


              precision    recall  f1-score   support

      benign     0.8182    0.9087    0.8611      6079
   malicious     0.8539    0.7255    0.7845      4470

    accuracy                         0.8311     10549
   macro avg     0.8361    0.8171    0.8228     10549
weighted avg     0.8333    0.8311    0.8286     10549

Feature                   Importance               
who_is                    12.321550%               
unique_url_nums           8.626608%                
unique_url_chars          6.183530%                
https                     4.710356%                
tld                       4.121293%                
number_subdomains         3.091990%                
url_len                   3.065580%                
num_hyphens               2.782290%                
having_@_in_url           2.776576%                
is_in_alexa               2.729821%                
dcd_color_1               2.676867%                
location   

### Lexical Features

In [5]:
# Columns appended to every feature list handed to make_classification below.
# NOTE(review): the name suggests make_classification removes these (label,
# raw text, identifiers) before fitting — confirm in preprocessing_ml_table.
feats_to_drop = [
    'label',
    'content',
    'hostname',
    'url',
    'js',
    'domain',
    'google_is_safe',
    'ip_address',
]

In [6]:
# Features derived from the URL / hostname string itself.
lexical_features = [
    'has_IP_in_url', 'having_@_in_url',
    'length_hostname', 'number_subdomains', 'number_underscores',
    'ratio_digits_hostname', 'ratio_digits_url',
    'tld', 'url_len',
    'num_semicolons', 'num_zeros', 'num_spaces', 'num_hyphens',
    'num_@s', 'num_queries', 'num_ampersands', 'num_equals',
    'domain_len', 'hex_len', 'has_hex',
    'unique_url_chars', 'unique_url_nums', 'unique_url_letters',
    'ratio_let_chars', 'ratio_nums_chars', 'num_?s',
]

# Evaluate the lexical features on their own.
make_classification([*lexical_features, *feats_to_drop], df_surtur)

Accuracy_score = 0.754763484690492


              precision    recall  f1-score   support

      benign     0.7482    0.8659    0.8027      6079
   malicious     0.7680    0.6036    0.6759      4470

    accuracy                         0.7548     10549
   macro avg     0.7581    0.7348    0.7393     10549
weighted avg     0.7566    0.7548    0.7490     10549

Feature                   Importance               
unique_url_nums           12.100916%               
has_hex                   9.470193%                
unique_url_chars          9.390506%                
tld                       7.404562%                
having_@_in_url           6.221763%                
number_subdomains         5.808054%                
num_hyphens               4.286741%                
number_underscores        4.279814%                
num_equals                4.208982%                
num_queries               3.669810%                
url_len                   3.467369%                
num_ampersan

### Host-based Features

In [7]:
# Host-based feature columns.
host_feats = [
    'https',
    'who_is',
    'is_in_alexa',
    'location',
]

# Host-based features alone ...
make_classification([*host_feats, *feats_to_drop], df_surtur)
# ... then stacked on top of the lexical features.
make_classification([*lexical_features, *host_feats, *feats_to_drop], df_surtur)

Accuracy_score = 0.6570291022845768


              precision    recall  f1-score   support

      benign     0.6615    0.8289    0.7358      6079
   malicious     0.6453    0.4233    0.5112      4470

    accuracy                         0.6570     10549
   macro avg     0.6534    0.6261    0.6235     10549
weighted avg     0.6547    0.6570    0.6407     10549

Feature                   Importance               
is_in_alexa               48.669788%               
location                  32.508582%               
https                     9.596092%                
who_is                    9.225541%                
Accuracy_score = 0.8002654280026543


              precision    recall  f1-score   support

      benign     0.7897    0.8906    0.8371      6079
   malicious     0.8199    0.6774    0.7419      4470

    accuracy                         0.8003     10549
   macro avg     0.8048    0.7840    0.7895     10549
weighted avg     0.8025    0.8003    0.7968     10549

Feature   

### Content-based Features

In [8]:
# Hand-crafted page-content features (JavaScript statistics, URL counts, colour descriptors).
content_feats = [
    'js_len', 'js_ref',
    'js_array_len_avg', 'js_array_len_max',
    'content_len',
    'num_js_func_calls', 'malicious_func_count',
    'total_url_count', 'ext_url_count',
    'dcd_color_1', 'dcd_color_2', 'dcd_color_3', 'dcd_color_4', 'dcd_color_5',
]

# Content-based features alone ...
make_classification([*content_feats, *feats_to_drop], df_surtur)
# ... then together with the lexical and host-based features.
make_classification(
    [*lexical_features, *host_feats, *feats_to_drop, *content_feats],
    df_surtur,
)

Accuracy_score = 0.7630107119158214


              precision    recall  f1-score   support

      benign     0.7628    0.8544    0.8060      6079
   malicious     0.7634    0.6387    0.6955      4470

    accuracy                         0.7630     10549
   macro avg     0.7631    0.7466    0.7508     10549
weighted avg     0.7630    0.7630    0.7592     10549

Feature                   Importance               
js_array_len_avg          11.331429%               
dcd_color_2               10.857553%               
js_len                    8.827004%                
dcd_color_1               8.539956%                
content_len               7.607314%                
num_js_func_calls         6.972174%                
js_array_len_max          6.771773%                
malicious_func_count      6.755401%                
ext_url_count             6.497730%                
total_url_count           6.315522%                
js_ref                    6.143576%                
dcd_color_5

## URL Embeddings from the Longformer

https://huggingface.co/transformers/model_doc/longformer.html

### All embeddings

In [9]:
# Read the URL embeddings, scale them, and wrap them in a frame whose columns
# carry the 'url_emb_all' prefix. scale_features and make_columns_for_embs are
# presumably provided by the star import of preprocessing_ml_table.
url_embeddings = pd.read_csv('./url_feats_sliding_window_longformer.csv')
scaled_embs = scale_features(url_embeddings)
url_embs_all, url_embs_all_columns = make_columns_for_embs(scaled_embs, 'url_emb_all')

# Remove any copy of these columns left by an earlier run of this cell, so that
# re-running it does not append duplicate columns to df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_all.columns, errors='ignore'), url_embs_all],
    axis=1,
)

In [10]:
# All URL embedding columns on their own.
make_classification([*url_embs_all_columns, *feats_to_drop], df_surtur)

# The same columns combined with the lexical, host-based and content-based features.
make_classification(
    [
        *url_embs_all_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
    ],
    df_surtur,
)



Accuracy_score = 0.7637690776376908


              precision    recall  f1-score   support

      benign     0.7567    0.8697    0.8093      6079
   malicious     0.7777    0.6197    0.6897      4470

    accuracy                         0.7638     10549
   macro avg     0.7672    0.7447    0.7495     10549
weighted avg     0.7656    0.7638    0.7586     10549

Feature                   Importance               
url_emb_all_set_35        10.234301%               
url_emb_all_set_38        7.581991%                
url_emb_all_set_34        6.264200%                
url_emb_all_set_58        5.973733%                
url_emb_all_set_73        5.761224%                
url_emb_all_set_1         4.896512%                
url_emb_all_set_51        4.789802%                
url_emb_all_set_41        3.610507%                
url_emb_all_set_55        3.062262%                
url_emb_all_set_15        2.868628%                
url_emb_all_set_86        2.426008%                
url_emb_all

### PCA 10

In [11]:
# Reduce the scaled URL embeddings (scaled_embs from the "All embeddings" cell
# of this section) to 10 components and name the resulting columns.
pca_10 = perform_pca(10, scaled_embs)
url_embs_pca_10, url_embs_pca_10_columns = make_columns_for_embs(pca_10, 'url_pca_10')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_10.columns, errors='ignore'), url_embs_pca_10],
    axis=1,
)

In [12]:
# 10-component URL PCA columns on their own.
make_classification([*url_embs_pca_10_columns, *feats_to_drop], df_surtur)

# The same columns combined with the lexical, host-based and content-based features.
make_classification(
    [
        *url_embs_pca_10_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
    ],
    df_surtur,
)



Accuracy_score = 0.7540999146838563


              precision    recall  f1-score   support

      benign     0.7439    0.8743    0.8038      6079
   malicious     0.7756    0.5906    0.6706      4470

    accuracy                         0.7541     10549
   macro avg     0.7597    0.7325    0.7372     10549
weighted avg     0.7573    0.7541    0.7474     10549

Feature                   Importance               
url_pca_10_set_1          19.987781%               
url_pca_10_set_0          19.625844%               
url_pca_10_set_3          16.408576%               
url_pca_10_set_5          11.944280%               
url_pca_10_set_4          7.012963%                
url_pca_10_set_2          5.749057%                
url_pca_10_set_6          5.516529%                
url_pca_10_set_7          4.862312%                
url_pca_10_set_9          4.692968%                
url_pca_10_set_8          4.199684%                
Accuracy_score = 0.8373305526590198


              precision  

### PCA 25

In [13]:
# Reduce the scaled URL embeddings (scaled_embs from the "All embeddings" cell
# of this section) to 25 components and name the resulting columns.
pca_25 = perform_pca(25, scaled_embs)
url_embs_pca_25, url_embs_pca_25_columns = make_columns_for_embs(pca_25, 'url_pca_25')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_25.columns, errors='ignore'), url_embs_pca_25],
    axis=1,
)

In [14]:
# 25-component URL PCA columns on their own.
make_classification([*url_embs_pca_25_columns, *feats_to_drop], df_surtur)

# The same columns combined with the lexical, host-based and content-based features.
make_classification(
    [
        *url_embs_pca_25_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
    ],
    df_surtur,
)



Accuracy_score = 0.7523935918096502


              precision    recall  f1-score   support

      benign     0.7493    0.8570    0.7996      6079
   malicious     0.7583    0.6101    0.6762      4470

    accuracy                         0.7524     10549
   macro avg     0.7538    0.7336    0.7379     10549
weighted avg     0.7531    0.7524    0.7473     10549

Feature                   Importance               
url_pca_25_set_1          13.183624%               
url_pca_25_set_0          13.147095%               
url_pca_25_set_3          11.504777%               
url_pca_25_set_5          7.844285%                
url_pca_25_set_4          4.256955%                
url_pca_25_set_17         3.353027%                
url_pca_25_set_2          3.316629%                
url_pca_25_set_6          3.142457%                
url_pca_25_set_20         3.083378%                
url_pca_25_set_16         2.752076%                
url_pca_25_set_21         2.577624%                
url_pca_25_

### PCA 50

In [15]:
# Reduce the scaled URL embeddings (scaled_embs from the "All embeddings" cell
# of this section) to 50 components and name the resulting columns.
pca_50 = perform_pca(50, scaled_embs)
url_embs_pca_50, url_embs_pca_50_columns = make_columns_for_embs(pca_50, 'url_pca_50')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_50.columns, errors='ignore'), url_embs_pca_50],
    axis=1,
)

In [16]:
# 50-component URL PCA columns on their own.
make_classification([*url_embs_pca_50_columns, *feats_to_drop], df_surtur)

# The same columns combined with the lexical, host-based and content-based features.
make_classification(
    [
        *url_embs_pca_50_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
    ],
    df_surtur,
)



Accuracy_score = 0.7425348374253484


              precision    recall  f1-score   support

      benign     0.7451    0.8409    0.7901      6079
   malicious     0.7378    0.6087    0.6671      4470

    accuracy                         0.7425     10549
   macro avg     0.7414    0.7248    0.7286     10549
weighted avg     0.7420    0.7425    0.7380     10549

Feature                   Importance               
url_pca_50_set_1          9.739801%                
url_pca_50_set_0          9.575329%                
url_pca_50_set_3          7.755319%                
url_pca_50_set_5          5.898405%                
url_pca_50_set_4          2.949202%                
url_pca_50_set_17         2.316203%                
url_pca_50_set_6          2.292814%                
url_pca_50_set_2          2.087091%                
url_pca_50_set_20         2.035731%                
url_pca_50_set_16         1.908725%                
url_pca_50_set_31         1.778187%                
url_pca_50_

### Feature Selection

In [17]:
# Pre-selected URL embedding columns already present in the loaded table:
# the two summary columns followed by url_emb_0 ... url_emb_19.
url_emb_feat_sel = ['url_emb_mean', 'url_emb_mean_red'] + [
    'url_emb_{}'.format(idx) for idx in range(20)
]

# Selected URL embedding columns on their own.
make_classification([*url_emb_feat_sel, *feats_to_drop], df_surtur)

# The same columns combined with the lexical, host-based and content-based features.
make_classification(
    [
        *url_emb_feat_sel,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
    ],
    df_surtur,
)

Accuracy_score = 0.7643378519290928


              precision    recall  f1-score   support

      benign     0.7532    0.8791    0.8113      6079
   malicious     0.7872    0.6083    0.6863      4470

    accuracy                         0.7643     10549
   macro avg     0.7702    0.7437    0.7488     10549
weighted avg     0.7676    0.7643    0.7583     10549

Feature                   Importance               
url_emb_mean_red          19.876653%               
url_emb_1                 18.713418%               
url_emb_mean              9.050211%                
url_emb_3                 3.517240%                
url_emb_8                 3.114260%                
url_emb_7                 3.106756%                
url_emb_4                 3.079773%                
url_emb_5                 3.053969%                
url_emb_2                 3.037693%                
url_emb_10                2.862465%                
url_emb_14                2.859021%                
url_emb_13 

### Best one: 

The best set to continue with is the PCA 10 components set

## Image Embeddings

### All embeddings

In [18]:
# Read the image embeddings, scale them, and wrap them in a frame whose columns
# carry the 'img_emb_all' prefix.
# NOTE: this rebinds scaled_embs; the image PCA cells that follow read it, so
# this cell must run before them.
img_embeddings = pd.read_csv('./img_features_adjusted.csv')
scaled_embs = scale_features(np.array(img_embeddings))
img_embs_all, img_embs_all_columns = make_columns_for_embs(scaled_embs, 'img_emb_all')

# Remove any copy of these columns left by an earlier run of this cell, so that
# re-running it does not append duplicate columns to df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_all.columns, errors='ignore'), img_embs_all],
    axis=1,
)

In [19]:
# All image embedding columns on their own.
make_classification([*img_embs_all_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the 10-component URL PCA set.
make_classification(
    [
        *img_embs_all_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.602711157455683


              precision    recall  f1-score   support

      benign     0.6214    0.7949    0.6975      6079
   malicious     0.5503    0.3414    0.4214      4470

    accuracy                         0.6027     10549
   macro avg     0.5859    0.5681    0.5594     10549
weighted avg     0.5913    0.6027    0.5805     10549

Feature                   Importance               
img_emb_all_set_700       5.541011%                
img_emb_all_set_143       5.250940%                
img_emb_all_set_323       1.392839%                
img_emb_all_set_300       0.434946%                
img_emb_all_set_421       0.273908%                
img_emb_all_set_110       0.198433%                
img_emb_all_set_987       0.184285%                
img_emb_all_set_278       0.159354%                
img_emb_all_set_715       0.141181%                
img_emb_all_set_889       0.137508%                
img_emb_all_set_688       0.125940%                
img_emb_all_

### PCA 10

In [20]:
# Reduce the scaled image embeddings (scaled_embs from the image
# "All embeddings" cell) to 10 components and name the resulting columns.
pca_10 = perform_pca(10, scaled_embs)
img_embs_pca_10, img_embs_pca_10_columns = make_columns_for_embs(pca_10, 'img_pca_10')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_10.columns, errors='ignore'), img_embs_pca_10],
    axis=1,
)

In [21]:
# 10-component image PCA columns on their own.
make_classification([*img_embs_pca_10_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the 10-component URL PCA set.
make_classification(
    [
        *img_embs_pca_10_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6256517205422315


              precision    recall  f1-score   support

      benign     0.6223    0.8914    0.7329      6079
   malicious     0.6415    0.2642    0.3743      4470

    accuracy                         0.6257     10549
   macro avg     0.6319    0.5778    0.5536     10549
weighted avg     0.6304    0.6257    0.5810     10549

Feature                   Importance               
img_pca_10_set_0          36.987925%               
img_pca_10_set_9          7.199889%                
img_pca_10_set_5          7.144085%                
img_pca_10_set_8          7.066698%                
img_pca_10_set_3          7.007945%                
img_pca_10_set_1          6.992283%                
img_pca_10_set_4          6.921063%                
img_pca_10_set_2          6.915239%                
img_pca_10_set_6          6.914473%                
img_pca_10_set_7          6.850401%                
Accuracy_score = 0.8369513697980852


              precision  

### PCA 25

In [22]:
# Reduce the scaled image embeddings (scaled_embs from the image
# "All embeddings" cell) to 25 components and name the resulting columns.
pca_25 = perform_pca(25, scaled_embs)
img_embs_pca_25, img_embs_pca_25_columns = make_columns_for_embs(pca_25, 'img_pca_25')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_25.columns, errors='ignore'), img_embs_pca_25],
    axis=1,
)

In [23]:
# 25-component image PCA columns on their own.
make_classification([*img_embs_pca_25_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the 10-component URL PCA set.
make_classification(
    [
        *img_embs_pca_25_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6173096976016684


              precision    recall  f1-score   support

      benign     0.6208    0.8631    0.7222      6079
   malicious     0.6032    0.2830    0.3853      4470

    accuracy                         0.6173     10549
   macro avg     0.6120    0.5731    0.5537     10549
weighted avg     0.6134    0.6173    0.5794     10549

Feature                   Importance               
img_pca_25_set_0          26.205313%               
img_pca_25_set_3          3.261687%                
img_pca_25_set_5          3.257721%                
img_pca_25_set_13         3.157325%                
img_pca_25_set_15         3.157235%                
img_pca_25_set_8          3.147843%                
img_pca_25_set_17         3.146121%                
img_pca_25_set_4          3.127183%                
img_pca_25_set_6          3.116515%                
img_pca_25_set_18         3.097741%                
img_pca_25_set_21         3.095918%                
img_pca_25_

### PCA 50

In [24]:
# Reduce the scaled image embeddings (scaled_embs from the image
# "All embeddings" cell) to 50 components and name the resulting columns.
pca_50 = perform_pca(50, scaled_embs)
img_embs_pca_50, img_embs_pca_50_columns = make_columns_for_embs(pca_50, 'img_pca_50')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_50.columns, errors='ignore'), img_embs_pca_50],
    axis=1,
)

In [25]:
# 50-component image PCA columns on their own.
make_classification([*img_embs_pca_50_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the 10-component URL PCA set.
make_classification(
    [
        *img_embs_pca_50_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6116219546876481


              precision    recall  f1-score   support

      benign     0.6208    0.8380    0.7132      6079
   malicious     0.5796    0.3038    0.3986      4470

    accuracy                         0.6116     10549
   macro avg     0.6002    0.5709    0.5559     10549
weighted avg     0.6033    0.6116    0.5799     10549

Feature                   Importance               
img_pca_50_set_0          19.193351%               
img_pca_50_set_44         1.862078%                
img_pca_50_set_4          1.781436%                
img_pca_50_set_29         1.741236%                
img_pca_50_set_34         1.718133%                
img_pca_50_set_35         1.712518%                
img_pca_50_set_18         1.701758%                
img_pca_50_set_12         1.700862%                
img_pca_50_set_16         1.698058%                
img_pca_50_set_43         1.690992%                
img_pca_50_set_8          1.690651%                
img_pca_50_

### Feature Selection

In [26]:
# Pre-selected image columns already present in the loaded table:
# the two summary columns followed by image_mobnet_0 ... image_mobnet_19.
img_mobnet_embeddings = ['img_mean', 'img_mean_red'] + [
    'image_mobnet_{}'.format(idx) for idx in range(20)
]

# Selected image columns on their own.
make_classification([*img_mobnet_embeddings, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the 10-component URL PCA set.
make_classification(
    [
        *img_mobnet_embeddings,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
    ],
    df_surtur,
)

Accuracy_score = 0.6128542989856859


              precision    recall  f1-score   support

      benign     0.6194    0.8510    0.7170      6079
   malicious     0.5878    0.2890    0.3875      4470

    accuracy                         0.6129     10549
   macro avg     0.6036    0.5700    0.5523     10549
weighted avg     0.6060    0.6129    0.5774     10549

Feature                   Importance               
img_mean_red              15.427394%               
img_mean                  12.161608%               
image_mobnet_17           4.936055%                
image_mobnet_14           3.774932%                
image_mobnet_10           3.752293%                
image_mobnet_15           3.655128%                
image_mobnet_16           3.629029%                
image_mobnet_11           3.626671%                
image_mobnet_8            3.581095%                
image_mobnet_3            3.567450%                
image_mobnet_19           3.534382%                
image_mobne

### Best one: 

The best one is once again PCA with 10 components

## Content Embeddings

### All embeddings

In [27]:
# Read the content embeddings, scale them, and wrap them in a frame whose
# columns carry the 'content_emb_all' prefix.
# NOTE: this rebinds scaled_embs; the content PCA cells that follow read it, so
# this cell must run before them.
content_embeddings = pd.read_csv('./content_feats_code_bert.csv')
scaled_embs = scale_features(np.array(content_embeddings))
content_embs_all, content_embs_all_columns = make_columns_for_embs(scaled_embs, 'content_emb_all')

# Remove any copy of these columns left by an earlier run of this cell, so that
# re-running it does not append duplicate columns to df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_all.columns, errors='ignore'), content_embs_all],
    axis=1,
)

In [28]:
# All content embedding columns on their own.
make_classification([*content_embs_all_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the
# 10-component URL and image PCA sets.
make_classification(
    [
        *content_embs_all_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
        *img_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6762726324770121


              precision    recall  f1-score   support

      benign     0.6686    0.8691    0.7557      6079
   malicious     0.6993    0.4141    0.5202      4470

    accuracy                         0.6763     10549
   macro avg     0.6839    0.6416    0.6380     10549
weighted avg     0.6816    0.6763    0.6559     10549

Feature                   Importance               
content_emb_all_set_4     3.781461%                
content_emb_all_set_504   2.106040%                
content_emb_all_set_483   1.152502%                
content_emb_all_set_399   0.999686%                
content_emb_all_set_39    0.752665%                
content_emb_all_set_356   0.642331%                
content_emb_all_set_5     0.606212%                
content_emb_all_set_8     0.556717%                
content_emb_all_set_469   0.529376%                
content_emb_all_set_52    0.489620%                
content_emb_all_set_472   0.471283%                
content_emb

### PCA 10

In [29]:
# Reduce the scaled content embeddings (scaled_embs from the content
# "All embeddings" cell) to 10 components and name the resulting columns.
pca_10 = perform_pca(10, scaled_embs)
content_embs_pca_10, content_embs_pca_10_columns = make_columns_for_embs(pca_10, 'content_pca_10')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_10.columns, errors='ignore'), content_embs_pca_10],
    axis=1,
)

In [30]:
# 10-component content PCA columns on their own.
make_classification([*content_embs_pca_10_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the
# 10-component URL and image PCA sets.
make_classification(
    [
        *content_embs_pca_10_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
        *img_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6771257939141151


              precision    recall  f1-score   support

      benign     0.6578    0.9163    0.7658      6079
   malicious     0.7555    0.3519    0.4802      4470

    accuracy                         0.6771     10549
   macro avg     0.7067    0.6341    0.6230     10549
weighted avg     0.6992    0.6771    0.6448     10549

Feature                   Importance               
content_pca_10_set_8      18.728861%               
content_pca_10_set_5      10.111886%               
content_pca_10_set_2      10.019848%               
content_pca_10_set_0      9.942704%                
content_pca_10_set_9      9.700941%                
content_pca_10_set_3      9.301306%                
content_pca_10_set_4      9.283711%                
content_pca_10_set_1      8.199228%                
content_pca_10_set_6      7.679328%                
content_pca_10_set_7      7.032177%                
Accuracy_score = 0.8374253483742535


              precision  

### PCA 25

In [31]:
# Reduce the scaled content embeddings (scaled_embs from the content
# "All embeddings" cell) to 25 components and name the resulting columns.
pca_25 = perform_pca(25, scaled_embs)
content_embs_pca_25, content_embs_pca_25_columns = make_columns_for_embs(pca_25, 'content_pca_25')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_25.columns, errors='ignore'), content_embs_pca_25],
    axis=1,
)

In [32]:
# 25-component content PCA columns on their own.
make_classification([*content_embs_pca_25_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the
# 10-component URL and image PCA sets.
make_classification(
    [
        *content_embs_pca_25_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
        *img_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6811072139539293


              precision    recall  f1-score   support

      benign     0.6654    0.8985    0.7646      6079
   malicious     0.7363    0.3855    0.5060      4470

    accuracy                         0.6811     10549
   macro avg     0.7008    0.6420    0.6353     10549
weighted avg     0.6954    0.6811    0.6550     10549

Feature                   Importance               
content_pca_25_set_8      10.803020%               
content_pca_25_set_15     5.423287%                
content_pca_25_set_0      5.222743%                
content_pca_25_set_19     4.987109%                
content_pca_25_set_3      4.771776%                
content_pca_25_set_10     4.139826%                
content_pca_25_set_5      4.131318%                
content_pca_25_set_7      4.110887%                
content_pca_25_set_18     3.800177%                
content_pca_25_set_17     3.695083%                
content_pca_25_set_2      3.662900%                
content_pca

### PCA 50

In [33]:
# Reduce the scaled content embeddings (scaled_embs from the content
# "All embeddings" cell) to 50 components and name the resulting columns.
pca_50 = perform_pca(50, scaled_embs)
content_embs_pca_50, content_embs_pca_50_columns = make_columns_for_embs(pca_50, 'content_pca_50')

# Drop columns from a previous run first so the cell can be re-executed
# without duplicating them in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_50.columns, errors='ignore'), content_embs_pca_50],
    axis=1,
)

In [34]:
# 50-component content PCA columns on their own.
make_classification([*content_embs_pca_50_columns, *feats_to_drop], df_surtur)

# The same columns combined with the hand-crafted features and the
# 10-component URL and image PCA sets.
make_classification(
    [
        *content_embs_pca_50_columns,
        *feats_to_drop,
        *lexical_features,
        *host_feats,
        *content_feats,
        *url_embs_pca_10_columns,
        *img_embs_pca_10_columns,
    ],
    df_surtur,
)



Accuracy_score = 0.6775997724902835


              precision    recall  f1-score   support

      benign     0.6648    0.8885    0.7605      6079
   malicious     0.7204    0.3908    0.5067      4470

    accuracy                         0.6776     10549
   macro avg     0.6926    0.6396    0.6336     10549
weighted avg     0.6884    0.6776    0.6530     10549

Feature                   Importance               
content_pca_50_set_8      8.957878%                
content_pca_50_set_19     3.960598%                
content_pca_50_set_15     3.270784%                
content_pca_50_set_3      2.773380%                
content_pca_50_set_0      2.753561%                
content_pca_50_set_5      2.519165%                
content_pca_50_set_18     2.386509%                
content_pca_50_set_25     2.193217%                
content_pca_50_set_2      2.156175%                
content_pca_50_set_42     2.129930%                
content_pca_50_set_10     2.128766%                
content_pca

## Feature Selection

In [35]:
# Names of the 20 raw content-embedding columns: content_emb_0 .. content_emb_19.
content_embeddings = ['content_emb_{}'.format(idx) for idx in range(20)]

# Run 1: the raw content embeddings on their own (plus feats_to_drop).
raw_content_only_feats = content_embeddings + feats_to_drop
make_classification(raw_content_only_feats, df_surtur)

# Run 2: the raw content embeddings together with the other feature groups.
raw_content_combined_feats = (
    url_embs_pca_10_columns + img_embs_pca_10_columns
    + feats_to_drop + lexical_features + host_feats
    + content_feats + content_embeddings
)
make_classification(raw_content_combined_feats, df_surtur)

Accuracy_score = 0.6656555123708409


              precision    recall  f1-score   support

      benign     0.6535    0.8937    0.7550      6079
   malicious     0.7110    0.3555    0.4740      4470

    accuracy                         0.6657     10549
   macro avg     0.6822    0.6246    0.6145     10549
weighted avg     0.6778    0.6657    0.6359     10549

Feature                   Importance               
content_emb_19            13.811943%               
content_emb_2             7.238602%                
content_emb_13            5.884395%                
content_emb_12            5.725258%                
content_emb_10            5.571158%                
content_emb_0             4.946371%                
content_emb_18            4.665475%                
content_emb_3             4.563457%                
content_emb_6             4.528358%                
content_emb_8             4.362756%                
content_emb_9             4.191156%                
content_emb

### Best one:

Once again, PCA with 10 components gives the best result.

### DCD

In [36]:
# The five DCD colour columns: dcd_color_1 .. dcd_color_5.
dcd_feats = ['dcd_color_{}'.format(idx) for idx in range(1, 6)]

# Classify using only the DCD columns (plus feats_to_drop).
dcd_only_feats = dcd_feats + feats_to_drop
make_classification(dcd_only_feats, df_surtur)

Accuracy_score = 0.6938098397952412


              precision    recall  f1-score   support

      benign     0.7031    0.8112    0.7533      6079
   malicious     0.6753    0.5342    0.5966      4470

    accuracy                         0.6938     10549
   macro avg     0.6892    0.6727    0.6749     10549
weighted avg     0.6913    0.6938    0.6869     10549

Feature                   Importance               
dcd_color_2               29.230213%               
dcd_color_1               25.545219%               
dcd_color_3               15.549251%               
dcd_color_5               14.965314%               
dcd_color_4               14.710005%               


## Additional embeddings from BERT

### All embeddings

In [37]:
# Load the BERT URL embeddings from CSV, scale them, and expose every scaled
# dimension as a 'bert_url_emb_all'-prefixed column.
# NOTE: this rebinds `scaled_embs`; the BERT PCA cells below read that name, so
# it is kept as-is even though it replaces whatever an earlier section stored there.
# NOTE(review): pd.concat(axis=1) aligns on the index, so the CSV rows must
# line up with df_surtur's rows - confirm.
bert_url_embeddings = pd.read_csv('./url_feats_sliding_window.csv')
scaled_embs = scale_features(np.array(bert_url_embeddings))
bert_url_embs_all, bert_url_embs_all_columns = make_columns_for_embs(scaled_embs, 'bert_url_emb_all')

# Drop any copy of these columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_all.columns, errors='ignore'), bert_url_embs_all],
    axis=1,
)

In [38]:
# Run 1: all scaled BERT URL embedding columns on their own (plus feats_to_drop).
bert_all_only_feats = bert_url_embs_all_columns + feats_to_drop
make_classification(bert_all_only_feats, df_surtur)

# Run 2: the same columns together with the content, lexical, host, and the
# 10-component URL / image / content PCA feature groups.
bert_all_combined_feats = (
    bert_url_embs_all_columns + content_feats + feats_to_drop + lexical_features
    + host_feats + url_embs_pca_10_columns + img_embs_pca_10_columns
    + content_embs_pca_10_columns
)
make_classification(bert_all_combined_feats, df_surtur)



Accuracy_score = 0.7662337662337663


              precision    recall  f1-score   support

      benign     0.7715    0.8444    0.8063      6079
   malicious     0.7572    0.6600    0.7052      4470

    accuracy                         0.7662     10549
   macro avg     0.7644    0.7522    0.7558     10549
weighted avg     0.7655    0.7662    0.7635     10549

Feature                   Importance               
bert_url_emb_all_set_1    3.494097%                
bert_url_emb_all_set_371  3.481615%                
bert_url_emb_all_set_312  1.815502%                
bert_url_emb_all_set_311  1.334491%                
bert_url_emb_all_set_33   1.329478%                
bert_url_emb_all_set_193  1.231988%                
bert_url_emb_all_set_456  1.176592%                
bert_url_emb_all_set_370  0.949943%                
bert_url_emb_all_set_17   0.940331%                
bert_url_emb_all_set_457  0.875717%                
bert_url_emb_all_set_2    0.767526%                
bert_url_em

### PCA 10

In [39]:
# Reduce `scaled_embs` with perform_pca(10, ...) and expose the result as
# 'bert_url_pca_10'-prefixed columns.
# NOTE: relies on `scaled_embs` still holding the scaled BERT URL embeddings
# bound by the BERT loading cell; `pca_10` is rebound here.
pca_10 = perform_pca(10, scaled_embs)
bert_url_embs_pca_10, bert_url_embs_pca_10_columns = make_columns_for_embs(pca_10, 'bert_url_pca_10')

# Drop any copy of these columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_10.columns, errors='ignore'), bert_url_embs_pca_10],
    axis=1,
)

In [40]:
# Run 1: only the 10-component BERT URL PCA columns (plus feats_to_drop).
bert_pca_10_only_feats = bert_url_embs_pca_10_columns + feats_to_drop
make_classification(bert_pca_10_only_feats, df_surtur)

# Run 2: the same columns together with the lexical, host, content, and the
# 10-component URL / image / content PCA feature groups.
bert_pca_10_combined_feats = (
    bert_url_embs_pca_10_columns + feats_to_drop + lexical_features
    + host_feats + content_feats
    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns
)
make_classification(bert_pca_10_combined_feats, df_surtur)



Accuracy_score = 0.7297374158688027


              precision    recall  f1-score   support

      benign     0.7425    0.8130    0.7761      6079
   malicious     0.7079    0.6166    0.6591      4470

    accuracy                         0.7297     10549
   macro avg     0.7252    0.7148    0.7176     10549
weighted avg     0.7278    0.7297    0.7265     10549

Feature                   Importance               
bert_url_pca_10_set_0     26.504382%               
bert_url_pca_10_set_4     11.467667%               
bert_url_pca_10_set_2     10.220165%               
bert_url_pca_10_set_6     8.486682%                
bert_url_pca_10_set_7     8.413029%                
bert_url_pca_10_set_8     7.729834%                
bert_url_pca_10_set_1     7.340953%                
bert_url_pca_10_set_9     7.216866%                
bert_url_pca_10_set_3     6.564174%                
bert_url_pca_10_set_5     6.056245%                
Accuracy_score = 0.8379941226656555


              precision  

### PCA 25

In [41]:
# Reduce `scaled_embs` with perform_pca(25, ...) and expose the result as
# 'bert_url_pca_25'-prefixed columns.
# NOTE: relies on `scaled_embs` still holding the scaled BERT URL embeddings
# bound by the BERT loading cell; `pca_25` is rebound here.
pca_25 = perform_pca(25, scaled_embs)
bert_url_embs_pca_25, bert_url_embs_pca_25_columns = make_columns_for_embs(pca_25, 'bert_url_pca_25')

# Drop any copy of these columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_25.columns, errors='ignore'), bert_url_embs_pca_25],
    axis=1,
)

In [42]:
# Run 1: only the 25-component BERT URL PCA columns (plus feats_to_drop).
bert_pca_25_only_feats = bert_url_embs_pca_25_columns + feats_to_drop
make_classification(bert_pca_25_only_feats, df_surtur)

# Run 2: the same columns together with the lexical, host, content, and the
# 10-component URL / image / content PCA feature groups.
bert_pca_25_combined_feats = (
    bert_url_embs_pca_25_columns + feats_to_drop + lexical_features
    + host_feats + content_feats
    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns
)
make_classification(bert_pca_25_combined_feats, df_surtur)



Accuracy_score = 0.7323916958953456


              precision    recall  f1-score   support

      benign     0.7460    0.8121    0.7777      6079
   malicious     0.7095    0.6239    0.6640      4470

    accuracy                         0.7324     10549
   macro avg     0.7277    0.7180    0.7208     10549
weighted avg     0.7305    0.7324    0.7295     10549

Feature                   Importance               
bert_url_pca_25_set_0     17.310049%               
bert_url_pca_25_set_4     6.658917%                
bert_url_pca_25_set_2     5.347259%                
bert_url_pca_25_set_6     4.457737%                
bert_url_pca_25_set_12    4.183405%                
bert_url_pca_25_set_19    3.793959%                
bert_url_pca_25_set_20    3.677570%                
bert_url_pca_25_set_1     3.624072%                
bert_url_pca_25_set_9     3.513856%                
bert_url_pca_25_set_23    3.468121%                
bert_url_pca_25_set_3     3.459768%                
bert_url_pc

### PCA 50

In [43]:
# Reduce `scaled_embs` with perform_pca(50, ...) and expose the result as
# 'bert_url_pca_50'-prefixed columns.
# NOTE: relies on `scaled_embs` still holding the scaled BERT URL embeddings
# bound by the BERT loading cell; `pca_50` is rebound here.
pca_50 = perform_pca(50, scaled_embs)
bert_url_embs_pca_50, bert_url_embs_pca_50_columns = make_columns_for_embs(pca_50, 'bert_url_pca_50')

# Drop any copy of these columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_50.columns, errors='ignore'), bert_url_embs_pca_50],
    axis=1,
)

In [44]:
# Run 1: only the 50-component BERT URL PCA columns (plus feats_to_drop).
bert_pca_50_only_feats = bert_url_embs_pca_50_columns + feats_to_drop
make_classification(bert_pca_50_only_feats, df_surtur)

# Run 2: the same columns together with the lexical, host, content, and the
# 10-component URL / image / content PCA feature groups.
bert_pca_50_combined_feats = (
    bert_url_embs_pca_50_columns + feats_to_drop + lexical_features
    + host_feats + content_feats
    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns
)
make_classification(bert_pca_50_combined_feats, df_surtur)



Accuracy_score = 0.7408285145511423


              precision    recall  f1-score   support

      benign     0.7517    0.8217    0.7851      6079
   malicious     0.7223    0.6309    0.6735      4470

    accuracy                         0.7408     10549
   macro avg     0.7370    0.7263    0.7293     10549
weighted avg     0.7393    0.7408    0.7378     10549

Feature                   Importance               
bert_url_pca_50_set_0     12.435620%               
bert_url_pca_50_set_29    7.409844%                
bert_url_pca_50_set_26    6.759337%                
bert_url_pca_50_set_2     3.176380%                
bert_url_pca_50_set_4     2.828107%                
bert_url_pca_50_set_3     2.440602%                
bert_url_pca_50_set_1     2.250203%                
bert_url_pca_50_set_20    2.224125%                
bert_url_pca_50_set_25    2.027822%                
bert_url_pca_50_set_47    2.017964%                
bert_url_pca_50_set_17    1.949148%                
bert_url_pc

### Feature Selection

In [59]:
# Run feature_selection(20, ...) on the freshly re-scaled BERT URL embeddings;
# the result is a frame of selected columns tagged with the 'url_bert' prefix.
# NOTE(review): df_surtur is passed in whole; if the selector reads labels from
# it, selection sees every row before any train/test split - confirm that this
# is intended.
feat_selected_bert = feature_selection(20, scale_features(np.array(bert_url_embeddings)), 'url_bert', df_surtur)

# Drop any copy of the selected columns left by a previous run of this cell, so
# that re-executing it replaces them instead of appending duplicate labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=feat_selected_bert.columns, errors='ignore'), feat_selected_bert],
    axis=1,
)

# Row-wise means over all scaled BERT columns and over the selected subset.
# `.values` assigns positionally, bypassing index alignment.
df_surtur['url_emb_mean_bert'] = bert_url_embs_all.mean(axis=1).values
df_surtur['url_emb_mean_red_bert'] = feat_selected_bert.mean(axis=1).values

100%|██████████| 20/20 [00:00<00:00, 160701.30it/s]


In [63]:
# The two mean columns followed by url_bert_emb_20 .. url_bert_emb_39.
url_bert_emb_feats_bert = ['url_emb_mean_bert', 'url_emb_mean_red_bert'] + [
    'url_bert_emb_{}'.format(idx) for idx in range(20, 40)
]

# Run 1: the BERT-derived columns on their own (plus feats_to_drop).
bert_selected_only_feats = url_bert_emb_feats_bert + feats_to_drop
make_classification(bert_selected_only_feats, df_surtur)

# Run 2: the BERT-derived columns together with the lexical, host, content, and
# the 10-component URL / image / content PCA feature groups.
bert_selected_combined_feats = (
    feats_to_drop + lexical_features + host_feats
    + content_feats + url_bert_emb_feats_bert + url_embs_pca_10_columns
    + img_embs_pca_10_columns + content_embs_pca_10_columns
)
make_classification(bert_selected_combined_feats, df_surtur)

Accuracy_score = 0.7511612475116125


              precision    recall  f1-score   support

      benign     0.7529    0.8457    0.7966      6079
   malicious     0.7479    0.6226    0.6795      4470

    accuracy                         0.7512     10549
   macro avg     0.7504    0.7341    0.7381     10549
weighted avg     0.7508    0.7512    0.7470     10549

Feature                   Importance               
url_bert_emb_20           19.957052%               
url_bert_emb_29           12.471469%               
url_emb_mean_red_bert     6.176529%                
url_emb_mean_bert         5.323802%                
url_bert_emb_24           4.307182%                
url_bert_emb_34           4.020206%                
url_bert_emb_33           3.940124%                
url_bert_emb_21           3.879892%                
url_bert_emb_27           3.556989%                
url_bert_emb_35           3.455467%                
url_bert_emb_22           3.353843%                
url_bert_em

## Best Score using feature selection

In [66]:
# Hand-picked image columns used in this run.
best_img_feats = ['img_mean', 'img_mean_red', 'image_mobnet_9']
# Hand-picked URL-embedding columns (original and BERT-based) used in this run.
best_url_feats = ['url_emb_mean', 'url_emb_mean_red', 'url_emb_1',
                  'url_emb_mean_bert', 'url_emb_mean_red_bert', 'url_bert_emb_20']

# Lexical, host and content groups plus the hand-picked embedding columns.
best_selected_feats = (
    feats_to_drop + lexical_features + host_feats
    + content_feats + best_img_feats
    + best_url_feats
)
make_classification(best_selected_feats, df_surtur)

Accuracy_score = 0.8427339084273391


              precision    recall  f1-score   support

      benign     0.8290    0.9161    0.8704      6079
   malicious     0.8669    0.7430    0.8001      4470

    accuracy                         0.8427     10549
   macro avg     0.8479    0.8295    0.8353     10549
weighted avg     0.8450    0.8427    0.8406     10549

Feature                   Importance               
url_bert_emb_20           5.510643%                
url_emb_mean_red          5.413893%                
url_emb_1                 5.160495%                
num_ampersands            5.007708%                
https                     4.076918%                
url_emb_mean_bert         3.796947%                
is_in_alexa               3.084218%                
who_is                    2.945010%                
tld                       2.922234%                
number_subdomains         2.876776%                
js_len                    2.760551%                
location   

## Best Score with PCA

In [67]:
# All feature groups, with every embedding family represented by its
# 10-component PCA columns (BERT URL, URL, image, content).
best_pca_feats = (
    bert_url_embs_pca_10_columns + feats_to_drop + lexical_features
    + host_feats + content_feats
    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns
)
make_classification(best_pca_feats, df_surtur)

Accuracy_score = 0.8379941226656555


              precision    recall  f1-score   support

      benign     0.8266    0.9097    0.8662      6079
   malicious     0.8577    0.7405    0.7948      4470

    accuracy                         0.8380     10549
   macro avg     0.8422    0.8251    0.8305     10549
weighted avg     0.8398    0.8380    0.8359     10549

Feature                   Importance               
bert_url_pca_10_set_0     5.976950%                
img_pca_10_set_0          5.881989%                
num_ampersands            3.968403%                
https                     3.735266%                
num_semicolons            3.449666%                
url_pca_10_set_1          3.358796%                
who_is                    3.185130%                
url_pca_10_set_3          3.129755%                
tld                       2.666849%                
num_hyphens               2.602678%                
url_pca_10_set_5          2.336108%                
num_equals 