In [1]:
import pandas as pd 
import numpy as np
import sys
from tqdm import tqdm

from preprocessing_ml_table import *

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier

# Raise NumPy's summarisation threshold to the maximum so that printed arrays
# (e.g. the column listing further down) are shown in full rather than elided with '...'.
np.set_printoptions(threshold=sys.maxsize)

## Reading the Dataset

In [2]:
# Main feature table of the notebook; later sections append their embedding columns to it.
SURTUR_CSV_PATH = './datasets_of_interest/super_final_dataset_surtur.csv'
df_surtur = pd.read_csv(SURTUR_CSV_PATH)

# Quick look at the first rows.
df_surtur.head()

Unnamed: 0,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,...,content_emb_12,content_emb_13,content_emb_14,content_emb_15,content_emb_16,content_emb_17,content_emb_18,content_emb_19,is_in_alexa,num_?s
0,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,00.124.324.77.00.opteamevent.hu,0,1,31.0,6.0,0.0,0.387097,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0
1,�MQTH1TDa1��fl�hF1bVOFESLCnBbRI9MRTH1PDa1SP...,0.0,0.0,00005ik.rcomhost.com,0,1,20.0,1.0,0.0,0.25,...,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.044159,0,0
2,"<!DOCTYPE html>\n<html data-adblockkey=""MFwwDQ...",0.0,0.0,000098.ihostfull.com,0,1,20.0,1.0,0.0,0.3,...,0.046553,0.048641,0.048712,0.047349,0.048684,0.047914,0.048007,0.049129,0,0
3,"<html>\n<head>\n<meta content=""noarchive"" name...",0.0,0.0,000p6vl.wcomhost.com,0,1,20.0,1.0,0.0,0.2,...,0.047654,0.04767,0.047664,0.047662,0.04767,0.047674,0.04766,0.047667,1,0
4,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,001.002.003.23.opteamevent.hu,0,1,29.0,5.0,0.0,0.37931,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0


In [3]:
# List every column name of the feature table as a NumPy array
# (shown untruncated because of the print options set in the first cell).
column_names = np.asarray(df_surtur.columns)
print(column_names)

['content' 'has_IP_in_url' 'having_@_in_url' 'hostname' 'https' 'label'
 'length_hostname' 'number_subdomains' 'number_underscores'
 'ratio_digits_hostname' 'ratio_digits_url' 'tld' 'url' 'url_len' 'who_is'
 'js' 'js_len' 'js_ref' 'js_array_len_avg' 'js_array_len_max'
 'content_len' 'num_js_func_calls' 'malicious_func_count'
 'total_url_count' 'ext_url_count' 'num_semicolons' 'num_zeros'
 'num_spaces' 'num_hyphens' 'num_@s' 'num_queries' 'num_ampersands'
 'num_equals' 'domain' 'domain_len' 'img_mean' 'img_mean_red'
 'image_mobnet_0' 'image_mobnet_1' 'image_mobnet_2' 'image_mobnet_3'
 'image_mobnet_4' 'image_mobnet_5' 'image_mobnet_6' 'image_mobnet_7'
 'image_mobnet_8' 'image_mobnet_9' 'image_mobnet_10' 'image_mobnet_11'
 'image_mobnet_12' 'image_mobnet_13' 'image_mobnet_14' 'image_mobnet_15'
 'image_mobnet_16' 'image_mobnet_17' 'image_mobnet_18' 'image_mobnet_19'
 'dcd_color_1' 'dcd_color_2' 'dcd_color_3' 'dcd_color_4' 'dcd_color_5'
 'google_is_safe' 'ip_address' 'location' 'url_emb_me

## Machine Learning

### Summary Table

| Variation                   | Accuracy alone | Accuracy with variations above |
|-----------------------------|----------------|--------------------------------|
| Baseline (no embeddings)    | N/A            | 83.11%                         |
| Lexical                     | N/A            | <mark>75.48%</mark>            |
| Host-based                  | 65.70%         | <mark>80.02%</mark>            |
| Content-based               | 76.30%         | <mark>83.11%</mark>            |
| Longformer all embs         | 76.38%         | 83.23%                         |
| Longformer embs PCA 10      | 75.41%         | <mark>83.73%</mark>            |
| Longformer embs PCA 25      | 75.24%         | 82.41%                         |
| Longformer embs PCA 50      | 75.25%         | 82.09%                         |
| Longformer feat selection   | 76.43%         | 83.29%                         |
| img all embs                | 60.27%         | 82.24%                         |
| img embs PCA 10             | 62.57%         | <mark>83.69%</mark>            |
| img embs PCA 25             | 61.73%         | 83.46%                         |
| img embs PCA 50             | 61.16%         | 82.82%                         |
| img feat selection          | 61.29%         | 83.62%                         |
| content all embeddings      | 67.63%         | 81.89%                         |
| content embs PCA 10         | 67.71%         | <mark>83.74%</mark>            |
| content embs PCA 25         | 68.11%         | 83.58%                         |
| content embs PCA 50         | 67.76%         | 83.06%                         |
| content feat selection      | 66.57%         | 83.59%                         |
| BERT url embs all           | 76.62%         | 82.47%                         |
| BERT url embs PCA 10        | 72.97%         | <mark>83.79%</mark>            |
| BERT url embs PCA 25        | 73.24%         | 83.57%                         |
| BERT url embs PCA 50        | 74.08%         | 82.99%                         |
| BERT url feat selection     | 75.12%         | 83.64%                         |
| Best Score (Feat selection) | N/A            | <mark>84.27%</mark>            |
| Best Score (PCA)            | N/A            | <mark>83.79%</mark>            |

### All Features without embeddings

In [4]:
# Baseline: every column whose name contains none of the embedding-related markers.
embedding_markers = ['emb', 'mobnet', 'img_mean', 'url_emb_mean']
baseline_feats = [
    column
    for column in df_surtur.columns
    if not any(marker in column for marker in embedding_markers)
]

# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'RandomizedSearchCV' object has no attribute 'feature_importances_'".
# It appears after the classification report, so it presumably comes from the
# feature-importance step inside make_classification when tuning=True - confirm and
# fix it in preprocessing_ml_table.
make_classification(baseline_feats, df_surtur, tuning=True)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:    9.3s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.8min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 119.2min finished


Train Accuracy_score = 0.8900522457234348
Test Accuracy_score = 0.8319271968907005


              precision    recall  f1-score   support

      benign     0.8215    0.9049    0.8612      6079
   malicious     0.8500    0.7327    0.7870      4470

    accuracy                         0.8319     10549
   macro avg     0.8358    0.8188    0.8241     10549
weighted avg     0.8336    0.8319    0.8298     10549

best_estimator = {'tree_method': 'approx', 'subsample': 0.8, 'n_estimators': 120, 'min_child_weight': 10, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0.01, 'colsample_bytree': 0.6, 'booster': 'dart'}


AttributeError: 'RandomizedSearchCV' object has no attribute 'feature_importances_'

### Lexical Features

In [4]:
# Columns appended to every feature list handed to make_classification below.
# NOTE(review): presumably the helper needs them to be present (e.g. 'label') and removes
# them from the model inputs, as the name suggests - confirm in preprocessing_ml_table.
feats_to_drop = [
    'label',
    'content',
    'hostname',
    'url',
    'js',
    'domain',
    'google_is_safe',
    'ip_address',
]

In [5]:
# Features computed from the URL string itself (order kept exactly as before).
lexical_features = [
    'has_IP_in_url', 'having_@_in_url',
    'length_hostname', 'number_subdomains', 'number_underscores',
    'ratio_digits_hostname', 'ratio_digits_url',
    'tld', 'url_len',
    'num_semicolons', 'num_zeros', 'num_spaces', 'num_hyphens', 'num_@s',
    'num_queries', 'num_ampersands', 'num_equals',
    'domain_len', 'hex_len', 'has_hex',
    'unique_url_chars', 'unique_url_nums', 'unique_url_letters',
    'ratio_let_chars', 'ratio_nums_chars',
    'num_?s',
]

# Lexical features on their own (first row of the summary table that has a model run).
lexical_only = lexical_features + feats_to_drop
make_classification(lexical_only, df_surtur)



Train Accuracy_score = 0.7976426223982472
Test Accuracy_score = 0.7514456346573135


              precision    recall  f1-score   support

      benign     0.7447    0.8653    0.8005      6079
   malicious     0.7651    0.5966    0.6704      4470

    accuracy                         0.7514     10549
   macro avg     0.7549    0.7310    0.7355     10549
weighted avg     0.7533    0.7514    0.7454     10549

Feature                   Importance               
having_@_in_url           11.902408%               
has_hex                   7.210953%                
tld                       6.109139%                
unique_url_nums           5.862176%                
num_?s                    4.904404%                
unique_url_chars          4.633127%                
number_subdomains         4.515374%                
num_@s                    4.272351%                
num_hyphens               3.824569%                
url_len                   3.634956%                
num_equals      

### Host-based Features

In [6]:
# Host-based columns of the feature table.
host_feats = [
    'https',
    'who_is',
    'is_in_alexa',
    'location',
]

# First the host features alone ...
host_only = host_feats + feats_to_drop
make_classification(host_only, df_surtur)

# ... then on top of the lexical set ("with variations above" in the summary table).
lexical_and_host = lexical_features + host_feats + feats_to_drop
make_classification(lexical_and_host, df_surtur)

Train Accuracy_score = 0.6603079969663773
Test Accuracy_score = 0.65608114513224


              precision    recall  f1-score   support

      benign     0.6620    0.8237    0.7341      6079
   malicious     0.6410    0.4282    0.5134      4470

    accuracy                         0.6561     10549
   macro avg     0.6515    0.6259    0.6237     10549
weighted avg     0.6531    0.6561    0.6406     10549

Feature                   Importance               
is_in_alexa               65.983033%               
location                  20.011756%               
https                     8.011694%                
who_is                    5.993522%                
Train Accuracy_score = 0.8381119912362013
Test Accuracy_score = 0.8013081808702247


              precision    recall  f1-score   support

      benign     0.7896    0.8932    0.8382      6079
   malicious     0.8233    0.6763    0.7426      4470

    accuracy                         0.8013     10549
   macro avg     0.8064    

### Content-based Features

In [7]:
# Page-content columns: JavaScript statistics, URL counts and the dcd_color_* columns.
content_feats = [
    'js_len', 'js_ref', 'js_array_len_avg', 'js_array_len_max',
    'content_len',
    'num_js_func_calls', 'malicious_func_count',
    'total_url_count', 'ext_url_count',
    'dcd_color_1', 'dcd_color_2', 'dcd_color_3', 'dcd_color_4', 'dcd_color_5',
]

# Content features alone.
content_only = content_feats + feats_to_drop
make_classification(content_only, df_surtur)

# Content features together with the lexical and host sets.
lexical_host_content = lexical_features + host_feats + feats_to_drop + content_feats
make_classification(lexical_host_content, df_surtur)

Train Accuracy_score = 0.8332771551360917
Test Accuracy_score = 0.7661389705185325


              precision    recall  f1-score   support

      benign     0.7665    0.8544    0.8081      6079
   malicious     0.7654    0.6461    0.7007      4470

    accuracy                         0.7661     10549
   macro avg     0.7660    0.7503    0.7544     10549
weighted avg     0.7661    0.7661    0.7626     10549

Feature                   Importance               
js_array_len_avg          9.715787%                
dcd_color_2               9.377056%                
dcd_color_1               9.186229%                
js_len                    8.404426%                
js_ref                    8.269977%                
content_len               7.688244%                
num_js_func_calls         7.453144%                
ext_url_count             6.755333%                
js_array_len_max          6.697916%                
total_url_count           6.306274%                
malicious_func_c

## URL Embeddings from the longformer 

https://huggingface.co/transformers/model_doc/longformer.html

### All embeddings

In [8]:
# Longformer embeddings of the URLs, passed through the project's scale_features helper.
url_embeddings = pd.read_csv('./datasets_of_interest/url_feats_sliding_window_longformer.csv')

# NOTE: scaled_embs is read by the PCA cells of this section and is reassigned by the
# image and content sections, so those PCA cells must run before the later sections do.
scaled_embs = scale_features(url_embeddings)
url_embs_all, url_embs_all_columns = make_columns_for_embs(scaled_embs, 'url_emb_all')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
# (Assumes make_columns_for_embs returns a DataFrame, as the column-wise concat implies.)
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_all.columns, errors='ignore'), url_embs_all],
    axis=1,
)

In [9]:
# All Longformer URL embedding columns on their own.
url_all_only = url_embs_all_columns + feats_to_drop
make_classification(url_all_only, df_surtur)

# The same columns stacked with the lexical, host and content feature sets.
url_all_stacked = (
    url_embs_all_columns + feats_to_drop + lexical_features + host_feats + content_feats
)
make_classification(url_all_stacked, df_surtur)



Train Accuracy_score = 0.8640768517738265
Test Accuracy_score = 0.761967959048251


              precision    recall  f1-score   support

      benign     0.7576    0.8631    0.8069      6079
   malicious     0.7704    0.6244    0.6897      4470

    accuracy                         0.7620     10549
   macro avg     0.7640    0.7438    0.7483     10549
weighted avg     0.7630    0.7620    0.7573     10549

Feature                   Importance               
url_emb_all_set_39        16.628505%               
url_emb_all_set_41        13.538173%               
url_emb_all_set_37        9.441867%                
url_emb_all_set_33        5.275938%                
url_emb_all_set_51        4.485160%                
url_emb_all_set_46        3.755186%                
url_emb_all_set_40        2.342412%                
url_emb_all_set_44        2.189400%                
url_emb_all_set_69        2.171496%                
url_emb_all_set_35        2.065909%                
url_emb_all_set_1

Train Accuracy_score = 0.9146793629392432
Test Accuracy_score = 0.8297468954403261


              precision    recall  f1-score   support

      benign     0.8212    0.9006    0.8591      6079
   malicious     0.8444    0.7333    0.7850      4470

    accuracy                         0.8297     10549
   macro avg     0.8328    0.8170    0.8220     10549
weighted avg     0.8310    0.8297    0.8277     10549

Feature                   Importance               
url_emb_all_set_37        21.751855%               
url_emb_all_set_34        12.757766%               
url_emb_all_set_41        4.625929%                
url_emb_all_set_39        4.342639%                
url_emb_all_set_1         2.127637%                
url_emb_all_set_42        1.724001%                
who_is                    1.669163%                
unique_url_nums           1.552110%                
url_emb_all_set_51        1.545994%                
url_emb_all_set_69        1.303843%                
url_emb_all_set_

### PCA 10

In [10]:
# 10-component reduction of the scaled Longformer URL embeddings (via the project's perform_pca).
# NOTE: scaled_embs is a name shared with the image and content sections, which reassign
# it; run this cell while it still holds the URL embeddings.
pca_10 = perform_pca(10, scaled_embs)
url_embs_pca_10, url_embs_pca_10_columns = make_columns_for_embs(pca_10, 'url_pca_10')
# url_emb_mean / url_emb_mean_red are not added to this column list (that step was disabled).

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_10.columns, errors='ignore'), url_embs_pca_10],
    axis=1,
)

In [11]:
# PCA-10 URL embedding columns on their own.
url_pca_10_only = url_embs_pca_10_columns + feats_to_drop
make_classification(url_pca_10_only, df_surtur)

# The same columns stacked with the lexical, host and content feature sets.
url_pca_10_stacked = (
    url_embs_pca_10_columns + feats_to_drop + lexical_features + host_feats + content_feats
)
make_classification(url_pca_10_stacked, df_surtur)



Train Accuracy_score = 0.8419988202578579
Test Accuracy_score = 0.748127784624135


              precision    recall  f1-score   support

      benign     0.7465    0.8523    0.7959      6079
   malicious     0.7512    0.6065    0.6711      4470

    accuracy                         0.7481     10549
   macro avg     0.7489    0.7294    0.7335     10549
weighted avg     0.7485    0.7481    0.7430     10549

Feature                   Importance               
url_pca_10_set_1          15.485293%               
url_pca_10_set_3          14.917836%               
url_pca_10_set_0          14.902802%               
url_pca_10_set_5          10.987396%               
url_pca_10_set_4          10.494011%               
url_pca_10_set_2          9.627783%                
url_pca_10_set_6          6.240426%                
url_pca_10_set_7          5.983714%                
url_pca_10_set_9          5.777394%                
url_pca_10_set_8          5.583345%                
Train Accuracy_sc

### PCA 25

In [12]:
# 25-component reduction of the scaled Longformer URL embeddings (via the project's perform_pca).
# NOTE: scaled_embs is a name shared with the image and content sections, which reassign
# it; run this cell while it still holds the URL embeddings.
pca_25 = perform_pca(25, scaled_embs)
url_embs_pca_25, url_embs_pca_25_columns = make_columns_for_embs(pca_25, 'url_pca_25')
# url_emb_mean / url_emb_mean_red are not added to this column list (that step was disabled).

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_25.columns, errors='ignore'), url_embs_pca_25],
    axis=1,
)

In [13]:
# PCA-25 URL embedding columns on their own.
url_pca_25_only = url_embs_pca_25_columns + feats_to_drop
make_classification(url_pca_25_only, df_surtur)

# The same columns stacked with the lexical, host and content feature sets.
url_pca_25_stacked = (
    url_embs_pca_25_columns + feats_to_drop + lexical_features + host_feats + content_feats
)
make_classification(url_pca_25_stacked, df_surtur)



Train Accuracy_score = 0.8777702873514789
Test Accuracy_score = 0.7474642146174992


              precision    recall  f1-score   support

      benign     0.7517    0.8390    0.7929      6079
   malicious     0.7399    0.6230    0.6765      4470

    accuracy                         0.7475     10549
   macro avg     0.7458    0.7310    0.7347     10549
weighted avg     0.7467    0.7475    0.7436     10549

Feature                   Importance               
url_pca_25_set_1          11.308041%               
url_pca_25_set_0          7.492541%                
url_pca_25_set_3          6.823780%                
url_pca_25_set_5          6.135724%                
url_pca_25_set_2          6.001906%                
url_pca_25_set_4          5.630953%                
url_pca_25_set_24         4.380766%                
url_pca_25_set_6          4.074299%                
url_pca_25_set_20         3.590096%                
url_pca_25_set_17         3.431594%                
url_pca_25_set_1

### PCA 50

In [14]:
# 50-component reduction of the scaled Longformer URL embeddings (via the project's perform_pca).
# NOTE: scaled_embs is a name shared with the image and content sections, which reassign
# it; run this cell while it still holds the URL embeddings.
pca_50 = perform_pca(50, scaled_embs)
url_embs_pca_50, url_embs_pca_50_columns = make_columns_for_embs(pca_50, 'url_pca_50')
# url_emb_mean / url_emb_mean_red are not added to this column list (that step was disabled).

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=url_embs_pca_50.columns, errors='ignore'), url_embs_pca_50],
    axis=1,
)

In [15]:
# PCA-50 URL embedding columns on their own.
url_pca_50_only = url_embs_pca_50_columns + feats_to_drop
make_classification(url_pca_50_only, df_surtur)

# The same columns stacked with the lexical, host and content feature sets.
url_pca_50_stacked = (
    url_embs_pca_50_columns + feats_to_drop + lexical_features + host_feats + content_feats
)
make_classification(url_pca_50_stacked, df_surtur)



Train Accuracy_score = 0.887218757900059
Test Accuracy_score = 0.7379846430941321


              precision    recall  f1-score   support

      benign     0.7437    0.8320    0.7854      6079
   malicious     0.7276    0.6101    0.6637      4470

    accuracy                         0.7380     10549
   macro avg     0.7357    0.7211    0.7245     10549
weighted avg     0.7369    0.7380    0.7338     10549

Feature                   Importance               
url_pca_50_set_1          7.294457%                
url_pca_50_set_3          7.187583%                
url_pca_50_set_0          4.697523%                
url_pca_50_set_2          4.542486%                
url_pca_50_set_5          3.637443%                
url_pca_50_set_4          3.115207%                
url_pca_50_set_27         2.622822%                
url_pca_50_set_31         2.323692%                
url_pca_50_set_6          2.101960%                
url_pca_50_set_17         2.090683%                
url_pca_50_set_21

### Feature Selection

In [16]:
# Pre-selected URL embedding columns already present in the feature table:
# the two summary columns followed by url_emb_0 ... url_emb_19.
url_emb_feat_sel = ['url_emb_mean', 'url_emb_mean_red']
url_emb_feat_sel += ['url_emb_{}'.format(idx) for idx in range(20)]

# Selected URL embedding columns on their own.
url_sel_only = url_emb_feat_sel + feats_to_drop
make_classification(url_sel_only, df_surtur)

# The same columns stacked with the lexical, host and content feature sets.
url_sel_stacked = (
    url_emb_feat_sel + feats_to_drop + lexical_features + host_feats + content_feats
)
make_classification(url_sel_stacked, df_surtur)

Train Accuracy_score = 0.8527429004803236
Test Accuracy_score = 0.7632950990615224


              precision    recall  f1-score   support

      benign     0.7577    0.8663    0.8084      6079
   malicious     0.7741    0.6233    0.6905      4470

    accuracy                         0.7633     10549
   macro avg     0.7659    0.7448    0.7494     10549
weighted avg     0.7646    0.7633    0.7584     10549

Feature                   Importance               
url_emb_mean_red          16.420399%               
url_emb_1                 9.800469%                
url_emb_mean              9.546562%                
url_emb_10                4.505610%                
url_emb_3                 4.314799%                
url_emb_19                3.963539%                
url_emb_7                 3.726900%                
url_emb_2                 3.670335%                
url_emb_12                3.504587%                
url_emb_0                 3.429273%                
url_emb_11      

### Best one: 

The best set to continue with is the PCA 10 components set

## Image Embeddings

### All embeddings

In [17]:
# Image embeddings, converted to a NumPy array and passed through the project's scale_features helper.
img_embeddings = pd.read_csv('./datasets_of_interest/img_features_adjusted.csv')

# NOTE: this overwrites the scaled_embs left by the URL section; the image PCA cells
# below read it, and the content section reassigns it again.
scaled_embs = scale_features(np.array(img_embeddings))
img_embs_all, img_embs_all_columns = make_columns_for_embs(scaled_embs, 'img_emb_all')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
# (Assumes make_columns_for_embs returns a DataFrame, as the column-wise concat implies.)
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_all.columns, errors='ignore'), img_embs_all],
    axis=1,
)

In [18]:
# All image embedding columns on their own.
img_all_only = img_embs_all_columns + feats_to_drop
make_classification(img_all_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL embeddings
# (the set chosen as best in the URL-embedding section).
img_all_stacked = (
    img_embs_all_columns + feats_to_drop + lexical_features
    + host_feats + content_feats + url_embs_pca_10_columns
)
make_classification(img_all_stacked, df_surtur)



Train Accuracy_score = 0.9833782758911267
Test Accuracy_score = 0.5992985117072709


              precision    recall  f1-score   support

      benign     0.6197    0.7885    0.6940      6079
   malicious     0.5432    0.3421    0.4198      4470

    accuracy                         0.5993     10549
   macro avg     0.5814    0.5653    0.5569     10549
weighted avg     0.5873    0.5993    0.5778     10549

Feature                   Importance               
img_emb_all_set_351       2.442966%                
img_emb_all_set_95        1.951727%                
img_emb_all_set_323       1.226826%                
img_emb_all_set_297       0.908510%                
img_emb_all_set_393       0.688862%                
img_emb_all_set_143       0.407923%                
img_emb_all_set_138       0.371462%                
img_emb_all_set_366       0.216112%                
img_emb_all_set_964       0.138975%                
img_emb_all_set_371       0.127678%                
img_emb_all_set_

Train Accuracy_score = 0.9850846886323418
Test Accuracy_score = 0.8210256896388283


              precision    recall  f1-score   support

      benign     0.8113    0.8983    0.8526      6079
   malicious     0.8381    0.7159    0.7722      4470

    accuracy                         0.8210     10549
   macro avg     0.8247    0.8071    0.8124     10549
weighted avg     0.8227    0.8210    0.8185     10549

Feature                   Importance               
img_emb_all_set_95        2.887748%                
img_emb_all_set_393       2.840880%                
img_emb_all_set_351       1.983370%                
url_pca_10_set_0          0.909689%                
unique_url_nums           0.763350%                
who_is                    0.701608%                
img_emb_all_set_143       0.655296%                
url_pca_10_set_1          0.590737%                
url_len                   0.570207%                
num_hyphens               0.543037%                
tld             

### PCA 10

In [19]:
# 10-component reduction of the scaled image embeddings (via the project's perform_pca).
# NOTE: scaled_embs is shared with the URL and content sections; run this cell while it
# holds the image embeddings set in this section's "All embeddings" cell.
pca_10 = perform_pca(10, scaled_embs)
img_embs_pca_10, img_embs_pca_10_columns = make_columns_for_embs(pca_10, 'img_pca_10')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_10.columns, errors='ignore'), img_embs_pca_10],
    axis=1,
)

In [20]:
# PCA-10 image embedding columns on their own.
img_pca_10_only = img_embs_pca_10_columns + feats_to_drop
make_classification(img_pca_10_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL embeddings.
img_pca_10_stacked = (
    img_embs_pca_10_columns + feats_to_drop + lexical_features
    + host_feats + content_feats + url_embs_pca_10_columns
)
make_classification(img_pca_10_stacked, df_surtur)



Train Accuracy_score = 0.8201735906294767
Test Accuracy_score = 0.6145606218598919


              precision    recall  f1-score   support

      benign     0.6240    0.8334    0.7136      6079
   malicious     0.5831    0.3170    0.4107      4470

    accuracy                         0.6146     10549
   macro avg     0.6035    0.5752    0.5622     10549
weighted avg     0.6067    0.6146    0.5853     10549

Feature                   Importance               
img_pca_10_set_0          17.659116%               
img_pca_10_set_3          14.792331%               
img_pca_10_set_8          8.969814%                
img_pca_10_set_1          8.462960%                
img_pca_10_set_6          8.404150%                
img_pca_10_set_2          8.381357%                
img_pca_10_set_5          8.378091%                
img_pca_10_set_4          8.348913%                
img_pca_10_set_9          8.345468%                
img_pca_10_set_7          8.257797%                
Train Accuracy_s

### PCA 25

In [21]:
# 25-component reduction of the scaled image embeddings (via the project's perform_pca).
# NOTE: scaled_embs is shared with the URL and content sections; run this cell while it
# holds the image embeddings set in this section's "All embeddings" cell.
pca_25 = perform_pca(25, scaled_embs)
img_embs_pca_25, img_embs_pca_25_columns = make_columns_for_embs(pca_25, 'img_pca_25')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_25.columns, errors='ignore'), img_embs_pca_25],
    axis=1,
)

In [22]:
# PCA-25 image embedding columns on their own.
img_pca_25_only = img_embs_pca_25_columns + feats_to_drop
make_classification(img_pca_25_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL embeddings.
img_pca_25_stacked = (
    img_embs_pca_25_columns + feats_to_drop + lexical_features
    + host_feats + content_feats + url_embs_pca_10_columns
)
make_classification(img_pca_25_stacked, df_surtur)



Train Accuracy_score = 0.8985211089576135
Test Accuracy_score = 0.6100104275286757


              precision    recall  f1-score   support

      benign     0.6234    0.8164    0.7070      6079
   malicious     0.5688    0.3293    0.4171      4470

    accuracy                         0.6100     10549
   macro avg     0.5961    0.5729    0.5620     10549
weighted avg     0.6003    0.6100    0.5842     10549

Feature                   Importance               
img_pca_25_set_1          10.051437%               
img_pca_25_set_0          6.934480%                
img_pca_25_set_3          4.736520%                
img_pca_25_set_5          3.953282%                
img_pca_25_set_2          3.951347%                
img_pca_25_set_13         3.640009%                
img_pca_25_set_10         3.608816%                
img_pca_25_set_23         3.601820%                
img_pca_25_set_12         3.595047%                
img_pca_25_set_16         3.593321%                
img_pca_25_set_9

### PCA 50

In [23]:
# 50-component reduction of the scaled image embeddings (via the project's perform_pca).
# NOTE: scaled_embs is shared with the URL and content sections; run this cell while it
# holds the image embeddings set in this section's "All embeddings" cell.
pca_50 = perform_pca(50, scaled_embs)
img_embs_pca_50, img_embs_pca_50_columns = make_columns_for_embs(pca_50, 'img_pca_50')
# No image mean / reduced-mean columns are added to this column list (that step was disabled).

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=img_embs_pca_50.columns, errors='ignore'), img_embs_pca_50],
    axis=1,
)

In [24]:
# PCA-50 image embedding columns on their own.
img_pca_50_only = img_embs_pca_50_columns + feats_to_drop
make_classification(img_pca_50_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL embeddings.
img_pca_50_stacked = (
    img_embs_pca_50_columns + feats_to_drop + lexical_features
    + host_feats + content_feats + url_embs_pca_10_columns
)
make_classification(img_pca_50_stacked, df_surtur)



Train Accuracy_score = 0.9350615151259796
Test Accuracy_score = 0.6021423831642809


              precision    recall  f1-score   support

      benign     0.6200    0.7998    0.6985      6079
   malicious     0.5504    0.3333    0.4152      4470

    accuracy                         0.6021     10549
   macro avg     0.5852    0.5666    0.5569     10549
weighted avg     0.5905    0.6021    0.5785     10549

Feature                   Importance               
img_pca_50_set_3          8.641599%                
img_pca_50_set_0          4.833728%                
img_pca_50_set_35         1.956397%                
img_pca_50_set_16         1.880689%                
img_pca_50_set_13         1.880120%                
img_pca_50_set_24         1.873067%                
img_pca_50_set_18         1.869619%                
img_pca_50_set_4          1.854216%                
img_pca_50_set_8          1.849229%                
img_pca_50_set_43         1.840745%                
img_pca_50_set_7

### Feature Selection

In [25]:
# Pre-selected image columns already present in the feature table:
# the two summary columns followed by image_mobnet_0 ... image_mobnet_19.
img_mobnet_embeddings = ['img_mean', 'img_mean_red']
img_mobnet_embeddings += ['image_mobnet_{}'.format(idx) for idx in range(20)]

# Selected image columns on their own.
img_sel_only = img_mobnet_embeddings + feats_to_drop
make_classification(img_sel_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL embeddings.
img_sel_stacked = (
    img_mobnet_embeddings + feats_to_drop + lexical_features
    + host_feats + content_feats + url_embs_pca_10_columns
)
make_classification(img_sel_stacked, df_surtur)

Train Accuracy_score = 0.8814464481334794
Test Accuracy_score = 0.6086832875154043


              precision    recall  f1-score   support

      benign     0.6208    0.8245    0.7083      6079
   malicious     0.5691    0.3152    0.4057      4470

    accuracy                         0.6087     10549
   macro avg     0.5949    0.5698    0.5570     10549
weighted avg     0.5989    0.6087    0.5801     10549

Feature                   Importance               
image_mobnet_0            14.895786%               
image_mobnet_18           4.960954%                
image_mobnet_15           4.137068%                
image_mobnet_19           4.097805%                
img_mean                  4.070607%                
image_mobnet_9            4.070294%                
image_mobnet_10           4.067983%                
image_mobnet_7            4.056895%                
img_mean_red              4.053635%                
image_mobnet_2            4.049619%                
image_mobnet_8  

### Best one: 

The best one is once again PCA with 10 components

## Content Embeddings

### All embeddings

In [26]:
# Page-content embeddings, converted to a NumPy array and passed through the project's
# scale_features helper.
content_embeddings = pd.read_csv('./datasets_of_interest/content_feats_code_bert.csv')

# NOTE: this overwrites the scaled_embs left by the image section; the content PCA cells
# below read it.
scaled_embs = scale_features(np.array(content_embeddings))
content_embs_all, content_embs_all_columns = make_columns_for_embs(scaled_embs, 'content_emb_all')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
# (Assumes make_columns_for_embs returns a DataFrame, as the column-wise concat implies.)
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_all.columns, errors='ignore'), content_embs_all],
    axis=1,
)

In [27]:
# All content embedding columns on their own.
content_all_only = content_embs_all_columns + feats_to_drop
make_classification(content_all_only, df_surtur)

# Stacked with the lexical, host and content sets plus the PCA-10 URL and PCA-10 image
# embeddings (the sets chosen as best in the two preceding sections).
content_all_stacked = (
    content_embs_all_columns + feats_to_drop + lexical_features
    + host_feats + content_feats
    + url_embs_pca_10_columns + img_embs_pca_10_columns
)
make_classification(content_all_stacked, df_surtur)



Train Accuracy_score = 0.818719979775849
Test Accuracy_score = 0.6744715138875723


              precision    recall  f1-score   support

      benign     0.6675    0.8671    0.7543      6079
   malicious     0.6953    0.4125    0.5178      4470

    accuracy                         0.6745     10549
   macro avg     0.6814    0.6398    0.6361     10549
weighted avg     0.6793    0.6745    0.6541     10549

Feature                   Importance               
content_emb_all_set_4     1.867806%                
content_emb_all_set_24    1.618935%                
content_emb_all_set_2     0.929345%                
content_emb_all_set_8     0.896754%                
content_emb_all_set_1     0.806293%                
content_emb_all_set_235   0.601795%                
content_emb_all_set_5     0.563180%                
content_emb_all_set_12    0.493777%                
content_emb_all_set_211   0.423282%                
content_emb_all_set_110   0.413143%                
content_emb_all_s

Train Accuracy_score = 0.9579190191286762
Test Accuracy_score = 0.8157171295857427


              precision    recall  f1-score   support

      benign     0.8126    0.8840    0.8468      6079
   malicious     0.8209    0.7228    0.7687      4470

    accuracy                         0.8157     10549
   macro avg     0.8168    0.8034    0.8078     10549
weighted avg     0.8161    0.8157    0.8137     10549

Feature                   Importance               
img_pca_10_set_0          1.553396%                
who_is                    1.506354%                
url_len                   1.191279%                
url_pca_10_set_1          1.043553%                
num_semicolons            1.002241%                
content_emb_all_set_4     0.945244%                
url_pca_10_set_0          0.894573%                
unique_url_nums           0.825187%                
url_pca_10_set_3          0.758147%                
tld                       0.755611%                
num_hyphens     

### PCA 10

In [28]:
# 10-component reduction of the scaled content embeddings (via the project's perform_pca).
# NOTE: scaled_embs is shared with the URL and image sections; run this cell while it
# holds the content embeddings set in this section's "All embeddings" cell.
pca_10 = perform_pca(10, scaled_embs)
content_embs_pca_10, content_embs_pca_10_columns = make_columns_for_embs(pca_10, 'content_pca_10')

# Drop any copies left by an earlier execution of this cell before appending, so that
# re-running it cannot leave df_surtur with duplicated column labels.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_10.columns, errors='ignore'), content_embs_pca_10],
    axis=1,
)

In [29]:
# Two runs: the content PCA-10 columns on their own, then the same columns
# together with the lexical, host and content features plus the URL and image
# PCA-10 columns. `feats_to_drop` is part of both lists (defined earlier;
# presumably the label/metadata columns the helper removes -- TODO confirm).
make_classification(content_embs_pca_10_columns + feats_to_drop, df_surtur)
make_classification(content_embs_pca_10_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats 
                    + url_embs_pca_10_columns + img_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.7582055279346086
Test Accuracy_score = 0.6773153853445825


              precision    recall  f1-score   support

      benign     0.6621    0.8988    0.7625      6079
   malicious     0.7321    0.3761    0.4969      4470

    accuracy                         0.6773     10549
   macro avg     0.6971    0.6374    0.6297     10549
weighted avg     0.6918    0.6773    0.6499     10549

Feature                   Importance               
content_pca_10_set_0      12.250325%               
content_pca_10_set_8      11.612798%               
content_pca_10_set_3      10.957287%               
content_pca_10_set_9      10.366482%               
content_pca_10_set_5      9.675812%                
content_pca_10_set_2      9.646496%                
content_pca_10_set_4      9.466386%                
content_pca_10_set_1      9.336551%                
content_pca_10_set_7      8.979737%                
content_pca_10_set_6      7.708128%                
Train Accuracy_s

### PCA 25

In [30]:
# Same as the PCA-10 cell but with 25 components; columns are prefixed
# 'content_pca_25'. `scaled_embs` is taken from an earlier cell.
pca_25 = perform_pca(25, scaled_embs)
content_embs_pca_25, content_embs_pca_25_columns = make_columns_for_embs(pca_25, 'content_pca_25')

# Remove columns from a previous run first so the cell can be re-executed
# without duplicating column names in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_25.columns, errors='ignore'), content_embs_pca_25],
    axis=1,
)

In [31]:
# Two runs: the content PCA-25 columns alone, then combined with the lexical,
# host and content features and the URL / image PCA-10 columns.
make_classification(content_embs_pca_25_columns + feats_to_drop, df_surtur)
make_classification(content_embs_pca_25_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats
                    + url_embs_pca_10_columns + img_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.7898268307070027
Test Accuracy_score = 0.6757986539008437


              precision    recall  f1-score   support

      benign     0.6641    0.8850    0.7588      6079
   malicious     0.7145    0.3913    0.5056      4470

    accuracy                         0.6758     10549
   macro avg     0.6893    0.6381    0.6322     10549
weighted avg     0.6854    0.6758    0.6515     10549

Feature                   Importance               
content_pca_25_set_8      10.182455%               
content_pca_25_set_10     4.552336%                
content_pca_25_set_12     4.325399%                
content_pca_25_set_19     4.212894%                
content_pca_25_set_0      4.209627%                
content_pca_25_set_1      4.208511%                
content_pca_25_set_9      4.156437%                
content_pca_25_set_14     4.036177%                
content_pca_25_set_5      3.923793%                
content_pca_25_set_6      3.858075%                
content_pca_25_s

### PCA 50

In [32]:
# Same as the PCA-10 cell but with 50 components; columns are prefixed
# 'content_pca_50'. `scaled_embs` is taken from an earlier cell.
pca_50 = perform_pca(50, scaled_embs)
content_embs_pca_50, content_embs_pca_50_columns = make_columns_for_embs(pca_50, 'content_pca_50')

# Remove columns from a previous run first so the cell can be re-executed
# without duplicating column names in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=content_embs_pca_50.columns, errors='ignore'), content_embs_pca_50],
    axis=1,
)

In [33]:
# Two runs: the content PCA-50 columns alone, then combined with the lexical,
# host and content features and the URL / image PCA-10 columns.
make_classification(content_embs_pca_50_columns + feats_to_drop, df_surtur)
make_classification(content_embs_pca_50_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats 
                    + url_embs_pca_10_columns + img_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.808492036740541
Test Accuracy_score = 0.6739975353114039


              precision    recall  f1-score   support

      benign     0.6660    0.8710    0.7549      6079
   malicious     0.6983    0.4060    0.5135      4470

    accuracy                         0.6740     10549
   macro avg     0.6822    0.6385    0.6342     10549
weighted avg     0.6797    0.6740    0.6526     10549

Feature                   Importance               
content_pca_50_set_8      6.376553%                
content_pca_50_set_3      3.233193%                
content_pca_50_set_2      2.826967%                
content_pca_50_set_14     2.529902%                
content_pca_50_set_12     2.451711%                
content_pca_50_set_9      2.323506%                
content_pca_50_set_10     2.255121%                
content_pca_50_set_22     2.226933%                
content_pca_50_set_25     2.185556%                
content_pca_50_set_19     2.178315%                
content_pca_50_se

## Feature Selection

In [34]:
# Column labels content_emb_0 ... content_emb_19, in ascending order.
content_embeddings = [f'content_emb_{idx}' for idx in range(20)]

# First run: these 20 columns alone. Second run: the combined feature mix with
# these columns appended last.
make_classification(content_embeddings + feats_to_drop, df_surtur)
make_classification(
    url_embs_pca_10_columns + img_embs_pca_10_columns
    + feats_to_drop + lexical_features + host_feats
    + content_feats + content_embeddings,
    df_surtur,
)

Train Accuracy_score = 0.7689180079211259
Test Accuracy_score = 0.6673618352450469


              precision    recall  f1-score   support

      benign     0.6559    0.8895    0.7550      6079
   malicious     0.7085    0.3653    0.4821      4470

    accuracy                         0.6674     10549
   macro avg     0.6822    0.6274    0.6185     10549
weighted avg     0.6782    0.6674    0.6394     10549

Feature                   Importance               
content_emb_19            8.919871%                
content_emb_13            6.591604%                
content_emb_4             6.303865%                
content_emb_18            5.736533%                
content_emb_2             5.679799%                
content_emb_0             5.596259%                
content_emb_6             4.720429%                
content_emb_12            4.659799%                
content_emb_3             4.639607%                
content_emb_16            4.572462%                
content_emb_1   

### Best one:

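PCA 10 once again: it gives the highest content-only test accuracy (0.6773) among the content-embedding variants tried above.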

### DCD

In [35]:
# Column labels dcd_color_1 ... dcd_color_5.
dcd_feats = [f'dcd_color_{idx}' for idx in range(1, 6)]

# Classify on the five DCD colour columns alone (plus feats_to_drop).
make_classification(dcd_feats + feats_to_drop, df_surtur)

Train Accuracy_score = 0.7598066065559956
Test Accuracy_score = 0.6960849369608494


              precision    recall  f1-score   support

      benign     0.7078    0.8049    0.7532      6079
   malicious     0.6738    0.5481    0.6045      4470

    accuracy                         0.6961     10549
   macro avg     0.6908    0.6765    0.6789     10549
weighted avg     0.6934    0.6961    0.6902     10549

Feature                   Importance               
dcd_color_1               32.113487%               
dcd_color_2               19.867830%               
dcd_color_5               17.279164%               
dcd_color_3               16.854385%               
dcd_color_4               13.885139%               


## Additional embeddings from BERT

### All embeddings

In [36]:
# Load the BERT URL embeddings, scale them with the project helper and wrap them
# in columns prefixed 'bert_url_emb_all'.
# Note: this rebinds `scaled_embs`, which the content-embedding cells above also
# read, so those cells must not be re-run after this one without first
# restoring their own `scaled_embs`.
bert_url_embeddings = pd.read_csv('./datasets_of_interest/url_feats_sliding_window.csv')
scaled_embs = scale_features(np.array(bert_url_embeddings))
bert_url_embs_all, bert_url_embs_all_columns = make_columns_for_embs(scaled_embs, 'bert_url_emb_all')

# Drop any copies left by a previous run of this cell before appending, so that
# re-running it does not leave duplicate-named columns in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_all.columns, errors='ignore'), bert_url_embs_all],
    axis=1,
)

In [37]:
# Two runs: all BERT URL embedding columns alone, then combined with the
# content, lexical and host features and the URL / image / content PCA-10 columns.
make_classification(bert_url_embs_all_columns + feats_to_drop, df_surtur)
make_classification(bert_url_embs_all_columns + content_feats + feats_to_drop + lexical_features 
                    + host_feats + url_embs_pca_10_columns + img_embs_pca_10_columns 
                    + content_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.9184187241931406
Test Accuracy_score = 0.7623471419091857


              precision    recall  f1-score   support

      benign     0.7691    0.8396    0.8028      6079
   malicious     0.7508    0.6573    0.7009      4470

    accuracy                         0.7623     10549
   macro avg     0.7600    0.7484    0.7519     10549
weighted avg     0.7614    0.7623    0.7597     10549

Feature                   Importance               
bert_url_emb_all_set_371  4.638697%                
bert_url_emb_all_set_214  1.988856%                
bert_url_emb_all_set_1    1.980855%                
bert_url_emb_all_set_312  1.830614%                
bert_url_emb_all_set_456  1.466852%                
bert_url_emb_all_set_162  1.351904%                
bert_url_emb_all_set_193  0.873710%                
bert_url_emb_all_set_33   0.801697%                
bert_url_emb_all_set_86   0.697724%                
bert_url_emb_all_set_389  0.636238%                
bert_url_emb_all

Train Accuracy_score = 0.9624799865172327
Test Accuracy_score = 0.8188453881884539


              precision    recall  f1-score   support

      benign     0.8165    0.8844    0.8491      6079
   malicious     0.8227    0.7298    0.7734      4470

    accuracy                         0.8188     10549
   macro avg     0.8196    0.8071    0.8113     10549
weighted avg     0.8191    0.8188    0.8170     10549

Feature                   Importance               
bert_url_emb_all_set_371  5.181393%                
bert_url_emb_all_set_1    2.484283%                
bert_url_emb_all_set_390  2.478144%                
bert_url_emb_all_set_312  1.174363%                
url_pca_10_set_1          1.002067%                
unique_url_nums           0.806454%                
bert_url_emb_all_set_389  0.786736%                
tld                       0.714263%                
bert_url_emb_all_set_177  0.649430%                
bert_url_emb_all_set_456  0.645649%                
url_pca_10_set_0

### PCA 10

In [38]:
# Reduce the scaled BERT URL embeddings (`scaled_embs` from the "All embeddings"
# cell of this section) to 10 components; columns are prefixed 'bert_url_pca_10'.
# Note: `pca_10` is rebound here; the content section used the same name.
pca_10 = perform_pca(10, scaled_embs)
bert_url_embs_pca_10, bert_url_embs_pca_10_columns = make_columns_for_embs(pca_10, 'bert_url_pca_10')

# Remove columns from a previous run first so the cell can be re-executed
# without duplicating column names in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_10.columns, errors='ignore'), bert_url_embs_pca_10],
    axis=1,
)

In [39]:
# Two runs: the BERT URL PCA-10 columns alone, then combined with the lexical,
# host and content features and the URL / image / content PCA-10 columns.
make_classification(bert_url_embs_pca_10_columns + feats_to_drop, df_surtur)
make_classification(bert_url_embs_pca_10_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats 
                    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.8419777534338923
Test Accuracy_score = 0.7297374158688027


              precision    recall  f1-score   support

      benign     0.7431    0.8115    0.7758      6079
   malicious     0.7070    0.6186    0.6598      4470

    accuracy                         0.7297     10549
   macro avg     0.7251    0.7150    0.7178     10549
weighted avg     0.7278    0.7297    0.7267     10549

Feature                   Importance               
bert_url_pca_10_set_0     19.949664%               
bert_url_pca_10_set_4     11.531843%               
bert_url_pca_10_set_2     10.841742%               
bert_url_pca_10_set_3     9.481049%                
bert_url_pca_10_set_9     9.181257%                
bert_url_pca_10_set_1     9.006233%                
bert_url_pca_10_set_8     7.592295%                
bert_url_pca_10_set_7     7.577844%                
bert_url_pca_10_set_5     7.507662%                
bert_url_pca_10_set_6     7.330407%                
Train Accuracy_s

### PCA 25

In [40]:
# Reduce the scaled BERT URL embeddings to 25 components; columns are prefixed
# 'bert_url_pca_25'. `pca_25` is rebound here (the content section used it too).
pca_25 = perform_pca(25, scaled_embs)
bert_url_embs_pca_25, bert_url_embs_pca_25_columns = make_columns_for_embs(pca_25, 'bert_url_pca_25')

# Remove columns from a previous run first so the cell can be re-executed
# without duplicating column names in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_25.columns, errors='ignore'), bert_url_embs_pca_25],
    axis=1,
)

In [41]:
# Two runs: the BERT URL PCA-25 columns alone, then combined with the lexical,
# host and content features and the URL / image / content PCA-10 columns.
make_classification(bert_url_embs_pca_25_columns + feats_to_drop, df_surtur)
make_classification(bert_url_embs_pca_25_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats
                    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.8739887924496503
Test Accuracy_score = 0.7367522987960944


              precision    recall  f1-score   support

      benign     0.7498    0.8153    0.7811      6079
   malicious     0.7149    0.6300    0.6698      4470

    accuracy                         0.7368     10549
   macro avg     0.7323    0.7226    0.7255     10549
weighted avg     0.7350    0.7368    0.7339     10549

Feature                   Importance               
bert_url_pca_25_set_0     9.807891%                
bert_url_pca_25_set_2     5.337258%                
bert_url_pca_25_set_1     5.240140%                
bert_url_pca_25_set_4     5.072241%                
bert_url_pca_25_set_6     5.056586%                
bert_url_pca_25_set_5     4.562211%                
bert_url_pca_25_set_10    4.552062%                
bert_url_pca_25_set_8     4.311215%                
bert_url_pca_25_set_9     4.238610%                
bert_url_pca_25_set_12    4.130280%                
bert_url_pca_25_

### PCA 50

In [42]:
# Reduce the scaled BERT URL embeddings to 50 components; columns are prefixed
# 'bert_url_pca_50'. `pca_50` is rebound here (the content section used it too).
pca_50 = perform_pca(50, scaled_embs)
bert_url_embs_pca_50, bert_url_embs_pca_50_columns = make_columns_for_embs(pca_50, 'bert_url_pca_50')

# Remove columns from a previous run first so the cell can be re-executed
# without duplicating column names in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=bert_url_embs_pca_50.columns, errors='ignore'), bert_url_embs_pca_50],
    axis=1,
)

In [43]:
# Two runs: the BERT URL PCA-50 columns alone, then combined with the lexical,
# host and content features and the URL / image / content PCA-10 columns.
make_classification(bert_url_embs_pca_50_columns + feats_to_drop, df_surtur)
make_classification(bert_url_embs_pca_50_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats 
                    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns, df_surtur)



Train Accuracy_score = 0.893686272857504
Test Accuracy_score = 0.7390273959617025


              precision    recall  f1-score   support

      benign     0.7513    0.8179    0.7832      6079
   malicious     0.7184    0.6318    0.6723      4470

    accuracy                         0.7390     10549
   macro avg     0.7348    0.7248    0.7277     10549
weighted avg     0.7373    0.7390    0.7362     10549

Feature                   Importance               
bert_url_pca_50_set_0     7.445454%                
bert_url_pca_50_set_29    5.660959%                
bert_url_pca_50_set_26    4.854850%                
bert_url_pca_50_set_2     3.236268%                
bert_url_pca_50_set_9     2.830406%                
bert_url_pca_50_set_1     2.758091%                
bert_url_pca_50_set_4     2.603700%                
bert_url_pca_50_set_47    2.470016%                
bert_url_pca_50_set_3     2.449780%                
bert_url_pca_50_set_38    2.334876%                
bert_url_pca_50_s

### Feature Selection

In [44]:
# Select 20 features from the freshly scaled BERT URL embeddings with the
# project helper `feature_selection` (column prefix 'url_bert').
# NOTE(review): df_surtur is passed to the helper as well -- presumably for the
# labels; confirm against preprocessing_ml_table.
feat_selected_bert = feature_selection(20, scale_features(np.array(bert_url_embeddings)), 'url_bert', df_surtur)

# Drop any copies left by a previous run of this cell before appending, so that
# re-running it does not leave duplicate-named columns in df_surtur.
df_surtur = pd.concat(
    [df_surtur.drop(columns=feat_selected_bert.columns, errors='ignore'), feat_selected_bert],
    axis=1,
)

# Row-wise means over all BERT URL embedding columns and over the selected
# subset. `.values` strips the index so assignment is positional.
df_surtur['url_emb_mean_bert'] = bert_url_embs_all.mean(axis=1).values
df_surtur['url_emb_mean_red_bert'] = feat_selected_bert.mean(axis=1).values

100%|██████████| 20/20 [00:00<00:00, 45764.36it/s]


In [45]:
# The two mean columns followed by url_bert_emb_20 ... url_bert_emb_39.
url_bert_emb_feats_bert = (
    ['url_emb_mean_bert', 'url_emb_mean_red_bert']
    + [f'url_bert_emb_{idx}' for idx in range(20, 40)]
)

# First run: the BERT feature-selection columns alone. Second run: the combined
# feature mix including the URL / image / content PCA-10 columns.
make_classification(url_bert_emb_feats_bert + feats_to_drop, df_surtur)
make_classification(
    feats_to_drop + lexical_features + host_feats
    + content_feats + url_bert_emb_feats_bert + url_embs_pca_10_columns
    + img_embs_pca_10_columns + content_embs_pca_10_columns,
    df_surtur,
)



Train Accuracy_score = 0.8521740962332519
Test Accuracy_score = 0.7604512276045122


              precision    recall  f1-score   support

      benign     0.7593    0.8556    0.8045      6079
   malicious     0.7626    0.6311    0.6907      4470

    accuracy                         0.7605     10549
   macro avg     0.7610    0.7433    0.7476     10549
weighted avg     0.7607    0.7605    0.7563     10549

Feature                   Importance               
url_bert_emb_20           17.877296%               
url_bert_emb_29           8.251366%                
url_emb_mean_bert         6.916378%                
url_bert_emb_34           6.821473%                
url_emb_mean_red_bert     6.648507%                
url_bert_emb_27           4.300094%                
url_bert_emb_23           3.464481%                
url_bert_emb_26           3.446665%                
url_bert_emb_30           3.430932%                
url_bert_emb_28           3.418949%                
url_bert_emb_32 

## Best Score using feature selection

In [46]:
# Feature-selection mix: lexical, host and content features plus three image
# columns and six URL columns (plain and BERT variants of the mean, the
# reduced mean and a single selected embedding column).
make_classification(feats_to_drop + lexical_features + host_feats
                    + content_feats + ['img_mean', 'img_mean_red', 'image_mobnet_9']
                    + ['url_emb_mean', 'url_emb_mean_red', 'url_emb_1', 
                       'url_emb_mean_bert', 'url_emb_mean_red_bert', 'url_bert_emb_20'], df_surtur)

Train Accuracy_score = 0.9149005645908823
Test Accuracy_score = 0.8440610484406105


              precision    recall  f1-score   support

      benign     0.8304    0.9166    0.8714      6079
   malicious     0.8679    0.7454    0.8020      4470

    accuracy                         0.8441     10549
   macro avg     0.8492    0.8310    0.8367     10549
weighted avg     0.8463    0.8441    0.8420     10549

Feature                   Importance               
url_emb_1                 4.756106%                
num_semicolons            4.144145%                
url_bert_emb_20           3.864573%                
url_emb_mean_bert         3.694792%                
url_emb_mean              3.188905%                
who_is                    3.020396%                
https                     2.923735%                
num_ampersands            2.823474%                
url_emb_mean_red          2.802811%                
num_@s                    2.781992%                
num_equals      

## Best Score with PCA

In [47]:
# PCA mix: lexical, host and content features plus the PCA-10 columns of the
# BERT URL, URL, image and content embeddings. This is the same feature list
# as the second call in the BERT "PCA 10" cell above.
make_classification(bert_url_embs_pca_10_columns + feats_to_drop + lexical_features 
                    + host_feats + content_feats 
                    + url_embs_pca_10_columns + img_embs_pca_10_columns + content_embs_pca_10_columns, df_surtur)

Train Accuracy_score = 0.9418871660908401
Test Accuracy_score = 0.8329699497582709


              precision    recall  f1-score   support

      benign     0.8252    0.9010    0.8614      6079
   malicious     0.8461    0.7405    0.7898      4470

    accuracy                         0.8330     10549
   macro avg     0.8357    0.8207    0.8256     10549
weighted avg     0.8341    0.8330    0.8311     10549

Feature                   Importance               
bert_url_pca_10_set_0     6.211748%                
num_ampersands            2.887758%                
url_pca_10_set_3          2.793111%                
dcd_color_1               2.739292%                
who_is                    2.557716%                
url_pca_10_set_1          2.556511%                
https                     2.469372%                
tld                       2.331568%                
is_in_alexa               2.330474%                
url_pca_10_set_0          2.327070%                
img_pca_10_set_0

## Checkpoint

In [50]:
# df_surtur.to_csv('ML_table_dataset.csv', index=False)

In [51]:
# Reload the table from the CSV checkpoint, replacing the in-memory df_surtur.
# The save call in the previous cell is commented out, so this expects
# 'ML_table_dataset.csv' to exist already from an earlier run.
df_surtur = pd.read_csv('ML_table_dataset.csv')

## More models 

In [52]:
def split_data(features_to_keep, df=None, test_size=0.10, random_state=20):
    """Build a train/test split from selected columns of the ML table.

    Parameters
    ----------
    features_to_keep : list of str
        Column labels to select from the table. The list may contain the
        non-feature columns named below; they are removed from the feature
        matrix, and any that are not in the list are simply skipped.
    df : pandas.DataFrame, optional
        Table to split. Defaults to the notebook-level ``df_surtur``, looked
        up at call time.
    test_size : float, default 0.10
        Passed through to ``train_test_split``.
    random_state : int, default 20
        Passed through to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The four objects returned by ``train_test_split``; ``y`` is the
        ``'label'`` column of the table.
    """
    if df is None:
        df = df_surtur

    # Label and raw/metadata columns that must never reach the model.
    non_feature_columns = ['label', 'content', 'hostname', 'url', 'js',
                           'domain', 'google_is_safe', 'ip_address']

    # errors='ignore': a feature list that omits some of these columns should
    # not make the split fail with a KeyError.
    X = df[features_to_keep].drop(columns=non_feature_columns, errors='ignore')
    y = df['label']

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [60]:
# Split for the "PCA mix": the same feature list as the "Best Score with PCA" cell.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = split_data(bert_url_embs_pca_10_columns + 
                                                              feats_to_drop + 
                                                              lexical_features + 
                                                              host_feats + 
                                                              content_feats + 
                                                              url_embs_pca_10_columns + 
                                                              img_embs_pca_10_columns + 
                                                              content_embs_pca_10_columns)

# Split for the "Feature Selection mix": the same feature list as the
# "Best Score using feature selection" cell.
X_train_feat_sel, X_test_feat_sel, y_train_feat_sel, y_test_feat_sel = split_data(feats_to_drop + lexical_features + host_feats
                    + content_feats + ['img_mean', 'img_mean_red', 'image_mobnet_9']
                    + ['url_emb_mean', 'url_emb_mean_red', 'url_emb_1', 
                       'url_emb_mean_bert', 'url_emb_mean_red_bert', 'url_bert_emb_20'])

### Naive Bayes 

#### PCA mix

In [61]:
# Gaussian Naive Bayes on the PCA mix: fit on the training split, then print
# accuracy, confusion matrix and per-class report for the held-out split.
gnb = GaussianNB().fit(X_train_pca, y_train_pca)
y_pred = gnb.predict(X_test_pca)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_pca, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_pca, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_pca,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.6458432078870036


[[5837  242]
 [3494  976]]


              precision    recall  f1-score   support

      benign     0.6255    0.9602    0.7576      6079
   malicious     0.8013    0.2183    0.3432      4470

    accuracy                         0.6458     10549
   macro avg     0.7134    0.5893    0.5504     10549
weighted avg     0.7000    0.6458    0.5820     10549



#### Feature Selection mix

In [62]:
# Gaussian Naive Bayes on the feature-selection mix: fit on the training split,
# then print accuracy, confusion matrix and per-class report for the test split.
gnb = GaussianNB().fit(X_train_feat_sel, y_train_feat_sel)
y_pred = gnb.predict(X_test_feat_sel)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_feat_sel, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_feat_sel, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_feat_sel,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.6459380036022372


[[5835  244]
 [3491  979]]


              precision    recall  f1-score   support

      benign     0.6257    0.9599    0.7575      6079
   malicious     0.8005    0.2190    0.3439      4470

    accuracy                         0.6459     10549
   macro avg     0.7131    0.5894    0.5507     10549
weighted avg     0.6997    0.6459    0.5823     10549



### SVM 

#### PCA mix

In [63]:
# Support-vector classifier on the PCA mix.
# NOTE(review): split_data applies no scaling and SVC is sensitive to feature
# scale -- confirm the inputs are already normalised.
svm_model = svm.SVC(C=1000, gamma=0.001, verbose=3).fit(X_train_pca, y_train_pca)
y_pred = svm_model.predict(X_test_pca)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_pca, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_pca, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_pca,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

[LibSVM]Accuracy score = 0.6960849369608494


[[5624  455]
 [2751 1719]]


              precision    recall  f1-score   support

      benign     0.6715    0.9252    0.7782      6079
   malicious     0.7907    0.3846    0.5175      4470

    accuracy                         0.6961     10549
   macro avg     0.7311    0.6549    0.6478     10549
weighted avg     0.7220    0.6961    0.6677     10549



#### Feature Selection mix

In [64]:
# Support-vector classifier on the feature-selection mix.
# NOTE(review): split_data applies no scaling and SVC is sensitive to feature
# scale -- confirm the inputs are already normalised.
svm_model = svm.SVC(C=1000, gamma=0.001, verbose=3).fit(X_train_feat_sel, y_train_feat_sel)
y_pred = svm_model.predict(X_test_feat_sel)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_feat_sel, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_feat_sel, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_feat_sel,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

[LibSVM]Accuracy score = 0.695895345530382


[[5620  459]
 [2749 1721]]


              precision    recall  f1-score   support

      benign     0.6715    0.9245    0.7780      6079
   malicious     0.7894    0.3850    0.5176      4470

    accuracy                         0.6959     10549
   macro avg     0.7305    0.6548    0.6478     10549
weighted avg     0.7215    0.6959    0.6676     10549



### KNN

#### PCA mix

In [70]:
# 100-nearest-neighbours classifier on the PCA mix.
# NOTE(review): split_data applies no scaling and KNN is distance-based --
# confirm the inputs are already normalised.
knn_model = KNeighborsClassifier(n_neighbors=100).fit(X_train_pca, y_train_pca)
y_pred = knn_model.predict(X_test_pca)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_pca, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_pca, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_pca,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.7042373684709451


[[5410  669]
 [2451 2019]]


              precision    recall  f1-score   support

      benign     0.6882    0.8899    0.7762      6079
   malicious     0.7511    0.4517    0.5641      4470

    accuracy                         0.7042     10549
   macro avg     0.7197    0.6708    0.6702     10549
weighted avg     0.7149    0.7042    0.6863     10549



#### Feature Selection mix

In [66]:
# 100-nearest-neighbours classifier on the feature-selection mix.
# NOTE(review): split_data applies no scaling and KNN is distance-based --
# confirm the inputs are already normalised.
knn_model = KNeighborsClassifier(n_neighbors=100).fit(X_train_feat_sel, y_train_feat_sel)
y_pred = knn_model.predict(X_test_feat_sel)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_feat_sel, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_feat_sel, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_feat_sel,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.7040477770404777


[[5408  671]
 [2451 2019]]


              precision    recall  f1-score   support

      benign     0.6881    0.8896    0.7760      6079
   malicious     0.7506    0.4517    0.5640      4470

    accuracy                         0.7040     10549
   macro avg     0.7193    0.6706    0.6700     10549
weighted avg     0.7146    0.7040    0.6862     10549



### Adaboost

#### PCA mix

In [67]:
# AdaBoost with default settings on the PCA mix: fit on the training split,
# then print accuracy, confusion matrix and per-class report for the test split.
adaboost_model = AdaBoostClassifier().fit(X_train_pca, y_train_pca)
y_pred = adaboost_model.predict(X_test_pca)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_pca, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_pca, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_pca,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.7950516636648024


[[5294  785]
 [1377 3093]]


              precision    recall  f1-score   support

      benign     0.7936    0.8709    0.8304      6079
   malicious     0.7976    0.6919    0.7410      4470

    accuracy                         0.7951     10549
   macro avg     0.7956    0.7814    0.7857     10549
weighted avg     0.7953    0.7951    0.7925     10549



#### Feature Selection mix

In [68]:
# AdaBoost with default settings on the feature-selection mix: fit on the
# training split, then print accuracy, confusion matrix and per-class report.
adaboost_model = AdaBoostClassifier().fit(X_train_feat_sel, y_train_feat_sel)
y_pred = adaboost_model.predict(X_test_feat_sel)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_feat_sel, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_feat_sel, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_feat_sel,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.7886055550289127


[[5331  748]
 [1482 2988]]


              precision    recall  f1-score   support

      benign     0.7825    0.8770    0.8270      6079
   malicious     0.7998    0.6685    0.7282      4470

    accuracy                         0.7886     10549
   macro avg     0.7911    0.7727    0.7776     10549
weighted avg     0.7898    0.7886    0.7852     10549



### Random Forest

In [87]:
# Imported here because the notebook's top import cell does not include it.
from sklearn.ensemble import RandomForestClassifier

# Random forest (depth capped at 16, fixed seed) on the PCA mix.
clf = RandomForestClassifier(max_depth=16, random_state=0).fit(X_train_pca, y_train_pca)
y_pred = clf.predict(X_test_pca)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_pca, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_pca, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_pca,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.827187411129017


[[5661  418]
 [1405 3065]]


              precision    recall  f1-score   support

      benign     0.8012    0.9312    0.8613      6079
   malicious     0.8800    0.6857    0.7708      4470

    accuracy                         0.8272     10549
   macro avg     0.8406    0.8085    0.8160     10549
weighted avg     0.8346    0.8272    0.8230     10549



In [83]:
# Imported here because the notebook's top import cell does not include it.
from sklearn.ensemble import RandomForestClassifier

# Random forest (depth capped at 16, fixed seed) on the feature-selection mix.
clf = RandomForestClassifier(max_depth=16, random_state=0).fit(X_train_feat_sel, y_train_feat_sel)
y_pred = clf.predict(X_test_feat_sel)

# end='\n\n\n' reproduces the blank lines between the three sections.
print(f"Accuracy score = {accuracy_score(y_test_feat_sel, y_pred)}", end='\n\n\n')
print(confusion_matrix(y_test_feat_sel, y_pred), end='\n\n\n')
print(
    classification_report(
        y_test_feat_sel,
        y_pred,
        target_names=['benign', 'malicious'],
        digits=4,
    )
)

Accuracy score = 0.8366669826523842


[[5751  328]
 [1395 3075]]


              precision    recall  f1-score   support

      benign     0.8048    0.9460    0.8697      6079
   malicious     0.9036    0.6879    0.7812      4470

    accuracy                         0.8367     10549
   macro avg     0.8542    0.8170    0.8254     10549
weighted avg     0.8467    0.8367    0.8322     10549

