In [40]:
import pandas as pd 
import numpy as np
import sys
from tqdm import tqdm

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

from xgboost import XGBClassifier

# Raise NumPy's summarisation threshold to the maximum so printed arrays
# (e.g. the full column listing below) are shown without "..." truncation.
np.set_printoptions(threshold=sys.maxsize)

## Reading the Dataset

In [16]:
# Load the feature table that make_classification() reads for every experiment
# below, then preview its first rows.
df_surtur = pd.read_csv('super_final_dataset_surtur.csv')
df_surtur.head()

Unnamed: 0,content,has_IP_in_url,having_@_in_url,hostname,https,label,length_hostname,number_subdomains,number_underscores,ratio_digits_hostname,...,content_emb_12,content_emb_13,content_emb_14,content_emb_15,content_emb_16,content_emb_17,content_emb_18,content_emb_19,is_in_alexa,num_?s
0,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,00.124.324.77.00.opteamevent.hu,0,1,31.0,6.0,0.0,0.387097,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0
1,�MQTH1TDa1��fl�hF1bVOFESLCnBbRI9MRTH1PDa1SP...,0.0,0.0,00005ik.rcomhost.com,0,1,20.0,1.0,0.0,0.25,...,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.04416,0.044159,0,0
2,"<!DOCTYPE html>\n<html data-adblockkey=""MFwwDQ...",0.0,0.0,000098.ihostfull.com,0,1,20.0,1.0,0.0,0.3,...,0.046553,0.048641,0.048712,0.047349,0.048684,0.047914,0.048007,0.049129,0,0
3,"<html>\n<head>\n<meta content=""noarchive"" name...",0.0,0.0,000p6vl.wcomhost.com,0,1,20.0,1.0,0.0,0.2,...,0.047654,0.04767,0.047664,0.047662,0.04767,0.047674,0.04766,0.047667,1,0
4,"<html><head><meta content=""no-cache"" http-equi...",0.0,0.0,001.002.003.23.opteamevent.hu,0,1,29.0,5.0,0.0,0.37931,...,0.045458,0.045459,0.045461,0.045461,0.045458,0.045458,0.045458,0.04546,0,0


In [17]:
# List every column label of the dataset as a NumPy array.
print(df_surtur.columns.to_numpy())

['content' 'has_IP_in_url' 'having_@_in_url' 'hostname' 'https' 'label'
 'length_hostname' 'number_subdomains' 'number_underscores'
 'ratio_digits_hostname' 'ratio_digits_url' 'tld' 'url' 'url_len' 'who_is'
 'js' 'js_len' 'js_ref' 'js_array_len_avg' 'js_array_len_max'
 'content_len' 'num_js_func_calls' 'malicious_func_count'
 'total_url_count' 'ext_url_count' 'num_semicolons' 'num_zeros'
 'num_spaces' 'num_hyphens' 'num_@s' 'num_queries' 'num_ampersands'
 'num_equals' 'domain' 'domain_len' 'img_mean' 'img_mean_red'
 'image_mobnet_0' 'image_mobnet_1' 'image_mobnet_2' 'image_mobnet_3'
 'image_mobnet_4' 'image_mobnet_5' 'image_mobnet_6' 'image_mobnet_7'
 'image_mobnet_8' 'image_mobnet_9' 'image_mobnet_10' 'image_mobnet_11'
 'image_mobnet_12' 'image_mobnet_13' 'image_mobnet_14' 'image_mobnet_15'
 'image_mobnet_16' 'image_mobnet_17' 'image_mobnet_18' 'image_mobnet_19'
 'dcd_color_1' 'dcd_color_2' 'dcd_color_3' 'dcd_color_4' 'dcd_color_5'
 'google_is_safe' 'ip_address' 'location' 'url_emb_me

## Machine Learning

In [18]:
def make_classification(features_to_keep):
    """Fit an XGBoost classifier on selected ``df_surtur`` columns and print its scores.

    The rows of the module-level ``df_surtur`` frame are split 90/10 into a
    train and a test part (fixed ``random_state=20``). An ``XGBClassifier`` is
    fitted on the training part, and for the test part the accuracy, a
    classification report and every feature's importance (descending) are
    printed.

    Parameters
    ----------
    features_to_keep : iterable of str
        Column names of ``df_surtur`` to select. The columns listed in
        ``non_feature_cols`` below may be included or left out; any that are
        present are removed before training. Repeated names are used once.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    KeyError
        If a requested column does not exist in ``df_surtur``.
    """
    # Columns that are never fed to the model (the target itself among them).
    non_feature_cols = ['label', 'content', 'hostname', 'url', 'js',
                        'domain', 'google_is_safe', 'ip_address']

    # De-duplicate while preserving first-seen order, so overlapping feature
    # groups cannot produce repeated column names in X.
    selected = list(dict.fromkeys(features_to_keep))

    # errors='ignore': callers are not forced to append the non-feature columns
    # to their selection just so that this drop succeeds.
    X = df_surtur[selected].drop(columns=non_feature_cols, errors='ignore')
    y = df_surtur['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=20)

    xgboost_model = XGBClassifier(verbosity=0,
                                  max_depth=7,
                                  min_child_weight=1,
                                  n_estimators=165,
                                  colsample_bylevel=1,
                                  colsample_bytree=1,
                                  num_parallel_tree=1,
                                  learning_rate=0.3,
                                  tree_method='exact',
                                  booster='dart',
                                  gamma=1e-10,
                                  alpha=0,
                                  scale_pos_weight=1,  # 1.375520774687535,
                                  subsample=1,
                                  n_jobs=-1)

    xgboost_model.fit(X_train, y_train)

    y_pred = xgboost_model.predict(X_test)

    print(f'Accuracy_score = {accuracy_score(y_test, y_pred)}')
    print('\n')
    # target_names are assigned in sorted label order; this assumes the smaller
    # label value means "benign" -- TODO confirm against the dataset.
    print(classification_report(y_test, y_pred, target_names=['benign', 'malicious'], digits=4))

    # Pair each training column with its importance and rank from most to least
    # important (the sort is stable, so ties keep their column order).
    ranked_importances = sorted(zip(X.columns, xgboost_model.feature_importances_),
                                key=lambda pair: pair[1],
                                reverse=True)

    print("{:<25} {:<25}".format('Feature' ,'Importance'))
    for feature_name, importance in ranked_importances:
        print("{:<25} {:<25%}".format(feature_name, importance))

    

### All Features without embeddings

In [19]:
# Baseline selection: every column whose name contains none of the
# embedding / image-summary markers.
baseline_feats = [
    column
    for column in df_surtur.columns
    if not any(marker in column for marker in ['emb', 'mobnet', 'img_mean', 'url_emb_mean'])
]

make_classification(baseline_feats)



Accuracy_score = 0.8310740354535975


              precision    recall  f1-score   support

      benign     0.8182    0.9087    0.8611      6079
   malicious     0.8539    0.7255    0.7845      4470

    accuracy                         0.8311     10549
   macro avg     0.8361    0.8171    0.8228     10549
weighted avg     0.8333    0.8311    0.8286     10549

Feature                   Importance               
who_is                    12.321550%               
unique_url_nums           8.626608%                
unique_url_chars          6.183530%                
https                     4.710356%                
tld                       4.121293%                
number_subdomains         3.091990%                
url_len                   3.065580%                
num_hyphens               2.782290%                
having_@_in_url           2.776576%                
is_in_alexa               2.729821%                
dcd_color_1               2.676867%                
location   

### Lexical Features

In [20]:
# Columns that make_classification() strips from the feature matrix before
# training; they are appended to every feature group passed to it below.
feats_to_drop = [
    'label',
    'content',
    'hostname',
    'url',
    'js',
    'domain',
    'google_is_safe',
    'ip_address',
]

In [21]:
# Lexical feature group. Line grouping below follows column-name themes only;
# the order of the entries is what determines the column order of X.
lexical_features = [
    'has_IP_in_url', 'having_@_in_url',
    'length_hostname', 'number_subdomains', 'number_underscores',
    'ratio_digits_hostname', 'ratio_digits_url',
    'tld', 'url_len',
    'num_semicolons', 'num_zeros', 'num_spaces', 'num_hyphens',
    'num_@s', 'num_queries', 'num_ampersands', 'num_equals',
    'domain_len', 'hex_len', 'has_hex',
    'unique_url_chars', 'unique_url_nums', 'unique_url_letters',
    'ratio_let_chars', 'ratio_nums_chars',
    'num_?s',
]

# Evaluate the lexical feature group on its own.
make_classification(lexical_features + feats_to_drop)

Accuracy_score = 0.754763484690492


              precision    recall  f1-score   support

      benign     0.7482    0.8659    0.8027      6079
   malicious     0.7680    0.6036    0.6759      4470

    accuracy                         0.7548     10549
   macro avg     0.7581    0.7348    0.7393     10549
weighted avg     0.7566    0.7548    0.7490     10549

Feature                   Importance               
unique_url_nums           12.100916%               
has_hex                   9.470193%                
unique_url_chars          9.390506%                
tld                       7.404562%                
having_@_in_url           6.221763%                
number_subdomains         5.808054%                
num_hyphens               4.286741%                
number_underscores        4.279814%                
num_equals                4.208982%                
num_queries               3.669810%                
url_len                   3.467369%                
num_ampersan

### Host-based Features

In [22]:
# Host-based feature group.
host_feats = [
    'https',
    'who_is',
    'is_in_alexa',
    'location',
]

# First the host-based group alone ...
make_classification(host_feats + feats_to_drop)
# ... then lexical and host-based features together.
make_classification(lexical_features + host_feats + feats_to_drop)

Accuracy_score = 0.6570291022845768


              precision    recall  f1-score   support

      benign     0.6615    0.8289    0.7358      6079
   malicious     0.6453    0.4233    0.5112      4470

    accuracy                         0.6570     10549
   macro avg     0.6534    0.6261    0.6235     10549
weighted avg     0.6547    0.6570    0.6407     10549

Feature                   Importance               
is_in_alexa               48.669788%               
location                  32.508582%               
https                     9.596092%                
who_is                    9.225541%                
Accuracy_score = 0.8002654280026543


              precision    recall  f1-score   support

      benign     0.7897    0.8906    0.8371      6079
   malicious     0.8199    0.6774    0.7419      4470

    accuracy                         0.8003     10549
   macro avg     0.8048    0.7840    0.7895     10549
weighted avg     0.8025    0.8003    0.7968     10549

Feature   

### Content-based Features

In [23]:
# Content-based feature group: page/JavaScript statistics followed by the five
# dcd_color_* columns (generated rather than spelled out).
content_feats = [
    'js_len', 'js_ref', 'js_array_len_avg', 'js_array_len_max',
    'content_len', 'num_js_func_calls', 'malicious_func_count',
    'total_url_count', 'ext_url_count',
] + [f'dcd_color_{idx}' for idx in range(1, 6)]

# First the content-based group alone ...
make_classification(content_feats + feats_to_drop)
# ... then lexical, host-based and content-based features together.
make_classification(lexical_features + host_feats + feats_to_drop + content_feats)

Accuracy_score = 0.7630107119158214


              precision    recall  f1-score   support

      benign     0.7628    0.8544    0.8060      6079
   malicious     0.7634    0.6387    0.6955      4470

    accuracy                         0.7630     10549
   macro avg     0.7631    0.7466    0.7508     10549
weighted avg     0.7630    0.7630    0.7592     10549

Feature                   Importance               
js_array_len_avg          11.331429%               
dcd_color_2               10.857553%               
js_len                    8.827004%                
dcd_color_1               8.539956%                
content_len               7.607314%                
num_js_func_calls         6.972174%                
js_array_len_max          6.771773%                
malicious_func_count      6.755401%                
ext_url_count             6.497730%                
total_url_count           6.315522%                
js_ref                    6.143576%                
dcd_color_5

### URL Embeddings from the Longformer

https://huggingface.co/transformers/model_doc/longformer.html

In [24]:
# The two summary columns followed by url_emb_0 ... url_emb_19.
url_emb_feats = ['url_emb_mean', 'url_emb_mean_red'] + [
    f'url_emb_{idx}' for idx in range(20)
]

# First the URL-embedding columns alone ...
make_classification(url_emb_feats + feats_to_drop)
# ... then together with the lexical, host-based and content-based groups.
make_classification(url_emb_feats + feats_to_drop + lexical_features + host_feats + content_feats)

Accuracy_score = 0.7643378519290928


              precision    recall  f1-score   support

      benign     0.7532    0.8791    0.8113      6079
   malicious     0.7872    0.6083    0.6863      4470

    accuracy                         0.7643     10549
   macro avg     0.7702    0.7437    0.7488     10549
weighted avg     0.7676    0.7643    0.7583     10549

Feature                   Importance               
url_emb_mean_red          19.876653%               
url_emb_1                 18.713418%               
url_emb_mean              9.050211%                
url_emb_3                 3.517240%                
url_emb_8                 3.114260%                
url_emb_7                 3.106756%                
url_emb_4                 3.079773%                
url_emb_5                 3.053969%                
url_emb_2                 3.037693%                
url_emb_10                2.862465%                
url_emb_14                2.859021%                
url_emb_13 

### Image Embeddings

In [25]:
# The two image summary columns followed by image_mobnet_0 ... image_mobnet_19.
img_mobnet_embeddings = ['img_mean', 'img_mean_red'] + [
    f'image_mobnet_{idx}' for idx in range(20)
]

# First the image-embedding columns alone ...
make_classification(img_mobnet_embeddings + feats_to_drop)
# ... then together with every feature group defined so far.
make_classification(url_emb_feats + feats_to_drop + lexical_features + host_feats
                    + content_feats + img_mobnet_embeddings)

Accuracy_score = 0.6128542989856859


              precision    recall  f1-score   support

      benign     0.6194    0.8510    0.7170      6079
   malicious     0.5878    0.2890    0.3875      4470

    accuracy                         0.6129     10549
   macro avg     0.6036    0.5700    0.5523     10549
weighted avg     0.6060    0.6129    0.5774     10549

Feature                   Importance               
img_mean_red              15.427394%               
img_mean                  12.161608%               
image_mobnet_17           4.936055%                
image_mobnet_14           3.774932%                
image_mobnet_10           3.752293%                
image_mobnet_15           3.655128%                
image_mobnet_16           3.629029%                
image_mobnet_11           3.626671%                
image_mobnet_8            3.581095%                
image_mobnet_3            3.567450%                
image_mobnet_19           3.534382%                
image_mobne

### Content Embeddings

In [26]:
# content_emb_0 ... content_emb_19.
content_embeddings = [f'content_emb_{idx}' for idx in range(20)]

# First the content-embedding columns alone ...
make_classification(content_embeddings + feats_to_drop)
# ... then together with every feature group defined so far.
make_classification(url_emb_feats + feats_to_drop + lexical_features + host_feats
                    + content_feats + img_mobnet_embeddings + content_embeddings)

Accuracy_score = 0.6656555123708409


              precision    recall  f1-score   support

      benign     0.6535    0.8937    0.7550      6079
   malicious     0.7110    0.3555    0.4740      4470

    accuracy                         0.6657     10549
   macro avg     0.6822    0.6246    0.6145     10549
weighted avg     0.6778    0.6657    0.6359     10549

Feature                   Importance               
content_emb_19            13.811943%               
content_emb_2             7.238602%                
content_emb_13            5.884395%                
content_emb_12            5.725258%                
content_emb_10            5.571158%                
content_emb_0             4.946371%                
content_emb_18            4.665475%                
content_emb_3             4.563457%                
content_emb_6             4.528358%                
content_emb_8             4.362756%                
content_emb_9             4.191156%                
content_emb

### DCD

In [27]:
# dcd_color_1 ... dcd_color_5.
dcd_feats = [f'dcd_color_{idx}' for idx in range(1, 6)]

# Evaluate the dcd_color_* columns on their own.
make_classification(dcd_feats + feats_to_drop)

Accuracy_score = 0.6938098397952412


              precision    recall  f1-score   support

      benign     0.7031    0.8112    0.7533      6079
   malicious     0.6753    0.5342    0.5966      4470

    accuracy                         0.6938     10549
   macro avg     0.6892    0.6727    0.6749     10549
weighted avg     0.6913    0.6938    0.6869     10549

Feature                   Importance               
dcd_color_2               29.230213%               
dcd_color_1               25.545219%               
dcd_color_3               15.549251%               
dcd_color_5               14.965314%               
dcd_color_4               14.710005%               


### Best scores

In [32]:
# Reduced combination: the full lexical, host-based and content-based groups
# plus only three image columns and three Longformer URL-embedding columns.
make_classification(feats_to_drop + lexical_features + host_feats
                    + content_feats + ['img_mean', 'img_mean_red', 'image_mobnet_9']
                    + ['url_emb_mean', 'url_emb_mean_red', 'url_emb_1'])

Accuracy_score = 0.8398900369703289


              precision    recall  f1-score   support

      benign     0.8237    0.9187    0.8687      6079
   malicious     0.8689    0.7327    0.7950      4470

    accuracy                         0.8399     10549
   macro avg     0.8463    0.8257    0.8318     10549
weighted avg     0.8429    0.8399    0.8374     10549

Feature                   Importance               
url_emb_1                 7.933941%                
url_emb_mean_red          6.343190%                
unique_url_nums           5.074929%                
who_is                    4.967113%                
url_emb_mean              3.353893%                
num_hyphens               3.353597%                
num_equals                3.083605%                
js_len                    2.952749%                
tld                       2.948724%                
is_in_alexa               2.921225%                
num_ampersands            2.652247%                
number_subd

## Additional embeddings from BERT

In [42]:
# Load the BERT sliding-window URL embeddings.
# Assumes the file has one row per row of df_surtur, in the same order -- TODO confirm.
url_features = pd.read_csv('url_feats_sliding_window.csv')

# NOTE(review): the scaler and the chi2 selector below are fitted on ALL rows,
# including the rows make_classification() later holds out as its test split
# (and the selector sees their labels). Scores that use these columns are
# therefore optimistic.
scaler = MinMaxScaler()
# Min-max scaling maps every column into [0, 1]; chi2 needs non-negative inputs.
url_features_scaled = scaler.fit_transform(url_features)

# Per-row mean over all scaled embedding dimensions.
url_features_mean = url_features_scaled.mean(axis=1)

# Keep the 20 dimensions with the highest chi2 score against the label.
url_feat_new = SelectKBest(chi2, k=20).fit_transform(url_features_scaled, df_surtur['label'])
url_feat_new = pd.DataFrame(url_feat_new)

# Name the selected dimensions url_emb_20, url_emb_21, ... so they continue
# after the existing url_emb_0 ... url_emb_19 columns.
feat_array = [f'url_emb_{i+20}' for i in range(len(url_feat_new.columns))]
url_feat_new.columns = feat_array

df_surtur['url_emb_mean_bert'] = url_features_mean
# .values: assign positionally instead of aligning on the index.
df_surtur['url_emb_mean_red_bert'] = url_feat_new.mean(axis=1).values

# Drop any url_emb_20+ columns left behind by an earlier run of this cell
# before appending, so re-running the cell does not create duplicate columns.
df_surtur = pd.concat([df_surtur.drop(columns=feat_array, errors='ignore'), url_feat_new],
                      axis=1)

100%|██████████| 20/20 [00:00<00:00, 144134.16it/s]


In [44]:
# The two BERT summary columns followed by url_emb_20 ... url_emb_39.
url_emb_feats_bert = ['url_emb_mean_bert', 'url_emb_mean_red_bert'] + [
    f'url_emb_{idx}' for idx in range(20, 40)
]


# First the BERT URL-embedding columns alone ...
make_classification(url_emb_feats_bert + feats_to_drop)
# ... then together with every feature group defined so far.
make_classification(url_emb_feats + feats_to_drop + lexical_features + host_feats
                    + content_feats + img_mobnet_embeddings + content_embeddings + url_emb_feats_bert)

Accuracy_score = 0.7511612475116125


              precision    recall  f1-score   support

      benign     0.7529    0.8457    0.7966      6079
   malicious     0.7479    0.6226    0.6795      4470

    accuracy                         0.7512     10549
   macro avg     0.7504    0.7341    0.7381     10549
weighted avg     0.7508    0.7512    0.7470     10549

Feature                   Importance               
url_emb_20                19.957052%               
url_emb_29                12.471469%               
url_emb_mean_red_bert     6.176529%                
url_emb_mean_bert         5.323802%                
url_emb_24                4.307182%                
url_emb_34                4.020206%                
url_emb_33                3.940124%                
url_emb_21                3.879892%                
url_emb_27                3.556989%                
url_emb_35                3.455467%                
url_emb_22                3.353843%                
url_emb_23 

## Best scores again (with BERT embeddings)

In [45]:
# Same reduced combination as the earlier best-score run, extended with three
# of the BERT-derived URL-embedding columns.
make_classification(feats_to_drop + lexical_features + host_feats
                    + content_feats + ['img_mean', 'img_mean_red', 'image_mobnet_9']
                    + ['url_emb_mean', 'url_emb_mean_red', 'url_emb_1', 
                       'url_emb_mean_bert', 'url_emb_mean_red_bert', 'url_emb_20'])

Accuracy_score = 0.8427339084273391


              precision    recall  f1-score   support

      benign     0.8290    0.9161    0.8704      6079
   malicious     0.8669    0.7430    0.8001      4470

    accuracy                         0.8427     10549
   macro avg     0.8479    0.8295    0.8353     10549
weighted avg     0.8450    0.8427    0.8406     10549

Feature                   Importance               
url_emb_20                5.510643%                
url_emb_mean_red          5.413893%                
url_emb_1                 5.160495%                
num_ampersands            5.007708%                
https                     4.076918%                
url_emb_mean_bert         3.796947%                
is_in_alexa               3.084218%                
who_is                    2.945010%                
tld                       2.922234%                
number_subdomains         2.876776%                
js_len                    2.760551%                
location   

In [47]:
# Persist the augmented table (including the BERT embedding columns added
# above); index=False keeps the row index out of the file.
df_surtur.to_csv('final_dataset_both_embeddings.csv', index=False)

In [50]:
# Display every column label of the augmented dataset as a NumPy array.
df_surtur.columns.to_numpy()

array(['content', 'has_IP_in_url', 'having_@_in_url', 'hostname', 'https',
       'label', 'length_hostname', 'number_subdomains',
       'number_underscores', 'ratio_digits_hostname', 'ratio_digits_url',
       'tld', 'url', 'url_len', 'who_is', 'js', 'js_len', 'js_ref',
       'js_array_len_avg', 'js_array_len_max', 'content_len',
       'num_js_func_calls', 'malicious_func_count', 'total_url_count',
       'ext_url_count', 'num_semicolons', 'num_zeros', 'num_spaces',
       'num_hyphens', 'num_@s', 'num_queries', 'num_ampersands',
       'num_equals', 'domain', 'domain_len', 'img_mean', 'img_mean_red',
       'image_mobnet_0', 'image_mobnet_1', 'image_mobnet_2',
       'image_mobnet_3', 'image_mobnet_4', 'image_mobnet_5',
       'image_mobnet_6', 'image_mobnet_7', 'image_mobnet_8',
       'image_mobnet_9', 'image_mobnet_10', 'image_mobnet_11',
       'image_mobnet_12', 'image_mobnet_13', 'image_mobnet_14',
       'image_mobnet_15', 'image_mobnet_16', 'image_mobnet_17',
       'image