# Dataset 20 Phase 2 Machine Learning Notebook

In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import libsvm 
libsvm.set_verbosity_wrap(1) 

In [2]:
dataset_20 = pd.read_csv('dataset_20_new_features.csv').drop(columns = ['Unnamed: 0'])
dataset_20.head(1)

Unnamed: 0,url,url_len,ip_add,tld,who_is,https,js_len,js_obf_len,content,label,latitude,longitude,has_IP_in_URL,number_subdomains,hostname,length_hostname,ratio_digits_url,having_@_in_url,ratio_digits_hostname,number_underscores
0,http://members.tripod.com/russiastation/,40,42.77.221.155,0,0,0,58.0,0.0,Named themselves charged particles in a manly ...,good,23.973937,120.982018,0,1,members.tripod.com,18,0.0,0,0.0,0


## Dataset Sample

In [3]:
good_samples = dataset_20[dataset_20['label'] == 'good'].sample(30000, random_state=42)
bad_samples = dataset_20[dataset_20['label'] == 'bad'].sample(29778, random_state=42)

dataset_20_sample = pd.concat([good_samples, bad_samples], axis=0)

In [4]:
X = dataset_20_sample.drop(columns = ['label'])
y = dataset_20_sample['label'].apply(lambda x: 1 if 'bad' in x else 0)

## Train-test-split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
original_features = ['url', 'url_len', 'ip_add', 'latitude', 'longitude', 
                     'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'content']

to_drop = ['url', 'content', 'ip_add', 'hostname']

## Variation 1
X_train_original_features = X_train[original_features].drop(columns=['url', 'content', 'ip_add'])
X_test_original_features = X_test[original_features].drop(columns=['url', 'content', 'ip_add'])

## Variation 2
X_train_original_features_remove_js = X_train[original_features]\
.drop(columns=['js_len', 'js_obf_len'])\
.drop(columns=['url', 'content', 'ip_add'])

X_test_original_features_remove_js = X_test[original_features]\
.drop(columns=['js_len', 'js_obf_len'])\
.drop(columns=['url', 'content', 'ip_add'])

## Variation 3
X_train_custom_features = X_train.drop(columns=to_drop)
X_test_custom_features = X_test.drop(columns=to_drop)

## Variation 4
X_train_custom_features_without_js = X_train.drop(columns = ['js_len', 'js_obf_len']).drop(columns=to_drop)
X_test_custom_features_without_js = X_test.drop(columns = ['js_len', 'js_obf_len']).drop(columns=to_drop)

## Variation 5
X_train_transformer = X_train['url']
X_test_transformer = X_test['url']

## Naive Bayes

### Variation 1 - Original Dataset

In [7]:
gnb = GaussianNB()
gnb.fit(X_train_original_features, y_train)

GaussianNB()

In [8]:
y_pred = gnb.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9492573630050185

In [9]:
confusion_matrix(y_test, y_pred)

array([[9877,    0],
       [1001, 8849]])

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      9877
           1       1.00      0.90      0.95      9850

    accuracy                           0.95     19727
   macro avg       0.95      0.95      0.95     19727
weighted avg       0.95      0.95      0.95     19727



### Variation 2 - Original Dataset with JS features removed

In [12]:
gnb = GaussianNB()
gnb.fit(X_train_original_features_remove_js, y_train)

GaussianNB()

In [13]:
y_pred = gnb.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.8866021189233031

In [14]:
confusion_matrix(y_test, y_pred)

array([[9468,  409],
       [1828, 8022]])

In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      9877
           1       0.95      0.81      0.88      9850

    accuracy                           0.89     19727
   macro avg       0.89      0.89      0.89     19727
weighted avg       0.89      0.89      0.89     19727



### Variation 3 - Original Dataset + Custom Features

In [16]:
gnb = GaussianNB()
gnb.fit(X_train_custom_features, y_train)

GaussianNB()

In [17]:
y_pred = gnb.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9557459319714098

In [18]:
confusion_matrix(y_test, y_pred)

array([[9867,   10],
       [ 863, 8987]])

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      9877
           1       1.00      0.91      0.95      9850

    accuracy                           0.96     19727
   macro avg       0.96      0.96      0.96     19727
weighted avg       0.96      0.96      0.96     19727



### Variation 4 - Original Dataset without JS Features + Custom Features

In [20]:
gnb = GaussianNB()
gnb.fit(X_train_custom_features_without_js, y_train)

GaussianNB()

In [21]:
y_pred = gnb.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.8817863841435596

In [22]:
confusion_matrix(y_test, y_pred)

array([[9150,  727],
       [1605, 8245]])

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89      9877
           1       0.92      0.84      0.88      9850

    accuracy                           0.88     19727
   macro avg       0.88      0.88      0.88     19727
weighted avg       0.88      0.88      0.88     19727



## SVM

In [31]:
param_grid = {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}

### Variation 1 - Original Dataset

In [32]:
svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_original_features, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(estimator=SVC(verbose=3),
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf']})

In [33]:
print(clf.best_params_)
print(clf.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001, verbose=3)


In [34]:
y_pred = clf.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9937648907588584

In [35]:
confusion_matrix(y_test, y_pred)

array([[9845,   32],
       [  91, 9759]])

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9877
           1       1.00      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 2 - Original Dataset with JS features removed

In [37]:
svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_original_features_remove_js, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(estimator=SVC(C=10.0, gamma=0.001, verbose=1),
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf']})

In [38]:
print(clf.best_params_)
print(clf.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001, verbose=1)


In [39]:
y_pred = clf.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.8872104222639022

In [40]:
confusion_matrix(y_test, y_pred)

array([[9406,  471],
       [1754, 8096]])

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      9877
           1       0.95      0.82      0.88      9850

    accuracy                           0.89     19727
   macro avg       0.89      0.89      0.89     19727
weighted avg       0.89      0.89      0.89     19727



### Variation 3 - Original Dataset + Custom Features

In [50]:
svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_custom_features, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(estimator=SVC(verbose=3),
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf']})

In [51]:
print(clf.best_params_)
print(clf.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001, verbose=3)


In [52]:
y_pred = clf.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9936635068687586

In [53]:
confusion_matrix(y_test, y_pred)

array([[9842,   35],
       [  90, 9760]])

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9877
           1       1.00      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 4 - Original Dataset without JS Features + Custom Features

In [56]:
svm_model = svm.SVC(verbose=3)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_custom_features_without_js, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(estimator=SVC(verbose=3),
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['rbf']})

In [57]:
print(clf.best_params_)
print(clf.best_estimator_)

{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001, verbose=3)


In [58]:
y_pred = clf.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.8920261570436457

In [59]:
confusion_matrix(y_test, y_pred)

array([[9386,  491],
       [1639, 8211]])

In [60]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      9877
           1       0.94      0.83      0.89      9850

    accuracy                           0.89     19727
   macro avg       0.90      0.89      0.89     19727
weighted avg       0.90      0.89      0.89     19727



## KNN

### Variation 1 - Original Dataset

In [21]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_original_features, y_train)

KNeighborsClassifier(n_neighbors=100)

In [22]:
y_pred = knn_model.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9786079991889289

In [23]:
confusion_matrix(y_test, y_pred)

array([[9716,  161],
       [ 261, 9589]])

### Variation 2 - Original Dataset with JS features removed

In [24]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_original_features_remove_js, y_train)

KNeighborsClassifier(n_neighbors=100)

In [25]:
y_pred = knn_model.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.7617985502103716

In [26]:
confusion_matrix(y_test, y_pred)

array([[7065, 2812],
       [1887, 7963]])

### Variation 3 - Original Dataset + Custom Features

In [27]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_custom_features, y_train)

KNeighborsClassifier(n_neighbors=100)

In [28]:
y_pred = knn_model.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9776955441780301

In [29]:
confusion_matrix(y_test, y_pred)

array([[9695,  182],
       [ 258, 9592]])

### Variation 4 - Original Dataset without JS Features + Custom Features

In [30]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_custom_features_without_js, y_train)

KNeighborsClassifier(n_neighbors=100)

In [31]:
y_pred = knn_model.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.7402544735641506

In [32]:
confusion_matrix(y_test, y_pred)

array([[6965, 2912],
       [2212, 7638]])

## XGBoost

### Variation 1 - Original Dataset

In [71]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_original_features, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [72]:
y_pred = xgboost_model.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9934607390885588

In [73]:
confusion_matrix(y_test, y_pred)

array([[9824,   53],
       [  76, 9774]])

In [75]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9877
           1       0.99      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 2 - Original Dataset with JS features removed

In [76]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_original_features_remove_js, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [77]:
y_pred = xgboost_model.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.8949662898565418

In [78]:
confusion_matrix(y_test, y_pred)

array([[9063,  814],
       [1258, 8592]])

In [79]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      9877
           1       0.91      0.87      0.89      9850

    accuracy                           0.89     19727
   macro avg       0.90      0.89      0.89     19727
weighted avg       0.90      0.89      0.89     19727



### Variation 3 - Original Dataset + Custom Features

In [80]:
xgboost_model = XGBClassifier()
xgboost_model.fit(X_train_custom_features, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [81]:
y_pred = xgboost_model.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9937141988138085

In [82]:
confusion_matrix(y_test, y_pred)

array([[9823,   54],
       [  70, 9780]])

In [83]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9877
           1       0.99      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 4 - Original Dataset without JS Features + Custom Features

In [171]:
xgboost_model = XGBClassifier(learning_rate=0.4)
xgboost_model.fit(X_train_custom_features_without_js, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.4, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [172]:
y_pred = xgboost_model.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.9048005271962285

In [173]:
confusion_matrix(y_test, y_pred)

array([[9075,  802],
       [1076, 8774]])

In [174]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91      9877
           1       0.92      0.89      0.90      9850

    accuracy                           0.90     19727
   macro avg       0.91      0.90      0.90     19727
weighted avg       0.91      0.90      0.90     19727



## Adaboost

### Variation 1 - Original Dataset

In [176]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_original_features, y_train)

AdaBoostClassifier()

In [177]:
y_pred = adaboost_model.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9937141988138085

In [178]:
confusion_matrix(y_test, y_pred)

array([[9833,   44],
       [  80, 9770]])

In [179]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9877
           1       1.00      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 2 - Original Dataset with JS features removed

In [180]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_original_features_remove_js, y_train)

AdaBoostClassifier()

In [181]:
y_pred = adaboost_model.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.8915699295381964

In [182]:
confusion_matrix(y_test, y_pred)

array([[9337,  540],
       [1599, 8251]])

In [183]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      9877
           1       0.94      0.84      0.89      9850

    accuracy                           0.89     19727
   macro avg       0.90      0.89      0.89     19727
weighted avg       0.90      0.89      0.89     19727



### Variation 3 - Original Dataset + Custom Features

In [186]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_custom_features, y_train)

AdaBoostClassifier()

In [187]:
y_pred = adaboost_model.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9944745779895574

In [188]:
confusion_matrix(y_test, y_pred)

array([[9838,   39],
       [  70, 9780]])

In [189]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9877
           1       1.00      0.99      0.99      9850

    accuracy                           0.99     19727
   macro avg       0.99      0.99      0.99     19727
weighted avg       0.99      0.99      0.99     19727



### Variation 4 - Original Dataset without JS Features + Custom Features

In [190]:
adaboost_model = AdaBoostClassifier(learning_rate=0.4)
adaboost_model.fit(X_train_custom_features_without_js, y_train)

AdaBoostClassifier(learning_rate=0.4)

In [191]:
y_pred = adaboost_model.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.8917220053733462

In [192]:
confusion_matrix(y_test, y_pred)

array([[9417,  460],
       [1676, 8174]])

In [193]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      9877
           1       0.95      0.83      0.88      9850

    accuracy                           0.89     19727
   macro avg       0.90      0.89      0.89     19727
weighted avg       0.90      0.89      0.89     19727



## JavaScript Validation using tools

## Dataset Supplementation

## Transformers 

## Image Classification Approach

### Reading images

In [38]:
import cv2
from glob import glob as globlin

image_paths = globlin('./img_extraction/dataset_20_images/*.png')

benign_images = []
malicious_images = []
pbar = tqdm(total=len(image_paths))

for path in image_paths:
    image = cv2.resize(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB), (512, 512))
    if 'bad' in path:
        malicious_images.append(image)
    else:
        benign_images.append(image)
    pbar.update(1)
pbar.close()

print(f'Starting Number of Malicious Images = {len(malicious_images)}')
print(f'Starting Number of Benign Images = {len(benign_images)}')

100%|██████████| 14097/14097 [02:14<00:00, 105.00it/s]

Starting Number of Malicious Images = 6101
Starting Number of Benign Images = 7996





### Removing images with a could not connect page, or white page

In [39]:
from skimage.metrics import structural_similarity as ssim
 
def cleave_error_images(reference_image, cutoff_score, image_list):
    cleave_index = []
    for i in tqdm(range(0, len(image_list))):
        try:
            ssim_noise = ssim(reference_image, image_list[i], multichannel=True)
            if ssim_noise >= cutoff_score:
                cleave_index.append(i)
        except:
            cleave_index.append(i)
    return cleave_index


reference_image = malicious_images[4]
malicious_images_idx = cleave_error_images(reference_image, 0.9, malicious_images)
benign_image_idx = cleave_error_images(reference_image, 0.9, benign_images)

print(f'Number of Malicious Images after deletion = {len(malicious_images) - len(malicious_images_idx)}')
print(f'Number of Benign Images after deletion = {len(benign_images) - len(benign_image_idx)}')

100%|██████████| 6101/6101 [05:11<00:00, 19.59it/s]
100%|██████████| 7996/7996 [06:43<00:00, 19.84it/s]

Number of Malicious Images after deletion = 6100
Number of Benign Images after deletion = 7996





In [40]:
new_malicious_images = np.delete(np.array(malicious_images), malicious_images_idx, axis=0)
new_benign_images = np.delete(np.array(benign_images), benign_image_idx, axis=0)

In [41]:
np.savez('./img_extraction/malicious_images.npz', new_malicious_images)
np.savez('./img_extraction/benign_images.npz', new_benign_images)

### Image checkpoint

In [42]:
malicious_img = np.load('./img_extraction/malicious_images.npz', allow_pickle=True)['arr_0']
benign_img = np.load('./img_extraction/benign_images.npz', allow_pickle=True)['arr_0']

 ### Machine Learning 

In [43]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 1144343241740423126]

In [None]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "0"

#### Train-test-split

In [44]:
import tensorflow as tf

In [45]:
y_malicious = np.full(len(malicious_img), 1)
y_benign = np.full(len(benign_img), 0)             

labels_df = np.array(np.concatenate([y_malicious, y_benign]).astype(np.float32))

In [46]:
image_df = np.array(np.concatenate([malicious_img, benign_img]))

# image_df = [image_df[i].astype(np.int) for i in tqdm(range(len(image_df)))]

In [47]:
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(image_df, labels_df, test_size=0.33, random_state=42)

#### Lenet-5

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, AveragePooling2D, Flatten, Dropout

model = Sequential([
    Conv2D(filters=6, kernel_size=(3, 3), activation='relu', input_shape=X_train_img[0].shape),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    AveragePooling2D(),
    Flatten(),
    Dense(units=120, activation='relu'),
    Dense(units=84, activation='relu'),
    Dense(units=1, activation = 'softmax')
])

In [51]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_history = model.fit(X_train_img, y_train_img, #validation_data=(X_valid_NN, y_valid_NN), 
                          epochs=15, batch_size=256, verbose=True)

Epoch 1/15
Epoch 2/15
 4/37 [==>...........................] - ETA: 9:47 - loss: 213.0984 - accuracy: 0.4080 

KeyboardInterrupt: 

In [None]:
y_pred = (model.predict(X_test_img) > 0.5).astype("int32")

accuracy_score(y_test_img, y_pred)

In [None]:
confusion_matrix(y_test_img, y_pred)

#### Pretrained inception V3

In [None]:
inception_model = tf.keras.applications.InceptionV3(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=X_train_img[0].shape,
    pooling=None, classes=1000, classifier_activation=None
)

for layer in inception_model.layers:
    layer.trainable = False
x = Flatten()(inception_model.output)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(75, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(1, activation='sigmoid')(x)

inception_model = tf.keras.Model(inception_model.input, x)
#inception_model.summary()

In [None]:
epochs = 5
batch_size = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, name='adam')
loss_function = tf.keras.losses.binary_crossentropy

inception_model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

history = inception_model.fit(x=X_train_img,
                   y=y_train_img,
                   batch_size=batch_size,
                   epochs=epochs,
                   validation_split=0.2)

In [None]:
y_pred_proba = inception_model.predict(X_test_img)
y_pred = np.argmax(y_pred_proba, axis=-1)
accuracy_score(y_test_img, y_pred)

In [None]:
confusion_matrix(y_test_img, y_pred)