# Dataset 20 Phase 2 Machine Learning Notebook

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
dataset_20 = pd.read_csv('dataset_20_new_features.csv').drop(columns = ['Unnamed: 0'])
dataset_20.head(1)

Unnamed: 0,url,url_len,ip_add,tld,who_is,https,js_len,js_obf_len,content,label,latitude,longitude,has_IP_in_URL,number_subdomains,hostname,length_hostname,ratio_digits_url,having_@_in_url,ratio_digits_hostname,number_underscores
0,http://members.tripod.com/russiastation/,40,42.77.221.155,0,0,0,58.0,0.0,Named themselves charged particles in a manly ...,good,23.973937,120.982018,0,1,members.tripod.com,18,0.0,0,0.0,0


## Dataset Sample

In [6]:
good_samples = dataset_20[dataset_20['label'] == 'good'].sample(30000, random_state=42)
bad_samples = dataset_20[dataset_20['label'] == 'bad'].sample(29778, random_state=42)

dataset_20_sample = pd.concat([good_samples, bad_samples], axis=0)

In [7]:
X = dataset_20_sample.drop(columns = ['label'])
y = dataset_20_sample['label'].apply(lambda x: 1 if 'bad' in x else 0)

## Train-test-split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
original_features = ['url', 'url_len', 'ip_add', 'latitude', 'longitude', 
                     'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'content']

to_drop = ['url', 'content', 'ip_add', 'hostname']

## Variation 1
X_train_original_features = X_train[original_features].drop(columns=['url', 'content', 'ip_add'])
X_test_original_features = X_test[original_features].drop(columns=['url', 'content', 'ip_add'])

## Variation 2
X_train_original_features_remove_js = X_train[original_features]\
.drop(columns=['js_len', 'js_obf_len'])\
.drop(columns=['url', 'content', 'ip_add'])

X_test_original_features_remove_js = X_test[original_features]\
.drop(columns=['js_len', 'js_obf_len'])\
.drop(columns=['url', 'content', 'ip_add'])

## Variation 3
X_train_custom_features = X_train.drop(columns=to_drop)
X_test_custom_features = X_test.drop(columns=to_drop)

## Variation 4
X_train_custom_features_without_js = X_train.drop(columns = ['js_len', 'js_obf_len']).drop(columns=to_drop)
X_test_custom_features_without_js = X_test.drop(columns = ['js_len', 'js_obf_len']).drop(columns=to_drop)

## Variation 5
X_train_transformer = X_train['url']
X_test_transformer = X_test['url']

## Naive Bayes

### Variation 1 - Original Dataset

In [10]:
gnb = GaussianNB()
gnb.fit(X_train_original_features, y_train)

GaussianNB()

In [11]:
y_pred = gnb.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9492573630050185

In [12]:
confusion_matrix(y_test, y_pred)

array([[9877,    0],
       [1001, 8849]])

### Variation 2 - Original Dataset with JS features removed

In [13]:
gnb = GaussianNB()
gnb.fit(X_train_original_features_remove_js, y_train)

GaussianNB()

In [14]:
y_pred = gnb.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.8866021189233031

In [15]:
confusion_matrix(y_test, y_pred)

array([[9468,  409],
       [1828, 8022]])

### Variation 3 - Original Dataset + Custom Features

In [16]:
gnb = GaussianNB()
gnb.fit(X_train_custom_features, y_train)

GaussianNB()

In [17]:
y_pred = gnb.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9557459319714098

In [18]:
confusion_matrix(y_test, y_pred)

array([[9867,   10],
       [ 863, 8987]])

### Variation 4 - Original Dataset without JS Features + Custom Features

In [19]:
gnb = GaussianNB()
gnb.fit(X_train_custom_features_without_js, y_train)

GaussianNB()

In [20]:
y_pred = gnb.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.8817863841435596

In [21]:
confusion_matrix(y_test, y_pred)

array([[9150,  727],
       [1605, 8245]])

## SVM

In [22]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

### Variation 1 - Original Dataset

In [None]:
svm_model = svm.SVC(verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_original_features, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:
y_pred = clf.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

### Variation 2 - Original Dataset with JS features removed

In [None]:
svm_model = svm.SVC(C=10.0, gamma=0.001, kernel='rbf', verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_original_features_remove_js, y_train)

In [None]:
y_pred = clf.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

### Variation 3 - Original Dataset + Custom Features

In [None]:
svm_model = svm.SVC(C=10.0, gamma=0.001, kernel='rbf', verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_custom_features, y_train)

In [None]:
y_pred = clf.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

### Variation 4 - Original Dataset without JS Features + Custom Features

In [None]:
svm_model = svm.SVC(C=10.0, gamma=0.001, kernel='rbf', verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_custom_features_without_js, y_trains)

In [None]:
y_pred = clf.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

## KNN

### Variation 1 - Original Dataset

In [38]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_original_features, y_train)

KNeighborsClassifier(n_neighbors=100)

In [39]:
y_pred = knn_model.predict(X_test_original_features)
accuracy_score(y_test, y_pred)

0.9786079991889289

In [40]:
confusion_matrix(y_test, y_pred)

array([[9716,  161],
       [ 261, 9589]])

### Variation 2 - Original Dataset with JS features removed

In [41]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_original_features_remove_js, y_train)

KNeighborsClassifier(n_neighbors=100)

In [42]:
y_pred = knn_model.predict(X_test_original_features_remove_js)
accuracy_score(y_test, y_pred)

0.7617985502103716

In [43]:
confusion_matrix(y_test, y_pred)

array([[7065, 2812],
       [1887, 7963]])

### Variation 3 - Original Dataset + Custom Features

In [44]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_custom_features, y_train)

KNeighborsClassifier(n_neighbors=100)

In [45]:
y_pred = knn_model.predict(X_test_custom_features)
accuracy_score(y_test, y_pred)

0.9776955441780301

In [46]:
confusion_matrix(y_test, y_pred)

array([[9695,  182],
       [ 258, 9592]])

### Variation 4 - Original Dataset without JS Features + Custom Features

In [47]:
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_custom_features_without_js, y_train)

KNeighborsClassifier(n_neighbors=100)

In [48]:
y_pred = knn_model.predict(X_test_custom_features_without_js)
accuracy_score(y_test, y_pred)

0.7402544735641506

In [49]:
confusion_matrix(y_test, y_pred)

array([[6965, 2912],
       [2212, 7638]])

## JavaScript Validation using tools

## Dataset Supplementation

## Transformers 

## Image Classification Approach

### Reading images

In [17]:
import cv2
from glob import glob as globlin

image_paths = globlin('./img_extraction/dataset_20_images/*.png')

benign_images = []
malicious_images = []
pbar = tqdm(total=len(image_paths))

for path in image_paths:
    image = cv2.resize(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB), (512, 512))
    if 'bad' in path:
        malicious_images.append(image)
    else:
        benign_images.append(image)
    pbar.update(1)
pbar.close()

print(f'Starting Number of Malicious Images = {len(malicious_images)}')
print(f'Starting Number of Benign Images = {len(benign_images)}')

100%|████████████████████████████████████████████████████████████████████████████| 14097/14097 [02:41<00:00, 87.20it/s]

Starting Number of Malicious Images = 6101
Starting Number of Benign Images = 7996





### Removing images with a could not connect page, or white page

In [18]:
from skimage.metrics import structural_similarity as ssim
 
def cleave_error_images(reference_image, cutoff_score, image_list):
    cleave_index = []
    for i in tqdm(range(0, len(image_list))):
        try:
            ssim_noise = ssim(reference_image, image_list[i], multichannel=True)
            if ssim_noise >= cutoff_score:
                cleave_index.append(i)
        except:
            cleave_index.append(i)
    return cleave_index


reference_image = malicious_images[4]
malicious_images_idx = cleave_error_images(reference_image, 0.9, malicious_images)
benign_image_idx = cleave_error_images(reference_image, 0.9, benign_images)

print(f'Number of Malicious Images after deletion = {len(malicious_images) - len(malicious_images_idx)}')
print(f'Number of Benign Images after deletion = {len(benign_images) - len(benign_image_idx)}')

100%|██████████████████████████████████████████████████████████████████████████████| 6101/6101 [10:35<00:00,  9.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 7996/7996 [14:06<00:00,  9.44it/s]

Number of Malicious Images after deletion = 4492
Number of Benign Images after deletion = 6276





In [19]:
new_malicious_images = np.delete(np.array(malicious_images), malicious_images_idx, axis=0)
new_benign_images = np.delete(np.array(benign_images), benign_image_idx, axis=0)

In [20]:
np.savez('./img_extraction/malicious_images.npz', new_malicious_images)
np.savez('./img_extraction/benign_images.npz', new_benign_images)

### Image checkpoint

In [5]:
malicious_img = np.load('./img_extraction/malicious_images.npz', allow_pickle=True)['arr_0']
benign_img = np.load('./img_extraction/benign_images.npz', allow_pickle=True)['arr_0']

 ### Machine Learning 

In [6]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 5902421674490658866,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 2251764532
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16819508529578550179
 physical_device_desc: "device: 0, name: NVIDIA Quadro T1000 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5"]

In [7]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "0"

#### Train-test-split

In [8]:
import tensorflow as tf

In [9]:
y_malicious = np.full(len(malicious_img), 1)
y_benign = np.full(len(benign_img), 0)             

labels_df = np.array(np.concatenate([y_malicious, y_benign]).astype(np.float32))

In [10]:
image_df = np.array(np.concatenate([malicious_img, benign_img]))

# image_df = [image_df[i].astype(np.int) for i in tqdm(range(len(image_df)))]

In [11]:
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(image_df, labels_df, test_size=0.33, random_state=42)

#### Lenet-5

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, AveragePooling2D, Flatten, Dropout

model = Sequential([
    Conv2D(filters=6, kernel_size=(3, 3), activation='relu', input_shape=X_train_img[0].shape),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    AveragePooling2D(),
    Flatten(),
    Dense(units=120, activation='relu'),
    Dense(units=84, activation='relu'),
    Dense(units=1, activation = 'softmax')
])

In [19]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_history = model.fit(X_train_img, y_train_img, #validation_data=(X_valid_NN, y_valid_NN), 
                          epochs=15, batch_size=256, verbose=True)

Epoch 1/15


ResourceExhaustedError:  OOM when allocating tensor with shape[256,3,512,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node sequential/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_7689]

Function call stack:
train_function


In [None]:
y_pred = (model.predict(X_test_img) > 0.5).astype("int32")

accuracy_score(y_test_img, y_pred)

In [None]:
confusion_matrix(y_test_img, y_pred)

#### Pretrained inception V3

In [15]:
inception_model = tf.keras.applications.InceptionV3(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=X_train_img[0].shape,
    pooling=None, classes=1000, classifier_activation=None
)

for layer in inception_model.layers:
    layer.trainable = False
x = Flatten()(inception_model.output)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.7)(x)
x = Dense(75, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(1, activation='sigmoid')(x)

inception_model = tf.keras.Model(inception_model.input, x)
#inception_model.summary()

ResourceExhaustedError: OOM when allocating tensor with shape[401408,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

In [None]:
epochs = 5
batch_size = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, name='adam')
loss_function = tf.keras.losses.binary_crossentropy

inception_model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

history = inception_model.fit(x=X_train_img,
                   y=y_train_img,
                   batch_size=batch_size,
                   epochs=epochs,
                   validation_split=0.2)

In [None]:
y_pred_proba = inception_model.predict(X_test_img)
y_pred = np.argmax(y_pred_proba, axis=-1)
accuracy_score(y_test_img, y_pred)

In [None]:
confusion_matrix(y_test_img, y_pred)