# Dataset 20 Phase 2 Machine Learning Notebook

In [4]:
import pandas as pd

from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# Read the feature CSV and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- confirm).
dataset_20 = (
    pd.read_csv('dataset_20_new_features.csv')
    .drop('Unnamed: 0', axis='columns')
)
dataset_20.head(1)

Unnamed: 0,url,url_len,ip_add,tld,who_is,https,js_len,js_obf_len,content,label,latitude,longitude,has_IP_in_URL,number_subdomains,hostname,length_hostname,ratio_digits_url,having_@_in_url,ratio_digits_hostname,number_underscores
0,http://members.tripod.com/russiastation/,40,42.77.221.155,0,0,0,58.0,0.0,Named themselves charged particles in a manly ...,good,23.973937,120.982018,0,1,members.tripod.com,18,0.0,0,0.0,0


## Dataset Sample

In [6]:
# Draw a fixed-size, seeded sample from each class and stack them (good rows first).
is_good = dataset_20['label'] == 'good'
is_bad = dataset_20['label'] == 'bad'

good_samples = dataset_20.loc[is_good].sample(n=30000, random_state=42)
bad_samples = dataset_20.loc[is_bad].sample(n=29778, random_state=42)

dataset_20_sample = pd.concat([good_samples, bad_samples], axis='index')

In [7]:
# Features are every column except the target.
X = dataset_20_sample.drop('label', axis='columns')
# Binary target: 1 when the label text contains 'bad', otherwise 0.
y = dataset_20_sample['label'].map(lambda label: int('bad' in label))

## Train-test-split

In [8]:
# Hold out 33% of the sample for evaluation; the fixed seed keeps the split
# reproducible across runs. train_test_split is imported in the notebook's
# top import cell together with the other scikit-learn names.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [9]:
# Columns treated as the "original" dataset, as opposed to the custom features.
original_features = [
    'url', 'url_len', 'ip_add', 'latitude', 'longitude',
    'tld', 'who_is', 'https', 'js_len', 'js_obf_len', 'content',
]

# Columns removed from the full feature frame in Variations 3 and 4.
to_drop = ['url', 'content', 'ip_add', 'hostname']

# Helper lists shared by the variations below.
js_features = ['js_len', 'js_obf_len']
original_excluded = ['url', 'content', 'ip_add']

## Variation 1: original features minus url / content / ip_add
original_kept = [col for col in original_features if col not in original_excluded]
X_train_original_features = X_train[original_kept]
X_test_original_features = X_test[original_kept]

## Variation 2: Variation 1 without the JavaScript length features
original_kept_no_js = [col for col in original_kept if col not in js_features]
X_train_original_features_remove_js = X_train[original_kept_no_js]
X_test_original_features_remove_js = X_test[original_kept_no_js]

## Variation 3: every column except those in to_drop
X_train_custom_features = X_train.drop(columns=to_drop)
X_test_custom_features = X_test.drop(columns=to_drop)

## Variation 4: Variation 3 without the JavaScript length features
X_train_custom_features_without_js = X_train.drop(columns=js_features + to_drop)
X_test_custom_features_without_js = X_test.drop(columns=js_features + to_drop)

## Variation 5: the raw URL column only
X_train_transformer, X_test_transformer = X_train['url'], X_test['url']

## Naive Bayes

### Variation 1 - Original Dataset

In [10]:
# Gaussian Naive Bayes on the Variation 1 training features.
gnb = GaussianNB()
gnb.fit(X=X_train_original_features, y=y_train)

GaussianNB()

In [11]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = gnb.predict(X=X_test_original_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9492573630050185

In [12]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9877,    0],
       [1001, 8849]])

### Variation 2 - Original Dataset with JS features removed

In [13]:
# Gaussian Naive Bayes on the Variation 2 training features (JS columns removed).
gnb = GaussianNB()
gnb.fit(X=X_train_original_features_remove_js, y=y_train)

GaussianNB()

In [14]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = gnb.predict(X=X_test_original_features_remove_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8866021189233031

In [15]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9468,  409],
       [1828, 8022]])

### Variation 3 - Original Dataset + Custom Features

In [16]:
# Gaussian Naive Bayes on the Variation 3 training features (original + custom).
gnb = GaussianNB()
gnb.fit(X=X_train_custom_features, y=y_train)

GaussianNB()

In [17]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = gnb.predict(X=X_test_custom_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9557459319714098

In [18]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9867,   10],
       [ 863, 8987]])

### Variation 4 - Original Dataset without JS Features + Custom Features

In [19]:
# Gaussian Naive Bayes on the Variation 4 training features (custom, JS columns removed).
gnb = GaussianNB()
gnb.fit(X=X_train_custom_features_without_js, y=y_train)

GaussianNB()

In [20]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = gnb.predict(X=X_test_custom_features_without_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8817863841435596

In [21]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9150,  727],
       [1605, 8245]])

## SVM

In [22]:
# RBF-kernel search space: 4 values of C x 2 values of gamma = 8 candidates.
rbf_grid = {
    'kernel': ['rbf'],
    'C': [1, 10, 100, 1000],
    'gamma': [0.001, 0.0001],
}
param_grid = [rbf_grid]

### Variation 1 - Original Dataset

In [None]:
# Grid search over param_grid (GridSearchCV defaults otherwise) on the
# Variation 1 training features. verbose=1 makes LibSVM log to the cell output.
svm_model = svm.SVC(verbose=1)
clf = GridSearchCV(estimator=svm_model, param_grid=param_grid)

clf.fit(X=X_train_original_features, y=y_train)

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:
# Predict with the fitted grid search on the held-out split; accuracy is displayed.
y_pred = clf.predict(X=X_test_original_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Variation 2 - Original Dataset with JS features removed

In [None]:
# Grid search over param_grid on the Variation 2 training features.
# C, gamma and kernel are not set on the SVC itself: all three appear in
# param_grid, so GridSearchCV overrides them on every candidate, and passing
# them here only suggested a fixed configuration that was never used.
svm_model = svm.SVC(verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_original_features_remove_js, y_train)

In [None]:
# Predict with the fitted grid search on the held-out split; accuracy is displayed.
y_pred = clf.predict(X=X_test_original_features_remove_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Variation 3 - Original Dataset + Custom Features

In [None]:
# Grid search over param_grid on the Variation 3 training features.
# C, gamma and kernel are not set on the SVC itself: all three appear in
# param_grid, so GridSearchCV overrides them on every candidate, and passing
# them here only suggested a fixed configuration that was never used.
svm_model = svm.SVC(verbose=1)
clf = GridSearchCV(svm_model, param_grid)

clf.fit(X_train_custom_features, y_train)

In [None]:
# Predict with the fitted grid search on the held-out split; accuracy is displayed.
y_pred = clf.predict(X=X_test_custom_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Variation 4 - Original Dataset without JS Features + Custom Features

In [None]:
# Grid search over param_grid on the Variation 4 training features.
# C, gamma and kernel are not set on the SVC itself: all three appear in
# param_grid, so GridSearchCV overrides them on every candidate.
svm_model = svm.SVC(verbose=1)
clf = GridSearchCV(svm_model, param_grid)

# Fit against y_train (the previous `y_trains` was an undefined name and
# raised NameError before any training started).
clf.fit(X_train_custom_features_without_js, y_train)

In [None]:
# Predict with the fitted grid search on the held-out split; accuracy is displayed.
y_pred = clf.predict(X=X_test_custom_features_without_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

## KNN

### Variation 1 - Original Dataset

In [38]:
# k-nearest-neighbours (k = 100) on the Variation 1 training features.
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X=X_train_original_features, y=y_train)

KNeighborsClassifier(n_neighbors=100)

In [39]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = knn_model.predict(X=X_test_original_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9786079991889289

In [40]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9716,  161],
       [ 261, 9589]])

### Variation 2 - Original Dataset with JS features removed

In [41]:
# k-nearest-neighbours (k = 100) on the Variation 2 training features (JS columns removed).
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X=X_train_original_features_remove_js, y=y_train)

KNeighborsClassifier(n_neighbors=100)

In [42]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = knn_model.predict(X=X_test_original_features_remove_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7617985502103716

In [43]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7065, 2812],
       [1887, 7963]])

### Variation 3 - Original Dataset + Custom Features

In [44]:
# k-nearest-neighbours (k = 100) on the Variation 3 training features (original + custom).
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X=X_train_custom_features, y=y_train)

KNeighborsClassifier(n_neighbors=100)

In [45]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = knn_model.predict(X=X_test_custom_features)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9776955441780301

In [46]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9695,  182],
       [ 258, 9592]])

### Variation 4 - Original Dataset without JS Features + Custom Features

In [47]:
# k-nearest-neighbours (k = 100) on the Variation 4 training features (custom, JS columns removed).
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X=X_train_custom_features_without_js, y=y_train)

KNeighborsClassifier(n_neighbors=100)

In [48]:
# Predict on the held-out split; the accuracy is the cell's displayed value.
y_pred = knn_model.predict(X=X_test_custom_features_without_js)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7402544735641506

In [49]:
# Rows are true classes, columns are predicted classes (0 = good, 1 = bad).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[6965, 2912],
       [2212, 7638]])

## JavaScript Validation using tools

## Dataset Supplementation

## Transformers 

## Image Classification Approach