In [5]:
import numpy as np
import pandas as pd
import csv
import os
import codecs
import fnmatch
import errno
import pickle
from sklearn.svm import OneClassSVM
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import OneClassSVM, SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [6]:
usefulJSpaths = []  # absolute path of functional Javascript codes
usefulJSfiles = []  # list of file names
rusefulJSfiles = []  # list of opened files
n_features = 190

gmv = 0.001  # 0.001 # 0.05 # 0.1 # 0.5 # 0.1, 0.05, 0.01, 0.005, 0.001
nuv = 0.001  # 0.1, 0.05, 0.01, 0.005, 0.001

for root, dirnames, filenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/functionalJS'):
    for filename in fnmatch.filter(filenames, "*.js"):
        usefulJSfiles.append(filename)
        usefulJSpaths.append(os.path.join(root, filename))
        try:
            with codecs.open(os.path.join(root, filename), encoding='latin-1') as f:
                rusefulJSfiles.append(f.read())  # inline Cosine similarity, Cosine does work on whole doc so no .split()
        except IOError as exc:
            if exc.errno != errno.EISDIR:  # Do not fail if a directory is found, just ignore it.
                raise  # Propagate other kinds of IOError.

uselessJSpaths = []  # absolute path of tracking Javascript codes
uselessJSfiles = []
ruselessJSfiles = []  # list of opened files

for ulessroot, ulessdirnames, ulessfilenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/trackingJS'):
    for filename in fnmatch.filter(ulessfilenames, "*.js"):
        uselessJSfiles.append(filename)
        uselessJSpaths.append(os.path.join(ulessroot, filename))
        try:
            with codecs.open(os.path.join(ulessroot, filename), encoding='latin-1') as f:
                ruselessJSfiles.append(f.read())  # inline Cosine similarity, Cosine does work on whole doc so no .split()
        except IOError as exc:
            if exc.errno != errno.EISDIR:  # Do not fail if a directory is found, just ignore it.
                raise  # Propagate other kinds of IOError.

print("Number of useless JS files: %s " % len(ruselessJSfiles))
print("Number of useful JS files: %s " % len(rusefulJSfiles))
print("Ratio of useful JS vs useless JS: {:.2f}%" .format(len(rusefulJSfiles) / len(ruselessJSfiles) * 100))


Number of useless JS files: 1489 
Number of useful JS files: 1123 
Ratio of useful JS vs useless JS: 75.42%


In [7]:
usefulJSpaths = []
usefulJSfiles = []
rusefulJSfiles = []
n_features = 190

gmv = 0.001
nuv = 0.001

for root, dirnames, filenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/functionalJS'):
    for filename in fnmatch.filter(filenames, "*.js"):
        usefulJSfiles.append(filename)
        usefulJSpaths.append(os.path.join(root, filename))
        try:
            with codecs.open(os.path.join(root, filename), encoding='utf-8') as f:
                rusefulJSfiles.append(f.read())
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise

uselessJSpaths = []
uselessJSfiles = []
ruselessJSfiles = []

for ulessroot, ulessdirnames, ulessfilenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/trackingJS'):
    for filename in fnmatch.filter(ulessfilenames, "*.js"):
        uselessJSfiles.append(filename)
        uselessJSpaths.append(os.path.join(ulessroot, filename))
        try:
            with codecs.open(os.path.join(ulessroot, filename), encoding='latin-1') as f:
                ruselessJSfiles.append(f.read())
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise

# Combining functional and tracking JavaScript codes
all_JS_files = rusefulJSfiles + ruselessJSfiles
labels = [-1] * len(rusefulJSfiles) + [1] * len(ruselessJSfiles)

# Creating TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=n_features)

# Computing TF-IDF features
tfidf_features = vectorizer.fit_transform(all_JS_files)

# Computing TF-IDF features for tracking JavaScript codes
tfidf_features_tracking = vectorizer.fit_transform(ruselessJSfiles)

# Normalize the data
scaler = StandardScaler(with_mean=False)
tfidf_features = scaler.fit_transform(tfidf_features)
tfidf_features_tracking = scaler.fit_transform(tfidf_features_tracking)

# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, labels, test_size=0.2, random_state=42)

# Training the One-Class SVM
one_class_svm = OneClassSVM(gamma=gmv, nu=nuv)
one_class_svm.fit(tfidf_features_tracking)

# Training the baseline SVM
baseline_svm = SVC(kernel='linear')
baseline_svm.fit(X_train, y_train)

# Making predictions on the test set
one_class_svm_pred = one_class_svm.predict(X_test)
baseline_svm_pred = baseline_svm.predict(X_test)

# Evaluating the models
print("One-Class SVM Classification Report:")
print(classification_report(y_test, one_class_svm_pred))

print("Baseline SVM Classification Report:")
print(classification_report(y_test, baseline_svm_pred))


One-Class SVM Classification Report:
              precision    recall  f1-score   support

          -1       0.57      0.02      0.03       231
           1       0.56      0.99      0.72       292

    accuracy                           0.56       523
   macro avg       0.57      0.50      0.37       523
weighted avg       0.57      0.56      0.41       523

Baseline SVM Classification Report:
              precision    recall  f1-score   support

          -1       0.73      0.71      0.72       231
           1       0.78      0.79      0.78       292

    accuracy                           0.76       523
   macro avg       0.75      0.75      0.75       523
weighted avg       0.75      0.76      0.76       523



In [8]:
usefulJSpaths = []
usefulJSfiles = []
rusefulJSfiles = []
n_features = 190

gmv = 0.001
nuv = 0.001

for root, dirnames, filenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/functionalJS'):
    for filename in fnmatch.filter(filenames, "*.js"):
        usefulJSfiles.append(filename)
        usefulJSpaths.append(os.path.join(root, filename))
        try:
            with codecs.open(os.path.join(root, filename), encoding='utf-8') as f:
                rusefulJSfiles.append(f.read())
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise

uselessJSpaths = []
uselessJSfiles = []
ruselessJSfiles = []

for ulessroot, ulessdirnames, ulessfilenames in os.walk(r'/Users/ledaniel/Desktop/Downloads/LabelledDataSet/trackingJS'):
    for filename in fnmatch.filter(ulessfilenames, "*.js"):
        uselessJSfiles.append(filename)
        uselessJSpaths.append(os.path.join(ulessroot, filename))
        try:
            with codecs.open(os.path.join(ulessroot, filename), encoding='latin-1') as f:
                ruselessJSfiles.append(f.read())
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise

# Combining functional and tracking JavaScript codes
all_JS_files = rusefulJSfiles + ruselessJSfiles
labels = [-1] * len(rusefulJSfiles) + [1] * len(ruselessJSfiles)

# Creating TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=n_features)

# Computing TF-IDF features
tfidf_features = vectorizer.fit_transform(all_JS_files)

# Computing TF-IDF features for tracking JavaScript codes
tfidf_features_tracking = vectorizer.fit_transform(ruselessJSfiles)

# Normalize the data
scaler = StandardScaler(with_mean=False)
tfidf_features = scaler.fit_transform(tfidf_features)
tfidf_features_tracking = scaler.fit_transform(tfidf_features_tracking)

# Splitting the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(tfidf_features, labels, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Training the One-Class SVM on tracking JS (useless JS)
one_class_svm = OneClassSVM(gamma=gmv, nu=nuv)
one_class_svm.fit(tfidf_features_tracking)

# Training the baseline SVM
baseline_svm = SVC(kernel='linear')
baseline_svm.fit(X_train, y_train)

# Making predictions on the validation set
one_class_svm_val_pred = one_class_svm.predict(X_val)
baseline_svm_val_pred = baseline_svm.predict(X_val)

# Map OCSVM predictions to labels
one_class_svm_val_pred_labels = [1 if x == 1 else -1 for x in one_class_svm_val_pred]

# Evaluating the models on the validation set
print("One-Class SVM Validation Classification Report:")
print(classification_report(y_val, one_class_svm_val_pred_labels))
print("One-Class SVM Validation Accuracy: {:.2f}%".format(accuracy_score(y_val, one_class_svm_val_pred_labels) * 100))
print()
print("Baseline SVM Validation Classification Report:")
print(classification_report(y_val, baseline_svm_val_pred))
print("Baseline SVM Validation Accuracy: {:.2f}%".format(accuracy_score(y_val, baseline_svm_val_pred) * 100))
print()
# Making predictions on the test set
one_class_svm_test_pred = one_class_svm.predict(X_test)
baseline_svm_test_pred = baseline_svm.predict(X_test)

# Map OCSVM predictions to labels
one_class_svm_test_pred_labels = [1 if x == 1 else -1 for x in one_class_svm_test_pred]

# Evaluating the models on the test set
print("One-Class SVM Test Classification Report:")
print(classification_report(y_test, one_class_svm_test_pred_labels))
print("One-Class SVM Test Accuracy: {:.2f}%".format(accuracy_score(y_test, one_class_svm_test_pred_labels) * 100))
print()

print("Baseline SVM Test Classification Report:")
print(classification_report(y_test, baseline_svm_test_pred))
print("Baseline SVM Test Accuracy: {:.2f}%".format(accuracy_score(y_test, baseline_svm_test_pred) * 100))
print()

One-Class SVM Validation Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.01      0.02       229
           1       0.56      1.00      0.72       293

    accuracy                           0.56       522
   macro avg       0.61      0.50      0.37       522
weighted avg       0.61      0.56      0.41       522

One-Class SVM Validation Accuracy: 56.32%

Baseline SVM Validation Classification Report:
              precision    recall  f1-score   support

          -1       0.71      0.66      0.68       229
           1       0.75      0.79      0.77       293

    accuracy                           0.73       522
   macro avg       0.73      0.72      0.73       522
weighted avg       0.73      0.73      0.73       522

Baseline SVM Validation Accuracy: 73.18%

One-Class SVM Test Classification Report:
              precision    recall  f1-score   support

          -1       0.62      0.02      0.04       232
           1   

In [9]:
# Experimenting with different parameters for OCSVM
ocsvm_params = [
    {'nu': 0.01, 'gamma': 'auto'},
    {'nu': 0.05, 'gamma': 'auto'},
    {'nu': 0.1, 'gamma': 'auto'},
    {'nu': 0.1, 'gamma': 'scale'},
    {'nu': 0.2, 'gamma': 'scale'},
    {'nu': 0.5, 'gamma': 'scale'},
    {'nu': 0.5, 'gamma': 0.01}
]

best_params = None
best_score = 0

for params in ocsvm_params:
    ocsvm = OneClassSVM(kernel='rbf', gamma=params['gamma'], nu=params['nu'])
    ocsvm.fit(tfidf_features_tracking)
    val_pred = ocsvm.predict(X_val)
    val_pred_labels = [1 if x == 1 else -1 for x in val_pred]
    score = accuracy_score(y_val, val_pred_labels)
    if score > best_score:
        best_score = score
        best_params = params

print(f"Best Parameters: {best_params}")
print(f"Validation Accuracy with Best Parameters: {best_score * 100:.2f}%")
print("\n\n")

# Train and evaluate the best OCSVM model on the test set
best_ocsvm = OneClassSVM(kernel='rbf', gamma=best_params['gamma'], nu=best_params['nu'])
best_ocsvm.fit(tfidf_features_tracking)
best_ocsvm_test_pred = best_ocsvm.predict(X_test)
best_ocsvm_test_pred_labels = [1 if x == 1 else -1 for x in best_ocsvm_test_pred]

print("Best One-Class SVM Test Classification Report:")
print(classification_report(y_test, best_ocsvm_test_pred_labels))

print("Baseline SVM Test Classification Report:")
print(classification_report(y_test, baseline_svm_test_pred))

Best Parameters: {'nu': 0.1, 'gamma': 'scale'}
Validation Accuracy with Best Parameters: 60.34%



Best One-Class SVM Test Classification Report:
              precision    recall  f1-score   support

          -1       0.60      0.24      0.34       232
           1       0.59      0.87      0.70       291

    accuracy                           0.59       523
   macro avg       0.59      0.55      0.52       523
weighted avg       0.59      0.59      0.54       523

Baseline SVM Test Classification Report:
              precision    recall  f1-score   support

          -1       0.72      0.62      0.67       232
           1       0.73      0.81      0.77       291

    accuracy                           0.73       523
   macro avg       0.73      0.72      0.72       523
weighted avg       0.73      0.73      0.72       523

