In [9]:
import daal4py
import ember_modified
from ember_modified.features import PEFeatureExtractor
from sklearn.metrics import accuracy_score

In [2]:
## Code from 2020 SCARP

import numpy as np
from daal4py import decision_forest_classification_training, decision_forest_classification_prediction
from daal4py import gbt_classification_training, gbt_classification_prediction
from daal4py import logistic_regression_training, logistic_regression_prediction
from daal4py import kdtree_knn_classification_training, kdtree_knn_classification_prediction
from daal4py import svm_training, kernel_function_linear, svm_prediction

    
class daal_LR:
    """Two-class logistic-regression classifier backed by daal4py.

    Wraps ``logistic_regression_training`` / ``logistic_regression_prediction``
    behind a small ``train`` / ``classify`` interface.
    """

    def __init__(self):
        # The training algorithm is configured once, with an intercept term.
        self.nClasses = 2
        self.model = logistic_regression_training(
            nClasses=self.nClasses,
            interceptFlag=True,
        )

    def train(self, X_train, y_train):
        """Fit on ``X_train`` / ``y_train`` and return the trained model.

        The labels are converted to an array and reshaped to a column of shape
        ``(len(y_train), 1)`` before being passed to the training algorithm.
        The complete training result is kept on ``self.trainResult`` so that
        ``classify`` can use it later.
        """
        label_column = np.array(y_train).reshape(len(y_train), 1)
        self.trainResult = self.model.compute(X_train, label_column)
        return self.trainResult.model

    def classify(self, data):
        """Run prediction on ``data`` and return the predictions flattened.

        Reads ``self.trainResult``, so ``train`` must have been called first.
        """
        self.predictAlgorithm = logistic_regression_prediction(nClasses=self.nClasses)
        outcome = self.predictAlgorithm.compute(data, self.trainResult.model)
        return outcome.prediction.flatten()



class daal_KNN:
    """Two-class k-nearest-neighbour classifier backed by daal4py's k-d tree KNN.

    ``k`` is stored on the instance and passed to both the training and the
    prediction algorithm.
    """

    def __init__(self, k):
        self.nClasses = 2
        self.k = k
        self.model = kdtree_knn_classification_training(
            nClasses=self.nClasses,
            k=self.k,
        )

    def train(self, X_train, y_train):
        """Fit on ``X_train`` / ``y_train`` and return the trained model.

        Labels are reshaped to a ``(len(y_train), 1)`` column first; the full
        training result is retained on ``self.trainResult``.
        """
        label_column = np.array(y_train).reshape(len(y_train), 1)
        self.trainResult = self.model.compute(X_train, label_column)
        return self.trainResult.model

    def classify(self, data):
        """Run prediction on ``data`` and return the predictions flattened.

        Reads ``self.trainResult``, so ``train`` must have been called first.
        """
        self.predictAlgorithm = kdtree_knn_classification_prediction(
            nClasses=self.nClasses,
            k=self.k,
        )
        outcome = self.predictAlgorithm.compute(data, self.trainResult.model)
        return outcome.prediction.flatten()


class daal_DF:
    """Two-class decision-forest classifier backed by daal4py.

    ``n`` is forwarded as ``nTrees`` and ``m`` as ``maxTreeDepth`` to the
    training algorithm.
    """

    def __init__(self, n=100, m=10):
        self.nClasses = 2
        self.model = decision_forest_classification_training(
            nClasses=self.nClasses,
            nTrees=n,
            maxTreeDepth=m,
        )

    def train(self, X_train, y_train):
        """Fit on ``X_train`` / ``y_train`` and return the trained model.

        Labels are reshaped to a ``(len(y_train), 1)`` column first; the full
        training result is retained on ``self.trainResult``.
        """
        label_column = np.array(y_train).reshape(len(y_train), 1)
        self.trainResult = self.model.compute(X_train, label_column)
        return self.trainResult.model

    def classify(self, data):
        """Run prediction on ``data`` and return the predictions flattened.

        Reads ``self.trainResult``, so ``train`` must have been called first.
        """
        self.predictAlgorithm = decision_forest_classification_prediction(self.nClasses)
        outcome = self.predictAlgorithm.compute(data, self.trainResult.model)
        return outcome.prediction.flatten()


class daal_SVM:
    """Two-class support-vector-machine classifier backed by daal4py.

    NOTE(review): the ``kernel`` argument is accepted but never read; a
    linear kernel (``kernel_function_linear``) is always used regardless of
    the value passed, including the default ``'rbf'``.
    """

    def __init__(self, C=1.0, kernel='rbf'):
        self.nClasses = 2
        # Always linear; see the class-level review note about ``kernel``.
        self.kern = kernel_function_linear(method='defaultDense')
        self.model = svm_training(
            nClasses=self.nClasses,
            C=C,
            maxIterations=100000,
            cacheSize=200,
            kernel=self.kern,
            accuracyThreshold=1e-2,
            doShrinking=True,
        )

    def train(self, X_train, y_train):
        """Fit on ``X_train`` / ``y_train`` and return the trained model.

        Labels are reshaped to a ``(len(y_train), 1)`` column first; the full
        training result is retained on ``self.trainResult``.
        """
        label_column = np.array(y_train).reshape(len(y_train), 1)
        self.trainResult = self.model.compute(X_train, label_column)
        return self.trainResult.model

    def classify(self, data):
        """Run prediction on ``data`` and return the predictions flattened.

        Reads ``self.trainResult``, so ``train`` must have been called first.
        """
        self.predictAlgorithm = svm_prediction(self.nClasses)
        outcome = self.predictAlgorithm.compute(data, self.trainResult.model)
        return outcome.prediction.flatten()
    
class daal_GBT:
    """Two-class gradient-boosted-trees classifier backed by daal4py.

    ``max_iters`` is forwarded as ``maxIterations`` and ``max_depth`` as
    ``maxTreeDepth``. The prediction algorithm is built once in ``__init__``
    and reused by every ``classify`` call.
    """

    def __init__(self, max_iters=50, max_depth=6):
        self.nClasses = 2
        self.model = gbt_classification_training(
            nClasses=self.nClasses,
            maxIterations=max_iters,
            maxTreeDepth=max_depth,
        )
        self.predictAlgorithm = gbt_classification_prediction(self.nClasses)

    def train(self, X_train, y_train):
        """Fit on ``X_train`` / ``y_train`` and return the trained model.

        Labels are reshaped to a ``(len(y_train), 1)`` column first; the full
        training result is retained on ``self.trainResult``.
        """
        label_column = np.array(y_train).reshape(len(y_train), 1)
        self.trainResult = self.model.compute(X_train, label_column)
        return self.trainResult.model

    def classify(self, data):
        """Run prediction on ``data`` and return the predictions flattened.

        Reads ``self.trainResult``, so ``train`` must have been called first.
        """
        outcome = self.predictAlgorithm.compute(data, self.trainResult.model)
        return outcome.prediction.flatten()
    

In [3]:
# Directory passed to the vectorized-feature reader; edit for your machine.
EMBER_DATA_DIR = '/home/scarp/ember2018/'

# The reader returns four values, unpacked here in the order it yields them.
(X_train, y_train,
 X_test, y_test) = ember_modified.read_vectorized_features(EMBER_DATA_DIR)

In [4]:
# Boolean masks selecting rows whose label is not -1 (presumably the marker
# for unlabelled samples — confirm against the dataset documentation).
UNLABELED = -1
train_rows = y_train != UNLABELED
test_rows = y_test != UNLABELED

In [5]:
# Build the logistic-regression wrapper (daal_LR, defined earlier in this notebook).
d4p_LR = daal_LR()

# Fit on the rows selected by train_rows (label != -1) and keep the returned model.
LR_model = d4p_LR.train(X_train[train_rows], y_train[train_rows])

<daal4py._daal4py.logistic_regression_model at 0x7ff63fe96450>

In [None]:
# Predict on the test rows selected by test_rows.
LR_results = d4p_LR.classify(X_test[test_rows])

# The labels must be filtered with the same mask as the features so that both
# arrays have the same length and row alignment; the original compared the
# unfiltered y_test against predictions made on the filtered rows.
print(accuracy_score(y_test[test_rows], LR_results))

In [23]:
# NOTE(review): this cell originally scored a name `results` that is never
# defined in the notebook, so it fails on a fresh kernel. LR_results is
# presumably what was meant, given the cell's position — confirm.
accuracy_score(y_test[test_rows], LR_results)

0.51796

In [20]:
# Build the decision-forest wrapper with its default arguments (n=100, m=10).
d4p_DF = daal_DF()

# Fit on the rows selected by train_rows (label != -1) and keep the returned model.
DF_model = d4p_DF.train(X_train[train_rows], y_train[train_rows])

In [24]:
# Predict on the masked test rows, then score against the equally masked labels.
DF_results = d4p_DF.classify(X_test[test_rows])
DF_accuracy = accuracy_score(y_test[test_rows], DF_results)
print(DF_accuracy)

0.892915


In [5]:
# Build the gradient-boosted-trees wrapper with its default arguments
# (max_iters=50, max_depth=6).
d4p_GBT = daal_GBT()

# Fit on the rows selected by train_rows (label != -1) and keep the returned model.
GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])

In [6]:
# Predict on the masked test rows, then score against the equally masked labels.
GBT_results = d4p_GBT.classify(X_test[test_rows])
GBT_accuracy = accuracy_score(y_test[test_rows], GBT_results)
print(GBT_accuracy)

0.94021


In [8]:
# Read the raw bytes of the sample executable. The context manager closes the
# file handle deterministically; the original left it open until garbage
# collection.
PUTTY_PATH = "/home/weitkampe/DS420/FinalProject/test_pes/putty.exe"
with open(PUTTY_PATH, "rb") as pe_file:
    putty_data = pe_file.read()

In [10]:
# Feature extractor imported from ember_modified.features; its feature_vector()
# method is applied to the raw file bytes in the next cell.
extractor = PEFeatureExtractor()

In [24]:
# Turn the sample's feature vector into a single-row 2-D array, the same
# layout (rows = samples) that classify() receives for the test matrix.
feature_vector = extractor.feature_vector(putty_data)
features = np.reshape(feature_vector, (1, -1))

# The width used to be hard-coded as 2381. It is derived here instead, and
# checked against the training matrix so that a feature-version mismatch
# fails loudly at this point rather than inside the model.
assert features.shape[1] == X_train.shape[1], (
    f"expected {X_train.shape[1]} features, got {features.shape[1]}"
)

In [40]:
# Check prediction time
import time

# perf_counter_ns() is a monotonic, high-resolution clock meant for measuring
# short durations. time_ns() is wall-clock time and can jump (even backwards)
# when the system clock is adjusted, which would corrupt the measurement.
start_time = time.perf_counter_ns()
prediction = d4p_GBT.classify(features)
end_time = time.perf_counter_ns()

# Report whole milliseconds (nanoseconds floor-divided by 1e6).
print(str((end_time - start_time) // 1000000) + "ms")

1ms


In [41]:
# Cross Validation
#
# Sweep every (tree depth, boosting iterations) pair: train a fresh GBT model
# for each and print its accuracy on the masked test rows.
# NOTE(review): each setting is scored on the test split rather than on
# held-out folds of the training data, so this is a grid sweep, not k-fold CV.

max_depths = [4, 6, 8]
max_iterations = [40, 50, 60]

# Depth is the outer axis, iterations the inner one.
for depth, num in [(d, n) for d in max_depths for n in max_iterations]:
    d4p_GBT = daal_GBT(max_iters=num, max_depth=depth)
    GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])
    GBT_results = d4p_GBT.classify(X_test[test_rows])
    acc = accuracy_score(y_test[test_rows], GBT_results)
    print(f"Max iters: {num}\tDepth: {depth}\tAcc: {acc}")
        

Max iters: 40	Depth: 4	Acc: 0.91204
Max iters: 50	Depth: 4	Acc: 0.916985
Max iters: 60	Depth: 4	Acc: 0.920875
Max iters: 40	Depth: 6	Acc: 0.93464
Max iters: 50	Depth: 6	Acc: 0.940175
Max iters: 60	Depth: 6	Acc: 0.94437
Max iters: 40	Depth: 8	Acc: 0.952785
Max iters: 50	Depth: 8	Acc: 0.95681
Max iters: 60	Depth: 8	Acc: 0.958675


In [42]:
# Cross Validation 2
#
# Second sweep over (tree depth, boosting iterations): train a fresh GBT model
# per pair and print its accuracy on the masked test rows.
# NOTE(review): each setting is scored on the test split rather than on
# held-out folds of the training data, so this is a grid sweep, not k-fold CV.

max_depths = [7, 9, 10]
max_iterations = [60, 65, 70]

# Depth is the outer axis, iterations the inner one.
for depth, num in [(d, n) for d in max_depths for n in max_iterations]:
    d4p_GBT = daal_GBT(max_iters=num, max_depth=depth)
    GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])
    GBT_results = d4p_GBT.classify(X_test[test_rows])
    acc = accuracy_score(y_test[test_rows], GBT_results)
    print(f"Max iters: {num}\tDepth: {depth}\tAcc: {acc}")

Max iters: 60	Depth: 7	Acc: 0.952055
Max iters: 65	Depth: 7	Acc: 0.953885
Max iters: 70	Depth: 7	Acc: 0.95475
Max iters: 60	Depth: 9	Acc: 0.9623
Max iters: 65	Depth: 9	Acc: 0.96465
Max iters: 70	Depth: 9	Acc: 0.96501
Max iters: 60	Depth: 10	Acc: 0.96589
Max iters: 65	Depth: 10	Acc: 0.965875
Max iters: 70	Depth: 10	Acc: 0.96727


In [43]:
# Cross Validation 3
#
# Third sweep over (tree depth, boosting iterations): train a fresh GBT model
# per pair and print its accuracy on the masked test rows.
# NOTE(review): each setting is scored on the test split rather than on
# held-out folds of the training data, so this is a grid sweep, not k-fold CV.

max_depths = [11, 12, 13]
max_iterations = [75, 80, 90]

# Depth is the outer axis, iterations the inner one.
for depth, num in [(d, n) for d in max_depths for n in max_iterations]:
    d4p_GBT = daal_GBT(max_iters=num, max_depth=depth)
    GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])
    GBT_results = d4p_GBT.classify(X_test[test_rows])
    acc = accuracy_score(y_test[test_rows], GBT_results)
    print(f"Max iters: {num}\tDepth: {depth}\tAcc: {acc}")

Max iters: 75	Depth: 11	Acc: 0.96978
Max iters: 80	Depth: 11	Acc: 0.969425
Max iters: 90	Depth: 11	Acc: 0.97043
Max iters: 75	Depth: 12	Acc: 0.972405
Max iters: 80	Depth: 12	Acc: 0.971025
Max iters: 90	Depth: 12	Acc: 0.972895
Max iters: 75	Depth: 13	Acc: 0.97203
Max iters: 80	Depth: 13	Acc: 0.972355
Max iters: 90	Depth: 13	Acc: 0.971335


In [44]:
# Cross Validation 4
#
# Fourth sweep over (tree depth, boosting iterations): train a fresh GBT model
# per pair and print its accuracy on the masked test rows.
# NOTE(review): each setting is scored on the test split rather than on
# held-out folds of the training data, so this is a grid sweep, not k-fold CV.

max_depths = [12, 13, 14]
max_iterations = [65, 70, 80, 90, 95]

# Depth is the outer axis, iterations the inner one.
for depth, num in [(d, n) for d in max_depths for n in max_iterations]:
    d4p_GBT = daal_GBT(max_iters=num, max_depth=depth)
    GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])
    GBT_results = d4p_GBT.classify(X_test[test_rows])
    acc = accuracy_score(y_test[test_rows], GBT_results)
    print(f"Max iters: {num}\tDepth: {depth}\tAcc: {acc}")

Max iters: 65	Depth: 12	Acc: 0.96956
Max iters: 70	Depth: 12	Acc: 0.971725
Max iters: 80	Depth: 12	Acc: 0.97131
Max iters: 90	Depth: 12	Acc: 0.969645
Max iters: 95	Depth: 12	Acc: 0.97256
Max iters: 65	Depth: 13	Acc: 0.97081
Max iters: 70	Depth: 13	Acc: 0.97151
Max iters: 80	Depth: 13	Acc: 0.97233
Max iters: 90	Depth: 13	Acc: 0.972525
Max iters: 95	Depth: 13	Acc: 0.97349
Max iters: 65	Depth: 14	Acc: 0.97306
Max iters: 70	Depth: 14	Acc: 0.973695
Max iters: 80	Depth: 14	Acc: 0.97279
Max iters: 90	Depth: 14	Acc: 0.974065
Max iters: 95	Depth: 14	Acc: 0.975085


In [45]:
# Cross Validation 5
#
# Fifth sweep over (tree depth, boosting iterations): train a fresh GBT model
# per pair and print its accuracy on the masked test rows.
# NOTE(review): each setting is scored on the test split rather than on
# held-out folds of the training data, so this is a grid sweep, not k-fold CV.

max_depths = [14, 15, 16, 17]
max_iterations = [80, 90, 100, 110]

# Depth is the outer axis, iterations the inner one.
for depth, num in [(d, n) for d in max_depths for n in max_iterations]:
    d4p_GBT = daal_GBT(max_iters=num, max_depth=depth)
    GBT_model = d4p_GBT.train(X_train[train_rows], y_train[train_rows])
    GBT_results = d4p_GBT.classify(X_test[test_rows])
    acc = accuracy_score(y_test[test_rows], GBT_results)
    print(f"Max iters: {num}\tDepth: {depth}\tAcc: {acc}")

Max iters: 80	Depth: 14	Acc: 0.972855
Max iters: 90	Depth: 14	Acc: 0.974115
Max iters: 100	Depth: 14	Acc: 0.97522
Max iters: 110	Depth: 14	Acc: 0.97439
Max iters: 80	Depth: 15	Acc: 0.972595
Max iters: 90	Depth: 15	Acc: 0.974525
Max iters: 100	Depth: 15	Acc: 0.97473
Max iters: 110	Depth: 15	Acc: 0.97492
Max iters: 80	Depth: 16	Acc: 0.973775
Max iters: 90	Depth: 16	Acc: 0.974055
Max iters: 100	Depth: 16	Acc: 0.974195
Max iters: 110	Depth: 16	Acc: 0.97313
Max iters: 80	Depth: 17	Acc: 0.97331
Max iters: 90	Depth: 17	Acc: 0.97298
Max iters: 100	Depth: 17	Acc: 0.97459
Max iters: 110	Depth: 17	Acc: 0.97418


In [None]:
# Best configuration found in the sweeps above -> Max iters: 100	Depth: 14	Acc: 0.97522

In [None]:
# Profiling from 2020 SCARP
# Spark Streaming using sklearn model