In [59]:
# Data preprocessing & dataset creation
from log_reader import read_all_logs
from preprocessors.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

df = read_all_logs(version=2)
unique_syscalls = df['syscall'].unique().tolist()
print("Unique system calls: ", len(unique_syscalls))
df = Preprocessor.get(version=2).preprocess(df)

X_train, X_test, y_train, y_test = train_test_split(df['syscall'], df['malicious'], test_size=0.2, random_state=42)

features_dict = {}
ngrams = range(4,8)
vectorizers = [CountVectorizer, TfidfVectorizer]

for vec_class in vectorizers:
    features_dict[vec_class.__name__] = {}
    for ngram in ngrams:
        vectorizer = vec_class(ngram_range=(ngram, ngram), token_pattern=r'\b\w+\b')
        X_train_vec = csr_matrix(vectorizer.fit_transform(X_train))
        X_test_vec = csr_matrix(vectorizer.transform(X_test))
        features_dict[vectorizer.__class__.__name__][ngram] = X_train_vec, X_test_vec

print(features_dict['CountVectorizer'][6][0].shape)


Reading logs from ../logs/V1
Reading logs from ../logs/V2-1
Reading logs from ../logs/V2
Classifying malicious_ransomwarePOC_10min_1.log as malicious, 13383 malicious entries found
Classifying malicious_ransomwarePOC_10min.log as malicious, 14477 malicious entries found
Classifying malicious_JavaRansomware_30min.log as malicious, 45982 malicious entries found
Classifying malicious_RAASNet-AES_60min.log as malicious, 10348 malicious entries found
Classifying malicious_ransim_20min.log as malicious, 23227 malicious entries found
Classifying malicious_RAASNet-Crypto_20min.log as malicious, 20069 malicious entries found
Classifying malicious_roar-ChaCha20_60min.log as malicious, 33968 malicious entries found
Classifying malicious_roar-AES-CTR_60min.log as malicious, 71676 malicious entries found
Classifying malicious_ransim-slow_60min.log as malicious, 23621 malicious entries found
Classifying malicious_cry_20min.log as malicious, 8185 malicious entries found
Classifying malicious_ransim-s

In [60]:
from sklearn.metrics import accuracy_score, f1_score
def get_test_metrics(y_true, y_pred):
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return acc, f1

In [61]:
from sklearn.naive_bayes import MultinomialNB

def test_on_nb(X_train, y_train, X_test, y_test):
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return get_test_metrics(y_test, y_pred)

In [62]:
from models.V2.iforest import IForest

def test_on_iforest(X_train, y_train, X_test, y_test):
    clf = IForest(None)
    clf.fit_vectorized(X_train, y_train)
    y_pred = clf.instance.predict(X_test)
    y_pred = clf.pred_to_binary(y_pred)
    return get_test_metrics(y_test, y_pred)

In [63]:
from models.V2.lof import LOF

def test_on_lof(X_train, y_train, X_test, y_test):
    clf = LOF(None)
    clf.fit_vectorized(X_train, y_train)
    y_pred = clf.instance.predict(X_test)
    y_pred = clf.pred_to_binary(y_pred)
    return get_test_metrics(y_test, y_pred)

In [64]:
from sklearn.ensemble import RandomForestClassifier as RandomForest

def test_on_rf(X_train, y_train, X_test, y_test):
    clf = RandomForest()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return get_test_metrics(y_test, y_pred)

In [65]:
import concurrent.futures

def parallel_test(model, test_func, X_train, y_train, X_test, y_test, vec_name, ngram, selection):
    acc, f1 = test_func(X_train, y_train, X_test, y_test)
    return {'Vectorizer': vec_name, 'Ngram': ngram, 'Model': model, 'Accuracy': acc, 'F1': f1, 'Selection': selection, 'Features': X_train.shape[1]}

def get_test_results(features_dict, selection, max_workers=4):
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for vec_name, vec_dict in features_dict.items():
            for ngram, (X_train, X_test) in vec_dict.items():
                for model, test_func in [('NB', test_on_nb), ('IForest', test_on_iforest), ('LOF', test_on_lof), ('RF', test_on_rf)]:
                    future = executor.submit(parallel_test, model, test_func, X_train, y_train, X_test, y_test, vec_name, ngram, selection)
                    futures.append(future)
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
        concurrent.futures.wait(futures)
    return results


In [66]:
# Get results with no feature selection
import pandas as pd

results_df = pd.DataFrame(get_test_results(features_dict, 'None'))
results_df


Unnamed: 0,Vectorizer,Ngram,Model,Accuracy,F1,Selection,Features
0,CountVectorizer,4,NB,0.990688,0.948479,,54606
1,CountVectorizer,5,NB,0.988416,0.934548,,72233
2,CountVectorizer,4,IForest,0.891121,0.148997,,54606
3,CountVectorizer,5,IForest,0.891121,0.131071,,72233
4,CountVectorizer,4,RF,0.989735,0.942149,,54606
5,CountVectorizer,6,NB,0.986363,0.921651,,86880
6,CountVectorizer,6,IForest,0.894934,0.133091,,86880
7,CountVectorizer,5,RF,0.987389,0.927792,,72233
8,CountVectorizer,4,LOF,0.885329,0.093859,,54606
9,CountVectorizer,7,NB,0.98475,0.911263,,99384


In [67]:
# remove features with low variance
from sklearn.feature_selection import VarianceThreshold
threshold = 0.0001
results = []
features_varthresh_dict = {}
for vec_name, vec_dict in features_dict.items():
    features_varthresh_dict[vec_name] = {}
    for ngram, (X_train, X_test) in vec_dict.items():
        selector = VarianceThreshold(threshold=threshold)
        X_train_reduced = selector.fit_transform(X_train)
        X_test_reduced = selector.transform(X_test)
        features_varthresh_dict[vec_name][ngram] = X_train_reduced, X_test_reduced

results_var_df = pd.DataFrame(get_test_results(features_varthresh_dict, 'VarianceThreshold'))
results_df = pd.concat([results_df, results_var_df])
results_var_df

Unnamed: 0,Vectorizer,Ngram,Model,Accuracy,F1,Selection,Features
0,CountVectorizer,4,NB,0.990322,0.946385,VarianceThreshold,5361
1,CountVectorizer,5,NB,0.987902,0.93145,VarianceThreshold,5670
2,CountVectorizer,4,IForest,0.889875,0.213613,VarianceThreshold,5361
3,CountVectorizer,5,IForest,0.891854,0.214172,VarianceThreshold,5670
4,CountVectorizer,4,RF,0.989735,0.942292,VarianceThreshold,5361
5,CountVectorizer,6,NB,0.985996,0.919375,VarianceThreshold,5615
6,CountVectorizer,6,IForest,0.889215,0.200952,VarianceThreshold,5615
7,CountVectorizer,5,RF,0.987536,0.928691,VarianceThreshold,5670
8,CountVectorizer,6,RF,0.98475,0.911263,VarianceThreshold,5615
9,CountVectorizer,7,NB,0.98409,0.907146,VarianceThreshold,5470


In [68]:
# RFE feature selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

results = []
features_rfe_dict = {}
for vec_name, vec_dict in features_dict.items():
    features_rfe_dict[vec_name] = {}
    for ngram, (X_train_rfe, X_test_rfe) in vec_dict.items():
        print(vec_name, ngram)
        # Initialize the estimator and RFE
        estimator = LogisticRegression(solver='lbfgs', max_iter=1000)
        selector = RFE(estimator, n_features_to_select=int(X_train_rfe.shape[1]/2), step=1000)

        X_train_reduced = selector.fit_transform(X_train_rfe, y_train)
        X_test_reduced = selector.transform(X_test_rfe)
        features_rfe_dict[vec_name][ngram] = X_train_reduced, X_test_reduced

results_rfe_df = pd.DataFrame(get_test_results(features_rfe_dict, 'RFE'))
results_df = pd.concat([results_df, results_rfe_df])
results_rfe_df

CountVectorizer 4
CountVectorizer 5
CountVectorizer 6
CountVectorizer 7
TfidfVectorizer 4
TfidfVectorizer 5
TfidfVectorizer 6
TfidfVectorizer 7


Unnamed: 0,Vectorizer,Ngram,Model,Accuracy,F1,Selection,Features
0,CountVectorizer,4,NB,0.990835,0.949249,RFE,27303
1,CountVectorizer,5,NB,0.988269,0.93361,RFE,36116
2,CountVectorizer,4,IForest,0.885769,0.132517,RFE,27303
3,CountVectorizer,5,IForest,0.890828,0.145726,RFE,36116
4,CountVectorizer,4,RF,0.990029,0.943848,RFE,27303
5,CountVectorizer,6,NB,0.986363,0.921651,RFE,43440
6,CountVectorizer,6,IForest,0.89508,0.155752,RFE,43440
7,CountVectorizer,5,RF,0.987536,0.928511,RFE,36116
8,CountVectorizer,5,LOF,0.893467,0.086738,RFE,36116
9,CountVectorizer,7,NB,0.984676,0.910798,RFE,49692


In [69]:
results_df.sort_values(by='F1', ascending=False, inplace=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(results_df)

Unnamed: 0,Vectorizer,Ngram,Model,Accuracy,F1,Selection,Features
0,CountVectorizer,4,NB,0.990835,0.949249,RFE,27303
0,CountVectorizer,4,NB,0.990688,0.948479,,54606
0,CountVectorizer,4,NB,0.990322,0.946385,VarianceThreshold,5361
18,TfidfVectorizer,4,RF,0.990102,0.944146,,54606
19,TfidfVectorizer,4,RF,0.990102,0.944146,RFE,27303
4,CountVectorizer,4,RF,0.990029,0.943848,RFE,27303
4,CountVectorizer,4,RF,0.989735,0.942292,VarianceThreshold,5361
4,CountVectorizer,4,RF,0.989735,0.942149,,54606
13,TfidfVectorizer,4,NB,0.988855,0.936614,RFE,27303
1,CountVectorizer,5,NB,0.988416,0.934548,,72233


In [78]:
None_df = results_df[results_df['Selection'] == 'None'].drop(columns=['Selection', 'Accuracy'])
VarianceThreshold_df = results_df[results_df['Selection'] == 'VarianceThreshold'].drop(columns=['Selection', 'Accuracy'])
RFE_df = results_df[results_df['Selection'] == 'RFE'].drop(columns=['Selection', 'Accuracy'])

vt_merge = None_df.merge(VarianceThreshold_df, on=['Vectorizer', 'Ngram', 'Model'], suffixes=(None, '_VT'))
merged_df = vt_merge.merge(RFE_df, on=['Vectorizer', 'Ngram', 'Model'], suffixes=(None,'_RFE'))
merged_df['F1_diff_VT'] = merged_df['F1_VT'] - merged_df['F1']
merged_df['F1_diff_RFE'] = merged_df['F1_RFE'] - merged_df['F1']
merged_df.drop(columns=['F1', 'F1_VT', 'F1_RFE'], inplace=True)
merged_df = merged_df.sort_values(by=['Model', 'Vectorizer', 'Ngram'])
display(merged_df[merged_df['Model'] != 'IForest'].groupby(['Vectorizer', 'Ngram']).agg({'F1_diff_VT': 'mean', 'F1_diff_RFE': 'mean'}))
merged_df

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_diff_VT,F1_diff_RFE
Vectorizer,Ngram,Unnamed: 2_level_1,Unnamed: 3_level_1
CountVectorizer,4,0.004818,-0.001004
CountVectorizer,5,0.007418,-0.001371
CountVectorizer,6,0.006484,-0.005505
CountVectorizer,7,0.010048,-0.001432
TfidfVectorizer,4,-0.017912,0.011063
TfidfVectorizer,5,-0.028953,0.0147
TfidfVectorizer,6,-0.034192,0.014332
TfidfVectorizer,7,-0.037315,0.011825


Unnamed: 0,Vectorizer,Ngram,Model,Features,Features_VT,Features_RFE,F1_diff_VT,F1_diff_RFE
18,CountVectorizer,4,IForest,54606,5361,27303,0.064615,-0.01648
22,CountVectorizer,5,IForest,72233,5670,36116,0.083101,0.014655
21,CountVectorizer,6,IForest,86880,5615,43440,0.067861,0.022661
19,CountVectorizer,7,IForest,99384,5470,49692,0.042451,0.021646
16,TfidfVectorizer,4,IForest,54606,485,27303,-0.12324,0.021127
20,TfidfVectorizer,5,IForest,72233,474,36116,-0.092276,0.048712
17,TfidfVectorizer,6,IForest,86880,449,43440,-0.140548,0.051596
23,TfidfVectorizer,7,IForest,99384,407,49692,-0.128673,0.072034
25,CountVectorizer,4,LOF,54606,5361,27303,0.016403,-0.005481
27,CountVectorizer,5,LOF,72233,5670,36116,0.024453,-0.003893


In [71]:
# show best performing model per model
model_score = results_df.groupby('Model').apply(lambda x: x.nlargest(1, 'F1')).reset_index(drop=True)
model_score.sort_values('F1', ascending=False, inplace=True)
model_score

  model_score = results_df.groupby('Model').apply(lambda x: x.nlargest(1, 'F1')).reset_index(drop=True)


Unnamed: 0,Vectorizer,Ngram,Model,Accuracy,F1,Selection,Features
2,CountVectorizer,4,NB,0.990835,0.949249,RFE,27303
3,TfidfVectorizer,4,RF,0.990102,0.944146,,54606
0,CountVectorizer,5,IForest,0.891854,0.214172,VarianceThreshold,5670
1,CountVectorizer,7,LOF,0.877264,0.124477,VarianceThreshold,5470


In [72]:
# Effect of feature selection on best performing models from V2
from log_reader import read_all_logs
from preprocessors.preprocessor import Preprocessor
from train_models import train_test_split_df
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from models.V2.iforest import IForest
from models.V2.lof import LOF
from models.V2.rf import RF
from models.V2.nb import NB
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


df = read_all_logs(version=2)
unique_syscalls = df['syscall'].unique().tolist()
print("Unique system calls: ", len(unique_syscalls))
df = Preprocessor.get(version=2).preprocess(df)

X_train, X_test, y_train, y_test = train_test_split_df(df=df)

token_pattern = r'\b\w+\b'

# Best V2 models:
best_models = [
    (NB, CountVectorizer, 2, False),
    (IForest, TfidfVectorizer, 2, True),
    (LOF, TfidfVectorizer, 2, True),
    (RF, CountVectorizer, 1, False)
]
best_models_results = []
for model_class, vectorizer_class, ngram, requires_to_binary in best_models:
    # RFE
    props = [model_class.__name__, vectorizer_class.__name__, ngram]
    model = model_class(vectorizer_class(ngram_range=(ngram, ngram), token_pattern=token_pattern))
    X_train_rfe = csr_matrix(model.vectorizer.fit_transform(X_train))
    X_test_rfe = csr_matrix(model.vectorizer.transform(X_test))
    estimator = LogisticRegression(solver='lbfgs', max_iter=1000)
    selector = RFE(estimator, n_features_to_select=int(X_train_rfe.shape[1]/2), step=1000)
    X_train_reduced = selector.fit_transform(X_train_rfe, y_train)
    X_test_reduced = selector.transform(X_test_rfe)
    model.fit_vectorized(X_train_reduced, y_train)
    y_pred = model.instance.predict(X_test_reduced)
    if requires_to_binary:
        y_pred = model.pred_to_binary(y_pred)
    f1 = f1_score(y_test, y_pred)
    props.append(f1)
    # VT
    model = model_class(vectorizer_class(ngram_range=(ngram, ngram), token_pattern=token_pattern))
    selector = VarianceThreshold(threshold=0.0001)
    X_train_varthresh = csr_matrix(model.vectorizer.fit_transform(X_train))
    X_test_varthresh = csr_matrix(model.vectorizer.transform(X_test))
    X_train_reduced = selector.fit_transform(X_train_varthresh)
    X_test_reduced = selector.transform(X_test_varthresh)
    model.fit_vectorized(X_train_reduced, y_train)
    y_pred = model.instance.predict(X_test_reduced)
    if requires_to_binary:
        y_pred = model.pred_to_binary(y_pred)
    f1 = f1_score(y_test, y_pred)
    props.append(f1)
    best_models_results.append(props)

best_models_results_df = pd.DataFrame(best_models_results, columns=['Model', 'Vectorizer', 'Ngram', 'RFE F1', 'VT F1'])
best_models_results_df


    




Reading logs from ../logs/V1
Reading logs from ../logs/V2-1
Reading logs from ../logs/V2
Classifying malicious_ransomwarePOC_10min_1.log as malicious, 13383 malicious entries found
Classifying malicious_ransomwarePOC_10min.log as malicious, 14477 malicious entries found
Classifying malicious_JavaRansomware_30min.log as malicious, 45982 malicious entries found
Classifying malicious_RAASNet-AES_60min.log as malicious, 10348 malicious entries found
Classifying malicious_ransim_20min.log as malicious, 23227 malicious entries found
Classifying malicious_RAASNet-Crypto_20min.log as malicious, 20069 malicious entries found
Classifying malicious_roar-ChaCha20_60min.log as malicious, 33968 malicious entries found
Classifying malicious_roar-AES-CTR_60min.log as malicious, 71676 malicious entries found
Classifying malicious_ransim-slow_60min.log as malicious, 23621 malicious entries found
Classifying malicious_cry_20min.log as malicious, 8185 malicious entries found
Classifying malicious_ransim-s

Unnamed: 0,Model,Vectorizer,Ngram,RFE F1,VT F1
0,NB,CountVectorizer,2,0.958301,0.956351
1,IForest,TfidfVectorizer,2,0.134677,0.109351
2,LOF,TfidfVectorizer,2,0.038982,0.018957
3,RF,CountVectorizer,1,0.945528,0.971452
