# V2 Model Performance on V3 Data

In [28]:
# Evaluate whether V2 models can detect babuk
from log_reader import read_log_file
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

babuk_path = os.path.join(os.getcwd(), '../logs/V3-Eval/malicious_babuk_30min.log')
df = read_log_file(babuk_path)
df = Preprocessor.get(version=2).preprocess(df)
X_test = df['syscall']

# Load models
iforest = load_model('models/trained/v2/v2_IForest_CountVectorizer_1_1.pkl')
lof = load_model('models/trained/v2/v2_LOF_TfIdfVectorizer_2_2.pkl')
nb = load_model('models/trained/v2/v2_NB_CountVectorizer_2_2.pkl')
rf = load_model('models/trained/v2/v2_RF_CountVectorizer_1_1.pkl')

df['iforest'] = iforest.predict(X_test)
df['lof'] = lof.predict(X_test)
df['nb'] = nb.predict(X_test)
df['rf'] = rf.predict(X_test)

df.sort_values(by='timestamp', inplace=True)

def get_first_pred_1_diff_seconds(df, pred_col):
    first_pred_1 = df[(df[pred_col] == 1) & (df['malicious'] == 1)].index[0][1]
    first_malicious_1 = df[df['malicious'] == 1].index[0][1]
    timestamp_diff = first_pred_1 - first_malicious_1
    return timestamp_diff.total_seconds()

res = []
for col in ['iforest', 'lof', 'nb', 'rf']:
    res.append({
        'algorithm': col, 
        'detected_after': get_first_pred_1_diff_seconds(df, col), 
        'F1': f1_score(df['malicious'], df[col]),
        'precision': precision_score(df['malicious'], df[col]),
        'recall': recall_score(df['malicious'], df[col])
    })

babuk_res = pd.DataFrame(res)
babuk_res

Classifying malicious_babuk_30min.log as malicious, 8743 malicious entries found


Unnamed: 0,algorithm,detected_after,F1,precision,recall
0,iforest,5.0,0.302067,0.179245,0.959596
1,lof,5.0,0.231604,0.131507,0.969697
2,nb,5.0,0.842105,0.744186,0.969697
3,rf,5.0,0.930693,0.912621,0.949495


In [37]:
# Evaluate whether V2 models correctly ignore file restoration
from log_reader import read_log_file
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import accuracy_score, confusion_matrix

file_restore_path = os.path.join(os.getcwd(), '../logs/V3-Eval/benign_FileRestoration_20min.log')
df = read_log_file(file_restore_path)
df = Preprocessor.get(version=2).preprocess(df)
X_test = df['syscall']
y_test = df['malicious']

# Load models
iforest = load_model('models/trained/v2/v2_IForest_CountVectorizer_1_1.pkl')
lof = load_model('models/trained/v2/v2_LOF_TfIdfVectorizer_2_2.pkl')
nb = load_model('models/trained/v2/v2_NB_CountVectorizer_2_2.pkl')
rf = load_model('models/trained/v2/v2_RF_CountVectorizer_1_1.pkl')

df['iforest'] = iforest.predict(X_test)
df['lof'] = lof.predict(X_test)
df['nb'] = nb.predict(X_test)
df['rf'] = rf.predict(X_test)

for col in ['iforest', 'lof', 'nb', 'rf']:
    print(f'{col} amount of 7-zip timeframes flagged:', df.loc['1e10', col].sum(), '/', df.loc['1e10'].shape[0])
    print(f'{col} accuracy score: {accuracy_score(y_test, df[col])}')
    display(df[df[col] == 1].value_counts('pid').head(5))
    print('--------------------------------')



iforest amount of 7-zip timeframes flagged: 141 / 168
iforest accuracy score: 0.8445158698272398


pid
1e10    141
440      78
1710     19
be8      17
23fc     13
Name: count, dtype: int64

--------------------------------
lof amount of 7-zip timeframes flagged: 165 / 168
lof accuracy score: 0.7464845319405383


pid
1e10    165
1710     28
440      27
200      26
3d8      23
Name: count, dtype: int64

--------------------------------
nb amount of 7-zip timeframes flagged: 164 / 168
nb accuracy score: 0.9168340699075934


pid
1e10    164
be8      12
2390      9
1dc4      3
23fc      3
Name: count, dtype: int64

--------------------------------
rf amount of 7-zip timeframes flagged: 158 / 168
rf accuracy score: 0.9321012454801125


pid
1e10    158
9b0       2
be8       2
11ac      1
1d80      1
Name: count, dtype: int64

--------------------------------


In [31]:
# Evaluate whether V2 models can detect lockbit
from log_reader import read_log_file
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

lockbit_path = os.path.join(os.getcwd(), '../logs/V3-Eval/malicious_lockbit_30min.log')
df = read_log_file(lockbit_path)
df = Preprocessor.get(version=2).preprocess(df)
X_test = df['syscall']

# Load models
iforest = load_model('models/trained/v2/v2_IForest_CountVectorizer_1_1.pkl')
lof = load_model('models/trained/v2/v2_LOF_TfIdfVectorizer_2_2.pkl')
nb = load_model('models/trained/v2/v2_NB_CountVectorizer_2_2.pkl')
rf = load_model('models/trained/v2/v2_RF_CountVectorizer_1_1.pkl')

df['iforest'] = iforest.predict(X_test)
df['lof'] = lof.predict(X_test)
df['nb'] = nb.predict(X_test)
df['rf'] = rf.predict(X_test)

df.sort_values(by='timestamp', inplace=True)

def get_first_pred_1_diff_seconds(df, pred_col):
    first_pred_1 = df[(df[pred_col] == 1) & (df['malicious'] == 1)].index[0][1]
    first_malicious_1 = df[df['malicious'] == 1].index[0][1]
    timestamp_diff = first_pred_1 - first_malicious_1
    return timestamp_diff.total_seconds()

res = []
for col in ['iforest', 'lof', 'nb', 'rf']:
    res.append({
        'algorithm': col, 
        'detected_after': get_first_pred_1_diff_seconds(df, col), 
        'F1': f1_score(df['malicious'], df[col]),
        'precision': precision_score(df['malicious'], df[col]),
        'recall': recall_score(df['malicious'], df[col])
    })

lockbit_res = pd.DataFrame(res)
lockbit_res

Classifying malicious_lockbit_30min.log as malicious, 31774 malicious entries found


Unnamed: 0,algorithm,detected_after,F1,precision,recall
0,iforest,0.0,0.706897,0.546667,1.0
1,lof,0.0,0.474446,0.310999,1.0
2,nb,0.0,0.957198,0.91791,1.0
3,rf,5.0,0.958159,0.987069,0.930894


In [6]:
# Load all V3-Eval logs and find best V2 model
from log_reader import read_logs_from_dir
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

log_dir = os.path.join(os.getcwd(), '../logs/V3-Eval')
df = read_logs_from_dir(log_dir)
df = Preprocessor.get(version=2).preprocess(df)

df.sort_values(by='timestamp', inplace=True)
res = []

# Load models
model_paths = [f.path for f in os.scandir('models/trained/v2/') if f.is_file() and f.path.endswith('.pkl')]
for model_path in model_paths:
    model = load_model(model_path)
    y_pred = model.predict(df['syscall'])
    y_true = df['malicious']
    res.append({
        'model': str(model),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
    })

model_res = pd.DataFrame(res)
model_res = model_res.sort_values(by='F1', ascending=False)
model_res

Classifying malicious_babuk_30min.log as malicious, 8743 malicious entries found
Classifying malicious_lockbit_30min.log as malicious, 31774 malicious entries found
['models/trained/v2/v2_LOF_CountVectorizer_5_5.pkl', 'models/trained/v2/v2_LOF_CountVectorizer_1_1.pkl', 'models/trained/v2/v2_LOF_CountVectorizer_3_3.pkl', 'models/trained/v2/v2_RF_CountVectorizer_4_4.pkl', 'models/trained/v2/v2_RF_CountVectorizer_2_2.pkl', 'models/trained/v2/v2_IForest_TfidfVectorizer_5_5.pkl', 'models/trained/v2/v2_IForest_TfidfVectorizer_3_3.pkl', 'models/trained/v2/v2_IForest_TfidfVectorizer_1_1.pkl', 'models/trained/v2/v2_NB_CountVectorizer_2_2.pkl', 'models/trained/v2/v2_NB_CountVectorizer_4_4.pkl', 'models/trained/v2/v2_NB_CountVectorizer_1_1.pkl', 'models/trained/v2/v2_NB_CountVectorizer_3_3.pkl', 'models/trained/v2/v2_NB_CountVectorizer_5_5.pkl', 'models/trained/v2/v2_IForest_TfidfVectorizer_4_4.pkl', 'models/trained/v2/v2_IForest_TfidfVectorizer_2_2.pkl', 'models/trained/v2/v2_RF_CountVectorizer_

Unnamed: 0,model,F1,precision,recall
4,RF_CountVectorizer_2_2,0.763636,0.628037,0.973913
17,RF_CountVectorizer_1_1,0.760895,0.640873,0.936232
26,RF_TfidfVectorizer_2_2,0.75761,0.619926,0.973913
16,RF_CountVectorizer_3_3,0.7339,0.631799,0.875362
8,NB_CountVectorizer_2_2,0.720759,0.566225,0.991304
34,RF_TfidfVectorizer_3_3,0.710947,0.617521,0.837681
10,NB_CountVectorizer_1_1,0.695122,0.535211,0.991304
11,NB_CountVectorizer_3_3,0.684393,0.569231,0.857971
33,RF_TfidfVectorizer_1_1,0.675159,0.602273,0.768116
20,NB_TfidfVectorizer_2_2,0.673892,0.510448,0.991304


# V3 Models on V3 Eval Data
In V3, the models were trained on additional, system call intensive benign data.

In [2]:
# Evaluate whether V3 models correctly ignore file restoration
from log_reader import read_log_file
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import accuracy_score, confusion_matrix

file_restore_path = os.path.join(os.getcwd(), '../logs/V3-Eval/benign_FileRestoration_20min.log')
df = read_log_file(file_restore_path)
df = Preprocessor.get(version=2).preprocess(df)
X_test = df['syscall']
y_test = df['malicious']

# Load models
iforest = load_model('models/trained/v3/v3_IForest_CountVectorizer_1_1.pkl')
lof = load_model('models/trained/v3/v3_LOF_TfIdfVectorizer_2_2.pkl')
nb = load_model('models/trained/v3/v3_NB_CountVectorizer_2_2.pkl')
rf = load_model('models/trained/v3/v3_RF_CountVectorizer_1_1.pkl')

df['iforest'] = iforest.predict(X_test)
df['lof'] = lof.predict(X_test)
df['nb'] = nb.predict(X_test)
df['rf'] = rf.predict(X_test)

for col in ['iforest', 'lof', 'nb', 'rf']:
    print(f'{col} amount of 7-zip timeframes flagged:', df.loc['1e10', col].sum(), '/', df.loc['1e10'].shape[0])
    print(f'{col} accuracy score: {accuracy_score(y_test, df[col])}')
    display(df[df[col] == 1].value_counts('pid').head(5))
    print('--------------------------------')



iforest amount of 7-zip timeframes flagged: 118 / 168
iforest accuracy score: 0.8810767376456409


pid
1e10    118
440      81
1710     20
be8      13
2390      8
Name: count, dtype: int64

--------------------------------
lof amount of 7-zip timeframes flagged: 164 / 168
lof accuracy score: 0.7818400964242668


pid
1e10    164
1710     26
440      20
200      20
3d8      19
Name: count, dtype: int64

--------------------------------
nb amount of 7-zip timeframes flagged: 160 / 168
nb accuracy score: 0.9228605865809562


pid
1e10    160
be8       9
2390      8
2050      2
9b0       2
Name: count, dtype: int64

--------------------------------
rf amount of 7-zip timeframes flagged: 65 / 168
rf accuracy score: 0.9706709521896344


pid
1e10    65
be8      2
11ac     1
1d80     1
1f88     1
Name: count, dtype: int64

--------------------------------


In [30]:
# Load all V3-Eval logs and find best V3 model
from log_reader import read_logs_from_dir
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

log_dir = os.path.join(os.getcwd(), '../logs/V3-Eval')
df = read_logs_from_dir(log_dir)
df = Preprocessor.get(version=2).preprocess(df)

df.sort_values(by='timestamp', inplace=True)
res = []

# Load models
model_paths = [f.path for f in os.scandir('models/trained/v3/') if f.is_file() and f.path.endswith('.pkl')]
for model_path in model_paths:
    model = load_model(model_path)
    y_pred = model.predict(df['syscall'])
    y_true = df['malicious']
    res.append({
        'model': str(model),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
    })

model_res = pd.DataFrame(res)
model_res = model_res.sort_values(by='F1', ascending=False)
model_res

Classifying malicious_babuk_30min.log as malicious, 8743 malicious entries found
Classifying malicious_lockbit_30min.log as malicious, 31774 malicious entries found


Unnamed: 0,model,F1,precision,recall
38,RF_TfidfVectorizer_2_2,0.797153,0.674699,0.973913
16,RF_CountVectorizer_2_2,0.787234,0.664671,0.965217
7,RF_CountVectorizer_3_3,0.766623,0.696682,0.852174
8,RF_CountVectorizer_1_1,0.766484,0.72846,0.808696
19,NB_CountVectorizer_2_2,0.741062,0.591696,0.991304
5,NB_CountVectorizer_1_1,0.716823,0.560458,0.994203
25,RF_TfidfVectorizer_3_3,0.713359,0.64554,0.797101
6,NB_CountVectorizer_3_3,0.702326,0.586408,0.875362
29,NB_TfidfVectorizer_1_1,0.695829,0.53605,0.991304
37,NB_TfidfVectorizer_2_2,0.693009,0.53271,0.991304


# V3-1 Models on V3 Eval Data
In V3-1, the models were oversampled, by copying the data collected in V3 and duplicating it in V3-1

In [31]:
# Load all V3-Eval logs and find best V3 model
from log_reader import read_logs_from_dir
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

log_dir = os.path.join(os.getcwd(), '../logs/V3-Eval')
df = read_logs_from_dir(log_dir)
df = Preprocessor.get(version=2).preprocess(df)

df.sort_values(by='timestamp', inplace=True)
res = []

# Load models
model_paths = [f.path for f in os.scandir('models/trained/v3-1/') if f.is_file() and f.path.endswith('.pkl')]
for model_path in model_paths:
    model = load_model(model_path)
    y_pred = model.predict(df['syscall'])
    y_true = df['malicious']
    res.append({
        'model': str(model),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
    })

model_res = pd.DataFrame(res)
model_res = model_res.sort_values(by='F1', ascending=False)
model_res

Classifying malicious_babuk_30min.log as malicious, 8743 malicious entries found
Classifying malicious_lockbit_30min.log as malicious, 31774 malicious entries found


Unnamed: 0,model,F1,precision,recall
38,RF_TfidfVectorizer_2_2,0.82454,0.714894,0.973913
16,RF_CountVectorizer_2_2,0.787234,0.664671,0.965217
7,RF_CountVectorizer_3_3,0.758534,0.672646,0.869565
19,NB_CountVectorizer_2_2,0.748359,0.601054,0.991304
5,NB_CountVectorizer_1_1,0.726695,0.572621,0.994203
6,NB_CountVectorizer_3_3,0.715447,0.596899,0.892754
25,RF_TfidfVectorizer_3_3,0.707535,0.63242,0.802899
37,NB_TfidfVectorizer_2_2,0.695829,0.53605,0.991304
29,NB_TfidfVectorizer_1_1,0.695829,0.53605,0.991304
8,RF_CountVectorizer_1_1,0.67341,0.67147,0.675362


# V3-2 Models on V3 Eval Data
In V3-2, the system-call-intensive benign samples from V3 are naively oversampled

In [6]:
# Create oversampled benign systemcall intensive dataset
from log_reader import read_logs_from_dir
from preprocessors.preprocessor import Preprocessor
import os
import pandas as pd
from datetime import datetime

log_dir = os.path.join(os.getcwd(), '../logs/V3')
df = read_logs_from_dir(log_dir)
prep_df = Preprocessor.get(version=2).group_by_pid_and_timestamp(df)

filtered_df = prep_df[prep_df['syscall'].apply(lambda x: len(x) > 20)]
# Get systemcall intensive pids
syscall_intense_pids = list(filtered_df.value_counts('pid').head(5).index)
print(syscall_intense_pids)

# create unprocessed system-call intensive dataset
df = df[df['pid'].isin(syscall_intense_pids)]

# Oversample benign data
stacked_df = pd.concat([df, df, df], ignore_index=True).reset_index()

# Ensure the timestamp is in datetime format, date is set to current date + 1 day to avoid syscall being in same timeframe
stacked_df['timestamp'] = pd.to_datetime(stacked_df['timestamp'])
tomorrow_date = datetime.now().date() + datetime.timedelta(days=1)
stacked_df['timestamp'] = stacked_df['timestamp'].apply(lambda x: x.replace(year=tomorrow_date.year, month=tomorrow_date.month, day=tomorrow_date.day))
stacked_df['timestamp'] = stacked_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

stacked_df.drop(columns=['index', 'malicious'], inplace=True)
stacked_df = stacked_df[stacked_df['pname'] != '7z.exe']
os.makedirs('../logs/V3-2', exist_ok=True)
stacked_df.to_csv('../logs/V3-2/benign_oversampled_.log', index=False, header=False)

['27b8', '1864', 'd18', '1694', '2124']


In [8]:
# Load all V3-Eval logs and find best V3 model
from log_reader import read_logs_from_dir
from train_models import load_model
from preprocessors.preprocessor import Preprocessor
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd

log_dir = os.path.join(os.getcwd(), '../logs/V3-Eval')
df = read_logs_from_dir(log_dir)
df = Preprocessor.get(version=2).preprocess(df)

df.sort_values(by='timestamp', inplace=True)
res = []

# Load models
model_paths = [f.path for f in os.scandir('models/trained/v3-2/') if f.is_file() and f.path.endswith('.pkl')]
for model_path in model_paths:
    model = load_model(model_path)
    y_pred = model.predict(df['syscall'])
    y_true = df['malicious']
    res.append({
        'model': str(model),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred)
    })

model_res = pd.DataFrame(res)
model_res = model_res.sort_values(by='F1', ascending=False)
model_res

Classifying malicious_babuk_30min.log as malicious, 8743 malicious entries found
Classifying malicious_lockbit_30min.log as malicious, 31774 malicious entries found


Unnamed: 0,model,F1,precision,recall
8,RF_CountVectorizer_1_1,0.852984,0.856725,0.849275
38,RF_TfidfVectorizer_2_2,0.800478,0.680894,0.971014
7,RF_CountVectorizer_3_3,0.791349,0.705215,0.901449
16,RF_CountVectorizer_2_2,0.790588,0.665347,0.973913
19,NB_CountVectorizer_2_2,0.744287,0.595819,0.991304
5,NB_CountVectorizer_1_1,0.728238,0.574539,0.994203
6,NB_CountVectorizer_3_3,0.708145,0.580705,0.907246
37,NB_TfidfVectorizer_2_2,0.700102,0.541139,0.991304
29,NB_TfidfVectorizer_1_1,0.699387,0.540284,0.991304
25,RF_TfidfVectorizer_3_3,0.676393,0.623472,0.73913


# V3-3 Models trained on Babuk, LockBit and File recovery samples
Note that V3-2 has been discarded, as it led to worse performance

In [5]:
# V3-3 best model performances
from train_models import load_scores_from_dir
import os

# Load model scores
models_dir = os.path.join(os.getcwd(), 'models/trained/v3-3')
model_score = load_scores_from_dir(models_dir)

# show best performing model per model
model_score = model_score.groupby('Model').apply(lambda x: x.nlargest(1, 'F1')).reset_index(drop=True)
model_score.sort_values('F1', ascending=False, inplace=True)
model_score

  model_score = model_score.groupby('Model').apply(lambda x: x.nlargest(1, 'F1')).reset_index(drop=True)


Unnamed: 0,Score,Duration,F1,Recall,Precision,Model,Model_Type,Scaler,Min Ngram,Max Ngram
3,0.994961,23.800034,0.965732,0.984127,0.948012,RF,Classification,CountVectorizer,1,1
2,0.989578,5.098903,0.927605,0.966833,0.891437,NB,Classification,CountVectorizer,4,4
1,0.909065,90.186113,0.480366,0.419908,0.561162,LOF,Anomaly Detection,CountVectorizer,4,4
0,0.863139,8.318848,0.235445,0.20242,0.281346,IForest,Anomaly Detection,CountVectorizer,2,2
