In [1]:
import pandas as pd
import os

# Lookup used by classify_malicious: the key is the executable token taken from
# a log file name, the value is the process name ("pname" column) whose rows
# get labelled malicious.
executable_pname_dict = dict(
    ransomwarePOC='RansomwarePOC.',
)

def read_file(file_path):
    """Load one headerless syscall log (CSV) into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per logged event with the columns
        timestamp, pname, pid, tid, syscall, rcx, rdx, r8, r9.
        ``pid`` and ``tid`` are always read as strings.
    """
    column_names = ["timestamp", "pname", "pid", "tid", "syscall", "rcx", "rdx", "r8", "r9"]
    # Process ids in these logs are hexadecimal text (e.g. "c90", "45c").
    # Under pandas' type inference a digits-only id such as "458" can become
    # the integer 458 and "1e5" the float 100000.0, so the same process could
    # end up under differently typed keys. Reading the ids as strings avoids
    # that. NOTE(review): tid is assumed to use the same format - confirm.
    id_dtypes = {"pid": str, "tid": str}
    return pd.read_csv(file_path, header=None, names=column_names, dtype=id_dtypes)

def group_by_pid_and_ten_seconds(df):
    """Collect each process's syscalls into 10-second windows.

    The input frame is left untouched: the bucketed timestamps are written to
    a new frame rather than back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``timestamp``, ``pid``, ``syscall`` and
        ``malicious``. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``pid``, ``timestamp``), where ``timestamp`` is the UTC
        start of the 10-second window. Column ``syscall`` holds the list of
        syscalls in row order; ``malicious`` holds the label of the first row
        in the window.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Parse as UTC and round down to the start of the 10-second window.
    window_start = pd.to_datetime(df['timestamp'], utc=True).dt.floor('10s')
    # Selecting only the needed columns (instead of dropping the others) means
    # frames without the register/thread columns are accepted too.
    events = df[['pid', 'syscall', 'malicious']].assign(timestamp=window_start)
    grouped_df = events.groupby(['pid', 'timestamp']).agg({'syscall': list, 'malicious': 'first'})
    return grouped_df

def classify_malicious(df, file_name):
    """Add a 0/1 ``malicious`` column to ``df`` based on the log's file name.

    The first two underscore-separated tokens of ``file_name`` are read as
    ``<label>_<executable>``. Every row starts at 0. Unless the label is
    ``benign``, rows whose ``pname`` equals
    ``executable_pname_dict[<executable>]`` are set to 1.

    The frame is modified in place and also returned.
    """
    label, exe_key = file_name.split('_')[:2]
    df['malicious'] = 0
    if label == 'benign':
        return df
    # Only the rows produced by the named executable count as malicious.
    from_target_process = df['pname'] == executable_pname_dict[exe_key]
    df.loc[from_target_process, 'malicious'] = 1
    return df

# c90 is pid of RansomwarePOC
# grouped_df = read_file('logs/ransomwarePOC_10min.log')
# grouped_df = group_by_pid_and_ten_seconds(grouped_df)
# grouped_df['malicious'] = 0
# grouped_df.loc['c90', 'malicious'] = 1
# grouped_df[grouped_df['malicious'] == 1].count()

# Collect every *.log file in the logs directory.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the row order of the combined frame the same from run to run.
logs_dir = os.path.abspath('logs/')
files = sorted(
    os.path.join(logs_dir, name)
    for name in os.listdir(logs_dir)
    if name.endswith('.log')
)
if not files:
    # Fail here with a clear message rather than inside pd.concat below.
    raise FileNotFoundError(f"No .log files found in {logs_dir}")

# Read each log and label its rows from the file name.
dfs = []
for file in files:
    file_name = os.path.basename(file)
    dfs.append(classify_malicious(read_file(file), file_name))

# Each per-file frame is numbered from 0; ignore_index gives the combined
# frame unique row labels instead of repeating them once per file.
df = pd.concat(dfs, ignore_index=True)




In [None]:
# files_to_fix = [file for file in files if os.path.basename(file)!= 'malicious_ransomwarePOC_10min.log']

# for i in range(len(files_to_fix)):
#     fix_df = pd.read_csv(files_to_fix[i], header=None, names=["timestamp","pname", "pid", "tid", "bsDigit", "syscall", "rcx", "rdx", "r8", "r9"])
#     fix_df = fix_df.drop(columns=['bsDigit'])
#     print(files_to_fix[i])
#     fix_df.to_csv(files_to_fix[i], index=False, header=False)

In [2]:
# Bag-of-n-grams Naive Bayes classifier over per-process syscall windows
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def list_to_str(lst: list):
    """Join the strings in ``lst`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)

# Seed shared by the benign down-sampling and the train/test split so that
# re-running this cell reproduces the same split and score.
NB_RANDOM_STATE = 42

classifier_df = group_by_pid_and_ten_seconds(df.copy())

# Balance the classes: keep every malicious window and draw an equally sized
# random sample of benign windows. The sample size is capped at the number of
# benign windows so the cell does not fail when benign data is the minority.
malicious_sample = classifier_df.loc[classifier_df['malicious'] == 1]
benign_windows = classifier_df.loc[classifier_df['malicious'] == 0]
benign_sample = benign_windows.sample(
    n=min(len(malicious_sample), len(benign_windows)),
    random_state=NB_RANDOM_STATE,
)
classifier_sample = pd.concat([malicious_sample, benign_sample])

# CountVectorizer works on text, so each window's syscall list becomes one
# space-separated string.
classifier_sample['syscall'] = classifier_sample['syscall'].apply(list_to_str)

# Stratify so the train and test sets keep the same class ratio.
# NOTE(review): windows of the same pid can land in both train and test,
# which may inflate the score - consider splitting by process/log file.
X_train, X_test, y_train, y_test = train_test_split(
    classifier_sample['syscall'],
    classifier_sample['malicious'],
    test_size=0.2,
    stratify=classifier_sample['malicious'],
    random_state=NB_RANDOM_STATE,
)

# Despite its name, `scaler` is a CountVectorizer: it counts runs of 6
# consecutive syscall tokens. It is fitted on the training windows only.
scaler = CountVectorizer(ngram_range=(6, 6))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = MultinomialNB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
y_pred = model.predict(X_test)
comp_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
comp_df





1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,y_test,y_pred
pid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
458,2024-04-18 07:50:10+00:00,0,0
45c,2024-04-18 08:19:40+00:00,1,1
24d8,2024-04-18 07:36:00+00:00,0,0
45c,2024-04-18 08:20:30+00:00,1,1
418,2024-04-18 07:33:10+00:00,0,0
10fc,2024-04-18 07:48:30+00:00,0,0
c90,2024-04-17 14:28:20+00:00,1,1
2510,2024-04-20 15:38:30+00:00,0,0
c90,2024-04-17 14:33:10+00:00,1,1
45c,2024-04-18 08:16:40+00:00,1,1


In [5]:
# Bag-of-n-grams anomaly detection with an Isolation Forest (pyod)
from pyod.models.iforest import IForest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def to_dense(x):
    """Return the result of calling ``x.toarray()``."""
    dense = x.toarray()
    return dense

# Seed shared by the train/test split and the forest so that re-running this
# cell reproduces the same predictions.
IFOREST_RANDOM_STATE = 42

anomaly_df = group_by_pid_and_ten_seconds(df.copy())

# One space-separated string of syscalls per (pid, 10-second window).
anomaly_df['syscall'] = anomaly_df['syscall'].apply(list_to_str)

# Stratify so the malicious windows are spread over train and test in the
# same proportion as in the full data.
X_train, X_test, y_train, y_test = train_test_split(
    anomaly_df['syscall'],
    anomaly_df['malicious'],
    test_size=0.2,
    stratify=anomaly_df['malicious'],
    random_state=IFOREST_RANDOM_STATE,
)

# `scaler` is a CountVectorizer counting runs of 6 consecutive syscall tokens.
# Its sparse output is converted to dense arrays before being given to the
# forest, using the to_dense helper defined in this cell.
scaler = CountVectorizer(ngram_range=(6,6))
X_train = to_dense(scaler.fit_transform(X_train))
X_test = to_dense(scaler.transform(X_test))

# The forest is fitted without labels; y_train is not used.
iforest = IForest(random_state=IFOREST_RANDOM_STATE)

iforest.fit(X_train)
y_pred = iforest.predict(X_test)
comp_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

# NOTE(review): if benign windows dominate the test set, accuracy alone says
# little about detection quality - check the mismatches shown below as well.
print(accuracy_score(y_test, y_pred))
n_correct = int((comp_df['y_test'] == comp_df['y_pred']).sum())
print(f"Correct predictions: {n_correct} / {comp_df.shape[0]}")
comp_df[comp_df['y_test'] != comp_df['y_pred']]


0.9536423841059603
Correct predictions: 864 / 906


Unnamed: 0_level_0,Unnamed: 1_level_0,y_test,y_pred
pid,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1
418,2024-04-18 07:35:00+00:00,0,1
2510,2024-04-20 15:37:50+00:00,0,1
27dc,2024-04-18 08:27:50+00:00,0,1
41c,2024-04-17 14:37:00+00:00,0,1
41c,2024-04-17 14:36:00+00:00,0,1
1b8c,2024-04-18 07:36:00+00:00,0,1
418,2024-04-18 07:35:30+00:00,0,1
41c,2024-04-17 14:35:50+00:00,0,1
1b8c,2024-04-18 07:37:30+00:00,0,1
237c,2024-04-18 07:48:00+00:00,0,1
