In [56]:
import pandas as pd


def get_relevant_columns(df):
    """Return a copy of ``df`` without the process-name, thread-id and register columns.

    Drops ``pname``, ``tid``, ``rcx``, ``rdx``, ``r8`` and ``r9``; every other
    column is kept. The input frame is not modified.
    """
    unused_columns = ['pname', 'tid', 'rcx', 'rdx', 'r8', 'r9']
    return df.drop(unused_columns, axis=1)

def read_file(file_path):
    """Load a headerless syscall log and collect each process's syscalls.

    Parameters
    ----------
    file_path : str or path-like
        CSV file without a header row; its columns are read as
        pname, pid, tid, syscall, rcx, rdx, r8, r9.

    Returns
    -------
    pandas.DataFrame
        One row per ``pid`` with a ``syscall`` column holding the list of
        that pid's syscall values in the order they appear in the file.
    """
    df = pd.read_csv(
        file_path,
        header=None,
        names=["pname", "pid", "tid", "syscall", "rcx", "rdx", "r8", "r9"],
        # The ids are hex-like text (e.g. '150c', 'f'). Without an explicit
        # dtype pandas infers numbers for decimal-looking values (and floats
        # for ones such as '1e5'), which can leave mixed types in a column and
        # break both string joins and string comparisons on pid.
        dtype=str,
    )
    filtered_df = get_relevant_columns(df)
    return filtered_df.groupby('pid').agg({'syscall': list}).reset_index()


# Load the ransomware PoC trace: one row per pid with its list of syscalls.
grouped_df = read_file('logs/ransomwarePOC_60s.log')
# Ground-truth label: pid '2118' is marked malicious (1), every other pid 0.
grouped_df['malicious'] = grouped_df['pid'].map(lambda pid: int(pid == '2118'))
grouped_df


Unnamed: 0,pid,syscall,malicious
0,150c,"[48, 39, f, 34, e, 4, e, e, e, 4, e, 4, 4, 4, ...",0
1,1568,"[1524, 1046, 121, 32, 121, 21, 121, 121, 32, 1...",0
2,1734,"[f, 4, 154, 4, 19, f, 16, 154, 4, f, 4, 4, 16,...",0
3,1894,[6],0
4,19a8,[f],0
5,1b64,[53],0
6,1c0c,"[4, 4, 1b0, 4, 1b0, 4, 4, 4, 4, 4, 4, 4, 4, 4,...",0
7,1e1c,"[24, 24, 24, d, 7f, d, 171, 7, 1d5, 171, 1d5, ...",0
8,20ac,"[1486, 34, 1486, 48, 34, 1486]",0
9,2108,"[9, 23, 18, a1, 9]",0


In [65]:
# Bag-of-n-grams Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

def list_to_str(lst: list) -> str:
    """Join a sequence of syscall ids into one space-separated string.

    Elements are converted with ``str`` so numeric ids do not raise
    ``TypeError``. A value that is already a string is returned unchanged:
    joining it again would insert a space between every character and
    destroy the tokens.
    """
    if isinstance(lst, str):
        return lst
    return ' '.join(map(str, lst))

# Training corpus: two hand-picked traces selected by position (rows 1 and 10).
# NOTE(review): row 10 is presumably the process labelled malicious -- confirm.
corpus_df = grouped_df.iloc[[1, 10], :].copy()
# Transform the corpus rows themselves rather than relying on index alignment
# with a transform of the full frame.
corpus_df['syscall'] = corpus_df['syscall'].apply(list_to_str)
test_df = grouped_df.copy()
test_df['syscall'] = test_df['syscall'].apply(list_to_str)
# Create pipeline
# token_pattern=r"\S+" keeps every whitespace-separated syscall id. The default
# pattern only keeps tokens of two or more characters, which silently drops
# single-character ids such as 'f', '4' and 'e'.
pipeline = make_pipeline(
    CountVectorizer(ngram_range=(6, 6), token_pattern=r"\S+"),
    MultinomialNB(),
)
# Fit the model
pipeline.fit(corpus_df['syscall'], corpus_df['malicious'])
# Testing the model
# test_df['syscall'] is already space-joined; joining it a second time would
# split every token into single characters.
predictions = pipeline.predict(test_df['syscall'])

grouped_df['predictions'] = predictions
grouped_df
# NOTE: the recorded output came from a run without the tokenization fixes in
# this cell and did not detect the ransomware process; re-run to refresh it.



Unnamed: 0,pid,syscall,malicious,predictions
0,150c,"[48, 39, f, 34, e, 4, e, e, e, 4, e, 4, 4, 4, ...",0,0
1,1568,"[1524, 1046, 121, 32, 121, 21, 121, 121, 32, 1...",0,0
2,1734,"[f, 4, 154, 4, 19, f, 16, 154, 4, f, 4, 4, 16,...",0,0
3,1894,[6],0,0
4,19a8,[f],0,0
5,1b64,[53],0,0
6,1c0c,"[4, 4, 1b0, 4, 1b0, 4, 4, 4, 4, 4, 4, 4, 4, 4,...",0,0
7,1e1c,"[24, 24, 24, d, 7f, d, 171, 7, 1d5, 171, 1d5, ...",0,0
8,20ac,"[1486, 34, 1486, 48, 34, 1486]",0,0
9,2108,"[9, 23, 18, a1, 9]",0,0


In [39]:
# Anomaly detection with Isolation Forest
from pyod.models.iforest import IForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

def to_dense(x):
    """Return ``x.toarray()``, e.g. to densify a sparse matrix inside a pipeline step."""
    dense = x.toarray()
    return dense

# Expected share of outliers, passed to IForest below.
contamination = 0.05
# Benign baseline trace; every process in it is labelled 0.
benign_df = read_file('logs/benign_file_ops_60s.log')
benign_df['malicious'] = 0

benign_df['syscall'] = benign_df['syscall'].apply(list_to_str)

X_train = benign_df['syscall']
# grouped_df['syscall'] holds lists of ids; the vectorizer needs one
# space-separated string per process, exactly like the training data.
X_test = grouped_df['syscall'].apply(list_to_str)

# token_pattern=r"\S+" keeps single-character syscall ids, which the default
# pattern (tokens of two or more characters) would drop.
scaler = CountVectorizer(ngram_range=(6,6), token_pattern=r"\S+")
X_train = scaler.fit_transform(X_train).toarray()
X_test = scaler.transform(X_test).toarray()

# Fixed seed so the forest, and therefore the predictions, are reproducible.
iforest = IForest(contamination=contamination, random_state=42)

iforest.fit(X_train)
iforest.predict(X_test)

# Equivalent pipeline form, kept for reference:
# pipeline = make_pipeline(CountVectorizer(ngram_range=(6,6), token_pattern=r"\S+"), FunctionTransformer(to_dense, accept_sparse=True), IForest(contamination=contamination, random_state=42))

# pipeline.fit(benign_df['syscall'])

# predictions = pipeline.predict(grouped_df['syscall'].apply(list_to_str))
# grouped_df['predictions'] = predictions
# grouped_df

# NOTE: the recorded output came from a run without the fixes in this cell and
# did not flag the ransomware process; re-run to refresh the conclusion.


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])