In [60]:
import os
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [66]:
# Root of the ADFA-LD dataset. Set the ADFA_LD_DIR environment variable to
# point at another copy of the data; when it is unset, the original local
# path is used so existing setups keep working.
base_path = os.environ.get(
    "ADFA_LD_DIR",
    r"C:\Users\akars\OneDrive\Desktop\model_os\ADFA-LD\ADFA-LD",
)

# The three dataset sub-folders, all resolved relative to base_path.
train_path = os.path.join(base_path, "Training_Data_Master")
val_path = os.path.join(base_path, "Validation_Data_Master")
attack_path = os.path.join(base_path, "Attack_Data_Master")

In [67]:
def read_syscalls_from_dir(directory, label):
    """Load every non-empty system-call trace found under ``directory``.

    The directory tree is walked recursively. Each file is expected to hold
    whitespace-separated integers; files that are empty or contain only
    whitespace are skipped.

    Args:
        directory: Root folder to scan.
        label: Value attached to every sequence read from this folder.

    Returns:
        A list of ``(sequence, label)`` tuples, where ``sequence`` is a list
        of ints. Entries are ordered by sorted directory and file name, so the
        order is the same on every run and every filesystem. A directory that
        cannot be listed yields an empty list, because ``os.walk`` ignores
        listing errors by default.

    Raises:
        ValueError: If a file contains a token that cannot be parsed as an
            integer (this includes undecodable bytes, since
            ``UnicodeDecodeError`` is a ``ValueError``).
    """
    data = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directory list in place fixes the traversal order, and sorting
        # the file names fixes the order within each folder.
        dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            # Explicit encoding so parsing does not depend on the OS locale.
            with open(file_path, encoding="utf-8") as f:
                content = f.read().strip()
            if content:  # skip empty files
                sequence = [int(token) for token in content.split()]
                data.append((sequence, label))
    return data


In [68]:
# Class labels: 0 marks benign traces, 1 marks attack traces.
BENIGN_LABEL, MALICIOUS_LABEL = 0, 1

# Load each dataset folder into a list of (sequence, label) pairs.
benign_train = read_syscalls_from_dir(directory=train_path, label=BENIGN_LABEL)
benign_val = read_syscalls_from_dir(directory=val_path, label=BENIGN_LABEL)
malicious = read_syscalls_from_dir(directory=attack_path, label=MALICIOUS_LABEL)


In [69]:
# Pool every labelled trace into one list: benign training traces first,
# then benign validation traces, then attack traces.
all_data = [*benign_train, *benign_val, *malicious]

# Sanity check on the class sizes
benign_total = len(benign_train) + len(benign_val)
print("Total samples:", len(all_data))
print("Benign samples:", benign_total)
print("Malicious samples:", len(malicious))

Total samples: 5951
Benign samples: 5205
Malicious samples: 746


In [70]:
def get_ngrams(seq, n=6):
    """Return every contiguous window of ``n`` items of ``seq`` as a string.

    Each window is rendered by converting its items with ``str`` and joining
    them with single spaces. A sequence shorter than ``n`` produces an empty
    list.

    Args:
        seq: Sliceable sequence of items (for example a list of ints).
        n: Window length; defaults to 6.

    Returns:
        A list of space-joined window strings, in order of starting position.
    """
    tokens = [str(item) for item in seq]
    window_count = len(seq) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]


In [72]:
# Build the model inputs from the loaded traces: one document per trace made
# of its space-joined n-grams, plus the matching label vector.
# NOTE(review): no cell in this notebook defines X_raw or y (the execution
# counts jump from 70 to 72), so this cell failed on a fresh kernel. The
# reconstruction below follows the "n-gram sequences" comment and the helper
# defined above - confirm it matches the original intent.
X_raw = [" ".join(get_ngrams(seq)) for seq, _ in all_data]
y = np.array([label for _, label in all_data])

# Fit TF-IDF on all n-gram sequences
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so each document is split back into individual
# syscall numbers and single-digit numbers are dropped - confirm that this is
# the intended feature set.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X_raw)


In [73]:
# X comes from tfidf.fit_transform(X_raw): one row per entry of X_raw, one
# column per term in the fitted TF-IDF vocabulary.
print("Shape of feature matrix:", X.shape)


Shape of feature matrix: (5951, 167)


In [74]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [57]:
# NOTE(review): in file order this cell runs before `model` is created by the
# RandomForestClassifier cell further down, so it raises NameError under
# Restart & Run All. Its execution count (57) is also lower than that of every
# other cell, so the recorded output may come from an earlier session. Move
# this cell after the training cell, or delete it.
# Overall accuracy of the classifier's predictions on the held-out split.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.9505


In [75]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)


In [76]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out split once and derive both summaries from the same
# predictions.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[1018   24]
 [  33  116]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      1042
           1       0.83      0.78      0.80       149

    accuracy                           0.95      1191
   macro avg       0.90      0.88      0.89      1191
weighted avg       0.95      0.95      0.95      1191



In [78]:
# Save the trained classifier
joblib.dump(model, 'rf_ids_model.joblib')

# Save the TF-IDF vectorizer. The vectorizer fitted in this notebook is bound
# to `tfidf`; the name `vectorizer` is not defined in any cell, so dumping it
# fails on a fresh kernel or silently persists a stale object.
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']