# In [None]:
import os

# Dataset locations are resolved relative to the current working directory;
# the project root is taken to be its parent directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))

# Directory that holds both the normal log file and the attack log tree.
BASE_LOG_DIR = os.path.join(
    PROJECT_ROOT, "data", "logs", "logs", "ADFA_log", "ADFA-LD"
)

# NOTE(review): the file name suggests a syscall listing rather than trace
# data - confirm this is the intended source of "normal" training logs.
NORMAL_LOG_FILE = os.path.join(BASE_LOG_DIR, "ADFA-LD+Syscall+List.txt")
ATTACK_LOG_DIR = os.path.join(BASE_LOG_DIR, "Attack_Data_Master")

# Quick sanity check that the expected dataset layout is present.
print("Normal log file exists:", os.path.exists(NORMAL_LOG_FILE))
print("Attack dir exists:", os.path.exists(ATTACK_LOG_DIR))
## load normal logs for training
def load_normal_logs(file_path):
    """Return every line of ``file_path`` with surrounding whitespace removed.

    The file is opened in text mode with ``errors="ignore"``, so bytes that
    cannot be decoded are skipped. Blank lines are kept and appear as empty
    strings in the result.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(file_path, "r", errors="ignore") as handle:
        return [raw_line.strip() for raw_line in handle]

normal_logs = load_normal_logs(NORMAL_LOG_FILE)

print("Total normal log lines:", len(normal_logs))

# Guard the sample print: indexing an empty list would raise IndexError and
# hide the real problem (an empty input file).
if normal_logs:
    print("Sample normal log:", normal_logs[0])
else:
    print("No normal logs found — check dataset integrity")

## load attack logs for testing
def load_attack_logs_recursive(root_dir):
    """Collect the lines of every file found under ``root_dir``.

    The directory tree is walked top-down. Sub-directories and files are
    visited in sorted name order so the returned list is the same on every
    run and every filesystem (``os.walk`` itself gives no ordering guarantee).
    Each line is stripped of surrounding whitespace, matching what
    ``load_normal_logs`` returns.

    Files are opened in text mode with ``errors="ignore"``, so undecodable
    bytes are skipped.

    Args:
        root_dir: Directory to search recursively. If it does not exist,
            ``os.walk`` yields nothing and an empty list is returned.

    Returns:
        A list of stripped lines from all files, in traversal order.
    """
    logs = []

    for root, dirs, files in os.walk(root_dir):
        # Sorting ``dirs`` in place controls the order in which os.walk
        # descends into sub-directories.
        dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            with open(file_path, "r", errors="ignore") as f:
                logs.extend(line.strip() for line in f)

    return logs

attack_logs = load_attack_logs_recursive(ATTACK_LOG_DIR)

print("Total attack log lines:", len(attack_logs))

# Only show a sample when something was actually loaded; an empty result
# most likely means the dataset directory is missing or incomplete.
if attack_logs:
    print("Sample attack log:", attack_logs[0])
else:
    # The dash in this message was mis-encoded (mojibake) in the original.
    print("No attack logs found — check dataset integrity")

## clean the log
import re

def clean_log(line):
    """Normalize a raw log line for text vectorization.

    Steps, in order:
      1. lower-case the line;
      2. replace every character that is neither a word character nor
         whitespace with a single space;
      3. replace every run of digits with the placeholder ``<NUM>``;
      4. strip leading and trailing whitespace.

    Punctuation is removed *before* the placeholder is inserted. In the
    previous order the angle brackets of ``<NUM>`` were themselves treated as
    punctuation and stripped, leaving a bare ``NUM`` surrounded by spaces.

    NOTE(review): a line made up only of numeric tokens collapses to a
    sequence of identical placeholders - confirm this is intended for the
    dataset in use.

    Args:
        line: Raw log line.

    Returns:
        The normalized line.
    """
    line = line.lower()
    line = re.sub(r"[^\w\s]", " ", line)
    line = re.sub(r"\d+", "<NUM>", line)
    return line.strip()

# Normalize both corpora with the same cleaning function so they share one
# token space.
normal_clean = [clean_log(line) for line in normal_logs]
attack_clean = [clean_log(line) for line in attack_logs]

# Guard the sample print: indexing an empty list would raise IndexError.
if normal_clean:
    print("Cleaned normal log:", normal_clean[0])
else:
    print("No cleaned normal logs available")
## TF-IDF feature extraction

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# TF-IDF over unigrams and bigrams, capped at 500 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500)

# The vocabulary is fitted on the normal logs only; the attack logs are then
# projected onto that same vocabulary. Both results are converted to dense
# arrays.
X_train = vectorizer.fit_transform(normal_clean).toarray()
X_attack = vectorizer.transform(attack_clean).toarray()

print("Train feature shape:", X_train.shape)
print("Attack feature shape:", X_attack.shape)
## log autoencoder

import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class LogAutoEncoder(nn.Module):
    """Fully connected autoencoder for fixed-width feature vectors.

    Layout: ``input_dim -> hidden_dim -> latent_dim -> hidden_dim ->
    input_dim``, with a ReLU after the first linear layer of the encoder and
    of the decoder. The final decoder layer has no activation.

    Args:
        input_dim: Width of the input (and reconstructed) feature vector.
        hidden_dim: Width of the intermediate layer in both halves.
            Defaults to 256.
        latent_dim: Width of the bottleneck produced by ``encoder``.
            Defaults to 64.

    Attributes:
        encoder: ``nn.Sequential`` mapping inputs to the latent code.
        decoder: ``nn.Sequential`` mapping the latent code back to inputs.
    """

    def __init__(self, input_dim, hidden_dim=256, latent_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        return self.decoder(z)
## training the log autoencoder

# Move the dense training features onto the selected device as float32.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)

# The autoencoder's input width equals the number of feature columns.
model = LogAutoEncoder(X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 15
batch_size = 128

# Mini-batch training: the model learns to reconstruct its own input.
# NOTE(review): batches are taken in the same fixed order every epoch, and
# the printed value is the sum of per-batch losses (not an average) -
# confirm both are intended.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for start in range(0, len(X_train_tensor), batch_size):
        batch = X_train_tensor[start:start + batch_size]

        optimizer.zero_grad()
        recon = model(batch)
        loss = criterion(recon, batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}] Loss: {total_loss:.4f}")

## anomaly scoring
def anomaly_score(model, X):
    """Return the per-row mean squared reconstruction error of ``X``.

    The model is switched to evaluation mode (and left in that mode), and the
    forward pass runs with gradient tracking disabled.

    Args:
        model: Module whose output for ``X`` has the same shape as ``X``.
        X: Input tensor; the error is averaged over ``dim=1``, so at least
            two dimensions are required.

    Returns:
        A NumPy array holding one score per row of ``X``.
    """
    model.eval()
    with torch.no_grad():
        residual = X - model(X)
        per_row_error = residual.pow(2).mean(dim=1)
    return per_row_error.cpu().numpy()

# Reconstruction error on the data the model was trained on.
train_scores = anomaly_score(model, X_train_tensor)

# Attack features get the same dtype/device treatment as the training set
# before being scored.
X_attack_tensor = torch.tensor(X_attack, dtype=torch.float32).to(device)
attack_scores = anomaly_score(model, X_attack_tensor)

print("Train scores (first 5):", train_scores[:5])
print("Attack scores (first 5):", attack_scores[:5])

## anomaly threshold
# Three standard deviations above the mean training reconstruction error.
threshold = train_scores.mean() + 3 * train_scores.std()
print("Anomaly threshold:", threshold)
## detect attacks
# Label each attack sample by comparing its score against the threshold.
predictions = np.where(attack_scores > threshold, "ANOMALY", "NORMAL").tolist()

print("Attack predictions (first 20):", predictions[:20])
## log features for fusion
# Latent codes of the training set, computed without gradient tracking and
# moved to the CPU.
with torch.no_grad():
    log_features = model.encoder(X_train_tensor).to("cpu")

print("Log latent feature shape:", log_features.shape)











