# System Logs Anomaly Detection using Fine-Tuned LLMs

A fine-tuned LLMs to classify logs as 'normal' or 'anomalous'.

---

## Install Dependencies

In [63]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers datasets evaluate scikit-learn pandas tqdm peft

Looking in indexes: https://download.pytorch.org/whl/cu118



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


---

## Data Loading & Preprocessing

#### 1. Load log data

In [64]:
import pandas as pd

# Ignore warning
pd.set_option('future.no_silent_downcasting', True)

logs_df = pd.read_csv("../data/logs.csv")
labels_df = pd.read_csv("../data/labels.csv")

print(f"Log entries: {len(logs_df)}")

Log entries: 104815


In [65]:
logs_df.head(1)

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>


In [66]:
labels_df.head(1)

Unnamed: 0,BlockId,Label
0,blk_-1608999687919862906,Normal


---

#### 2. Extract block_id from the content and add it as a new field

In [67]:
logs_df["BlockId"] = logs_df["Content"].str.extract(r'(blk_-?\d+)')
logs_df.head(1)

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,BlockId
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>,blk_-1608999687919862906


#### 3. Merge logs with its label ('Normal' or 'Anomaly')

In [68]:
new_logs_df = pd.merge(logs_df, labels_df, on="BlockId")
new_logs_df.head(1)

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,BlockId,Label
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>,blk_-1608999687919862906,Normal


#### 4. Map 'Normal' to '1' & 'Anomaly' to '0'

In [69]:
new_logs_df["Label"] = new_logs_df["Label"].replace({'Normal': 1, 'Anomaly': 0})
new_logs_df.head(1)

Unnamed: 0,LineId,Date,Time,Pid,Level,Component,Content,EventId,EventTemplate,BlockId,Label
0,1,81109,203518,143,INFO,dfs.DataNode$DataXceiver,Receiving block blk_-1608999687919862906 src: ...,E5,Receiving block <*> src: /<*> dest: /<*>,blk_-1608999687919862906,1


#### 5. Split dataset: Training & Test

In [70]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    new_logs_df,
    test_size=0.2,
    random_state=42,
    stratify=new_logs_df["Label"]
)

print(f"Training Split: {len(train_df)} | Test Split: {len(test_df)}")

Training Split: 83852 | Test Split: 20963


---

## Initialize Models 
*Using Lightweight models for training convenience. Will change it in the later stage of project*

In [71]:
from transformers import DistilBertModel, GPT2LMHeadModel, GPT2Tokenizer

#### 1. Initialize encoder

In [72]:
# Lightweight encoder model
# TODO: Change it later??
encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
encoder_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

#### 2. Initialize decoder

In [73]:
# Lightweight decorder model
# TODO: Change it later??
decoder = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda")
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

---

## Prepare Dataset

#### 1. Encoder dataset

In [74]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[["EventTemplate", "Label"]])
test_dataset = Dataset.from_pandas(test_df[["EventTemplate", "Label"]])

def get_embeddings(batch):
    inputs = encoder_tokenizer(
        batch["EventTemplate"], 
        padding="max_length", 
        truncation=True, 
        max_length=128, 
        return_tensors="pt"
    )
    with torch.no_grad():
        outputs = encoder(**inputs)
    return {"embeddings": outputs.last_hidden_state.mean(dim=1).cpu().numpy()}

train_dataset = train_dataset.map(get_embeddings, batched=True, batch_size=32)
test_dataset = test_dataset.map(get_embeddings, batched=True, batch_size=32)

Map: 100%|██████████████████████████████████████████████████████████████| 83852/83852 [1:42:18<00:00, 13.66 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 20963/20963 [25:35<00:00, 13.65 examples/s]


#### 2. Decoder Dataset

In [75]:
def prepare_decoder_data(data):
    prompts = [
        f"""Analyze this log:
        Log: {log}
        Is this anomalous? Answer YES or NO:"""
        for log in data["EventTemplate"]
    ]
    tokenized = decoder_tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"].clone()
    }

train_dataset = train_dataset.map(prepare_decoder_data, batched=True, batch_size=8)
test_dataset = test_dataset.map(prepare_decoder_data, batched=True, batch_size=8)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████████████████████████████████████████████████████████| 83852/83852 [00:25<00:00, 3305.55 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 20963/20963 [00:06<00:00, 3246.05 examples/s]


---

## Fine tune decoder

In [85]:
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from safetensors.torch import save_file
import os

#### 1. Workaround for GPU efficiency

In [86]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

decoder = get_peft_model(decoder, peft_config)

#### 2. Train model

In [87]:
training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="no",
    fp16=True,
    logging_steps=50,
    remove_unused_columns=False
)

trainer = Trainer(
    model=decoder,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0197,0.00747
2,0.0114,0.007112


TrainOutput(global_step=15720, training_loss=0.06564609595401445, metrics={'train_runtime': 4271.7164, 'train_samples_per_second': 58.889, 'train_steps_per_second': 3.68, 'total_flos': 4.788400406239642e+16, 'train_loss': 0.06564609595401445, 'epoch': 2.99947526594476})

#### 4. Save model

In [90]:
decoder.save_pretrained("./log_anomaly", safe_serialization=True)