In [None]:
import os
import zipfile
import pandas as pd

# Paths
ZIP_PATH = "loghub-master.zip"
EXTRACT_DIR = "loghub_extracted"
OUTPUT_CSV = "log_data.csv"

# ----------------------------
# 1️⃣ Extract ZIP
# ----------------------------

with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)

# ----------------------------
# 2️⃣ Build dataset
# ----------------------------
# Each non-empty line of every matching file becomes one [log_text, label] row.
all_rows = []

valid_extensions = (".log", ".txt", ".out", ".json", ".csv", ".xml")

for root, dirs, files in os.walk(EXTRACT_DIR):
    for file in files:
        if file.endswith(valid_extensions):
            file_path = os.path.join(root, file)

            # The label is the lowercased name of the directory that contains the file.
            label = os.path.basename(root).lower()

            try:
                # Explicit UTF-8 so decoding does not depend on the platform default;
                # undecodable bytes are dropped (errors="ignore") instead of raising.
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    for line in f:
                        line = line.strip()
                        if line:
                            all_rows.append([line, label])
            except OSError as err:
                # Best effort: an unreadable file is reported and skipped. Only I/O errors are
                # caught, so KeyboardInterrupt and programming errors are no longer swallowed.
                print(f"Skipping {file_path}: {err}")
                continue

# ----------------------------
# 3️⃣ Save to CSV
# ----------------------------
df = pd.DataFrame(all_rows, columns=["log_text", "label"])
df.to_csv(OUTPUT_CSV, index=False)

print("Dataset created:", len(df), "rows")
print("Unique labels:", df["label"].unique())


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast

# Read the labelled log lines from the CSV
df = pd.read_csv("/content/drive/MyDrive/Hackathon/log_data.csv")

# 80/20 split: sample the training rows (fixed seed), every remaining row goes to the test set
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

# Wrap both frames as Hugging Face datasets (train first, then test)
train_ds, test_ds = (Dataset.from_pandas(_frame) for _frame in (train_df, test_df))

# Pretrained fast tokenizer used by tokenize() below
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the ``log_text`` entries of ``batch`` with the global ``tokenizer``.

    Every sequence is truncated and padded to a fixed length of 64 tokens.
    Returns whatever the tokenizer call returns.
    """
    texts = batch["log_text"]
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(texts, **encoding_options)

# Tokenize both splits in batches (train first, then test)
train_tokenized, test_tokenized = (
    _split.map(tokenize, batched=True) for _split in (train_ds, test_ds)
)

# Rename labels
train_tokenized, test_tokenized = (
    _split.rename_column("label", "labels")
    for _split in (train_tokenized, test_tokenized)
)

# Convert to torch format, exposing only the listed columns
_torch_columns = ("input_ids", "attention_mask", "labels")
for _split in (train_tokenized, test_tokenized):
    _split.set_format(type="torch", columns=list(_torch_columns))

print("Tokenization complete!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/52643 [00:00<?, ? examples/s]

Map:   0%|          | 0/13161 [00:00<?, ? examples/s]

Tokenization complete!


In [3]:
# Show the columns carried by each tokenized split (train, then test)
for _split in (train_tokenized, test_tokenized):
    print(_split.column_names)


['log_text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']
['log_text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [4]:
# Drop the "__index_level_0__" column shown in the column listing above.
# Guarded so the cell can be re-run safely: removing a column that is already gone raises.
if "__index_level_0__" in train_tokenized.column_names:
    train_tokenized = train_tokenized.remove_columns(["__index_level_0__"])

if "__index_level_0__" in test_tokenized.column_names:
    test_tokenized = test_tokenized.remove_columns(["__index_level_0__"])


In [5]:
# Re-apply the torch format, this time also exposing token_type_ids
_model_columns = ("input_ids", "attention_mask", "token_type_ids", "labels")
for _split in (train_tokenized, test_tokenized):
    _split.set_format(type="torch", columns=list(_model_columns))


In [6]:
# Label <-> integer id lookups; ids follow the sorted order of the distinct labels
unique_labels = sorted(df["label"].unique())
id_to_label = dict(enumerate(unique_labels))
label_to_id = {name: idx for idx, name in id_to_label.items()}

print(label_to_id)


{'Android': 0, 'Apache': 1, 'BGL': 2, 'HDFS': 3, 'HPC': 4, 'Hadoop': 5, 'HealthApp': 6, 'Linux': 7, 'Mac': 8, 'OpenSSH': 9, 'OpenStack': 10, 'Proxifier': 11, 'Spark': 12, 'Thunderbird': 13, 'Windows': 14, 'Zookeeper': 15}


In [7]:
# Add an integer class-id column by looking each label up in label_to_id.
# Labels missing from the mapping would become NaN (Series.map semantics).
df["label_id"] = df["label"].map(label_to_id)


In [8]:
from datasets import Dataset

# Keep only the text and integer-label columns, then shuffle with a fixed seed
dataset = Dataset.from_pandas(df[["log_text", "label_id"]]).shuffle(seed=42)


In [9]:
# Split dataset into train and test randomly
from datasets import Dataset

# Seeded 80/20 train/test split
train_test = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_ds, test_ds = train_test["train"], train_test["test"]

# Shuffled subset of the training split, capped at 25000 rows
train_ds_small = train_ds.shuffle(seed=42).select(range(min(len(train_ds), 25000)))

print(f"Original train size: {len(train_ds)}")
print(f"Subset train size: {len(train_ds_small)}")



Original train size: 52643
Subset train size: 25000


In [10]:
def tokenize(example):
    """Tokenize ``example["log_text"]`` with whatever the global ``tokenizer`` is bound to.

    Sequences are truncated and padded to a fixed length of 64 tokens.
    Returns the tokenizer's output unchanged.
    """
    texts = example["log_text"]
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(texts, **encoding_options)

# Batched tokenization of the full train split, then the test split
train_tokenized, test_tokenized = (
    _split.map(tokenize, batched=True) for _split in (train_ds, test_ds)
)


Map:   0%|          | 0/52643 [00:00<?, ? examples/s]

Map:   0%|          | 0/13161 [00:00<?, ? examples/s]

In [11]:
# Rename the integer label column from "label_id" to "labels" on both splits
train_tokenized, test_tokenized = (
    _split.rename_column("label_id", "labels")
    for _split in (train_tokenized, test_tokenized)
)


In [12]:
def _drop_aux_columns(ds):
    """Return ``ds`` without ``__index_level_0__`` (only if present) and without ``log_text``."""
    if "__index_level_0__" in ds.column_names:
        ds = ds.remove_columns(["__index_level_0__"])
    return ds.remove_columns(["log_text"])


# Remove the leftover index column (when there is one) and the raw text from both splits
train_tokenized = _drop_aux_columns(train_tokenized)
test_tokenized = _drop_aux_columns(test_tokenized)


In [13]:
# Switch both splits to torch output format (all remaining columns)
for _split in (train_tokenized, test_tokenized):
    _split.set_format(type="torch")


In [14]:
# Number of distinct labels; passed as num_labels when the classifier is created
num_labels = len(unique_labels)
print(num_labels)

16


In [15]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# NOTE: rebinding `tokenizer` here does not re-tokenize train_tokenized / test_tokenized;
# those were produced earlier with the tokenizer that was bound at that time.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Sequence classifier with one output per label. The label names are stored in the model
# config (id2label / label2id) so a saved checkpoint carries its own class-id -> name mapping
# instead of depending on this notebook's `id_to_label` variable.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainingArguments, Trainer
# Training configuration: 5 epochs, batch size 32 per device, fp16 enabled,
# loss logged every 200 steps, no evaluation, checkpointing or external reporting.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Hackathon/log_classifier",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    fp16=True,
    logging_steps=200,
    eval_strategy="no",
    save_strategy="no",
    report_to="none",
)

# The test split is attached as eval_dataset, but eval_strategy="no" means
# it is not evaluated automatically during training.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_tokenized,
    train_dataset=train_tokenized,
)


In [17]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()


Step,Training Loss
200,0.55
400,0.0644
600,0.0497
800,0.0428
1000,0.039
1200,0.0339
1400,0.0302
1600,0.0339
1800,0.0155
2000,0.0159


TrainOutput(global_step=8230, training_loss=0.02580452621942737, metrics={'train_runtime': 498.7704, 'train_samples_per_second': 527.728, 'train_steps_per_second': 16.501, 'total_flos': 4359513960867840.0, 'train_loss': 0.02580452621942737, 'epoch': 5.0})

In [39]:
from transformers import DistilBertTokenizerFast

save_path = "/content/drive/MyDrive/Hackathon/log_classifier_final"

# Persist the fine-tuned model. Training runs with save_strategy="no", so without this call
# nothing is written to save_path and loading the model from that directory fails on a
# fresh Restart & Run All.
trainer.save_model(save_path)

# Force saving the correct tokenizer again
correct_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
correct_tokenizer.save_pretrained(save_path)

print("Correct DistilBert tokenizer saved!")


Correct DistilBert tokenizer saved!


In [40]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch

load_path = "/content/drive/MyDrive/Hackathon/log_classifier_final"

# Restore tokenizer and classifier from the same directory;
# .eval() returns the module itself, so the model is switched to inference mode in one step
tokenizer = DistilBertTokenizerFast.from_pretrained(load_path)
model = DistilBertForSequenceClassification.from_pretrained(load_path).eval()

print("Model + tokenizer loaded correctly!")


Model + tokenizer loaded correctly!


In [58]:
def predict_log(log_text):
    """Classify a single log line and return the predicted label name.

    Uses the global ``tokenizer``, ``model`` and ``id_to_label``.

    Args:
        log_text: One log line as a string. The prediction is read with ``.item()``,
            so exactly one sequence is expected.

    Returns:
        The entry of ``id_to_label`` for the highest-scoring class.

    Raises:
        KeyError: If the predicted class id is not a key of ``id_to_label``.
    """
    # Same truncation / fixed-length padding (64 tokens) as used when tokenizing the training data
    inputs = tokenizer(
        log_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=64
    )
    # Put the encoded tensors on the model's device; without this, calling the function
    # while the model lives on a GPU fails with a device-mismatch error.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    pred_id = outputs.logits.argmax(dim=1).item()
    return id_to_label[pred_id]

print(predict_log("session failed for user test by (uid=509)"))


Linux


In [60]:
# Spot check: classify a hand-written log line
print(predict_log("Read out cached package applicability for package"))

Windows


In [71]:
# Spot check: a reworded variant of the previous sample line
print(predict_log("Read out cached applicability from TiLight for package "))

Hadoop


In [72]:
# Spot check: classify another hand-written log line
print(predict_log("Server doesnt support encryption"))

Linux
