# 🚀 DistilBERT NER Training for Log Intelligence
This notebook trains a lightweight **DistilBERT log-specific NER model** that identifies:

- USER
- SRC_IP
- DEST_IP
- PROCESS
- HOST
- PORT
- ACTION

The trained model will be used as a **fallback** inside your NLP pipeline when regex and rule-based extraction fail.

Model is saved to:

```
nlp/bert/model/
  config.json
  pytorch_model.bin
  tokenizer.json
  vocab.txt
```

---

In [1]:
import os
import json
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Report whether PyTorch can see a CUDA-capable GPU in this environment.
print("CUDA available:", torch.cuda.is_available())

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'datasets'

## 📥 Load Training Data
Your raw labeled data should be located here:

```
nlp/data/raw/train_logs.txt
nlp/data/raw/test_logs.txt
```

Each line should contain a single token followed by its BIO tag, separated by whitespace, with a blank line between log entries. The tags can be annotated manually or generated by a preprocessing script.

Format example:

```
Invalid O
user O
admin B-USER
from O
122.225.109.208 B-SRC_IP
port O
443 B-PORT
```


In [None]:
# Locations of the raw BIO-tagged train/test files (relative to this notebook).
RAW_DATA_DIR = "../data/raw"
train_path = f"{RAW_DATA_DIR}/train_logs.txt"
test_path = f"{RAW_DATA_DIR}/test_logs.txt"

def load_bio_file(path):
    """Read a BIO-tagged text file into parallel token and tag sequences.

    The file is expected to hold one ``token tag`` pair per line (separated by
    whitespace), with a blank line between sentences.

    Args:
        path: Location of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of equal-length lists.
        ``sentences[i]`` is the list of tokens of the i-th sentence and
        ``labels[i]`` is the matching list of tags.

    Lines that do not split into exactly two whitespace-separated fields are
    skipped; the number of skipped lines is printed so that silent data loss
    is visible in the notebook output.
    """
    sentences = []
    labels = []
    words = []
    tags = []
    skipped = 0

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()

            if not fields:
                # Blank (or whitespace-only) line marks a sentence boundary.
                if words:
                    sentences.append(words)
                    labels.append(tags)
                words = []
                tags = []
                continue

            if len(fields) != 2:
                # Not a "token tag" pair; drop both sides to keep them aligned.
                skipped += 1
                continue

            word, tag = fields
            words.append(word)
            tags.append(tag)

    # Flush the last sentence: files frequently end without a trailing blank
    # line, and the boundary branch above would never run for it.
    if words:
        sentences.append(words)
        labels.append(tags)

    if skipped:
        print(f"load_bio_file: skipped {skipped} malformed line(s) in {path}")

    return sentences, labels

train_sentences, train_labels = load_bio_file(train_path)
test_sentences, test_labels = load_bio_file(test_path)

# Fail fast with a clear message: an empty split cannot be trained or evaluated on.
for split_name, split_sentences in (("train", train_sentences), ("test", test_sentences)):
    if not split_sentences:
        raise ValueError(
            f"No sentences were loaded for the {split_name} split; "
            "check the file path and the 'token tag' line format."
        )

print("Loaded training samples:", len(train_sentences))
print("Loaded test samples:", len(test_sentences))

In [None]:
label_map_path = "../bert/utils/label_map.json"
with open(label_map_path, "r", encoding="utf-8") as f:
    label_map = json.load(f)

# The JSON file maps ids to label names; JSON object keys are always strings,
# so they are cast back to int here.
id2label = {int(k): v for k, v in label_map.items()}
label2id = {v: k for k, v in id2label.items()}

num_labels = len(id2label)

# The ids are used as class indices for a head with `num_labels` outputs, so
# they must be exactly 0..num_labels-1 and every label name must be unique.
if sorted(id2label) != list(range(num_labels)):
    raise ValueError(
        f"Label ids in {label_map_path} must be contiguous 0..{num_labels - 1}, "
        f"got {sorted(id2label)}"
    )
if len(label2id) != num_labels:
    raise ValueError(f"Duplicate label names found in {label_map_path}")

# Ordered by id (not by JSON insertion order) so label_list[i] == id2label[i].
label_list = [id2label[i] for i in range(num_labels)]

print("Labels:", label_list)
print("# labels:", num_labels)

In [None]:
# Tokenizer for the same DistilBERT checkpoint that the model cell loads.
BASE_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

def tokenize_and_align(sentences, labels, label_all_tokens=False):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Uses the notebook-level ``tokenizer`` and ``label2id`` objects.

    Args:
        sentences: List of sentences, each a list of word strings.
        labels: List of tag sequences parallel to ``sentences`` (one tag
            string per word).
        label_all_tokens: When False (default) only the first sub-word token
            of each word carries the word's label id and the remaining pieces
            get -100, so a ``B-`` tag is not repeated across the pieces of one
            word. When True every piece receives the word's label id.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label-id list per sentence. Positions that do not map to a
        word (``word_id is None``) are always labelled -100.

    Raises:
        KeyError: If a tag in ``labels`` is not present in ``label2id``.
    """
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True
    )

    new_labels = []
    for i, label_seq in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned = []

        prev_word = None
        for word_id in word_ids:
            if word_id is None:
                # Position with no source word (e.g. special tokens).
                aligned.append(-100)
            elif word_id != prev_word or label_all_tokens:
                tag = label_seq[word_id]
                if tag not in label2id:
                    raise KeyError(
                        f"Tag {tag!r} (sentence {i}, word {word_id}) is not in the label map"
                    )
                aligned.append(label2id[tag])
            else:
                # Continuation piece of the previous word: mask it out.
                aligned.append(-100)
            prev_word = word_id

        new_labels.append(aligned)

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

# Build one HuggingFace Dataset per split, then bundle them together.
train_encodings = tokenize_and_align(train_sentences, train_labels)
train_dataset = Dataset.from_dict(train_encodings)

test_encodings = tokenize_and_align(test_sentences, test_labels)
test_dataset = Dataset.from_dict(test_encodings)

splits = {"train": train_dataset, "test": test_dataset}
data = DatasetDict(splits)

In [None]:
# Pretrained DistilBERT with a token-classification head sized to the label map.
label_config = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    **label_config
)

# Collator handed to the Trainer for batching the tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword was later removed, so passing it unconditionally
# raises TypeError there. Use whichever field the installed TrainingArguments
# dataclass declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="../data/output/",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    logging_steps=10,
    weight_decay=0.01,
    report_to="none",
    # Evaluate once per epoch, matching the checkpoint save cadence above.
    **{eval_strategy_arg: "epoch"},
)

In [None]:
# Gather everything the Trainer needs: model, hyper-parameters, both splits,
# the tokenizer and the batching collator.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# The notebook header documents nlp/bert/model/ as the save location. The other
# relative paths used here ("../data/raw/...", "../bert/utils/...") resolve
# against the nlp/ folder, so the matching relative path is "../bert/model/"
# (the previous "../model/" pointed at nlp/model/ instead).
# NOTE(review): assumes the notebook runs from a direct child of nlp/ - confirm.
save_path = "../bert/model/"

# Save the fine-tuned weights/config and the tokenizer files side by side so
# the directory can be loaded as a single unit.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to", save_path)

In [None]:
# Evaluate on the eval dataset configured on the trainer (the "test" split);
# the bare expression below displays the result as the cell output.
metrics = trainer.evaluate()
metrics

In [None]:
# Persist the evaluation metrics as pretty-printed JSON next to the training output.
with open("../data/output/metrics.json", "w") as metrics_file:
    metrics_file.write(json.dumps(metrics, indent=4))

metrics