In [1]:
import pandas as pd

df = pd.read_csv("../data/raw/threat_texts.csv")
df.head()


Unnamed: 0,text,label
0,Multiple failed SSH login attempts detected fr...,Bruteforce
1,Unusual traffic spike indicates possible DDoS ...,DDoS
2,SQL injection attempt detected on login page,WebAttack
3,Benign system update request completed success...,Benign
4,Malicious PowerShell execution observed on end...,Malware


In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])


In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_batch(texts, labels):
    enc = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )
    return enc['input_ids'], enc['attention_mask'], torch.tensor(labels)


In [9]:
import torch
import torch.nn as nn
from transformers import BertModel

class ThreatClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:,0,:]
        return self.fc(cls)


In [10]:
model = ThreatClassifier(num_labels=len(le.classes_))
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

input_ids, attention_mask, labels = encode_batch(df['text'], df['label_enc'])

epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    logits = model(input_ids, attention_mask)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/10, Loss: 1.8369
Epoch 2/10, Loss: 1.4191
Epoch 3/10, Loss: 1.1135
Epoch 4/10, Loss: 0.8371
Epoch 5/10, Loss: 0.6132
Epoch 6/10, Loss: 0.4303
Epoch 7/10, Loss: 0.2863
Epoch 8/10, Loss: 0.1997
Epoch 9/10, Loss: 0.1443
Epoch 10/10, Loss: 0.1066


In [21]:
import os
from transformers import BertConfig

save_path = "../models/bert_threat_classifier"
os.makedirs(save_path, exist_ok=True)

# 1. Save model weights
torch.save(model.state_dict(), os.path.join(save_path, "pytorch_model.bin"))

# 2. Save config so HF knows how many labels there are
config = BertConfig.from_pretrained("bert-base-uncased")
config.num_labels = len(le.classes_)
config.save_pretrained(save_path)

# 3. Save tokenizer
tokenizer.save_pretrained(save_path)

print("MODEL SAVED AT:", save_path)


MODEL SAVED AT: ../models/bert_threat_classifier


In [11]:
test = "Multiple login failures detected from IP 10.0.0.5"

ids, mask, _ = encode_batch([test], [0])
model.eval()
pred = model(ids, mask).argmax(dim=1)
print("Prediction:", le.inverse_transform([pred.item()])[0])


Prediction: Bruteforce


In [19]:
from src.nlp_engine.ioc_extractor import extract_iocs

extract_iocs("Attack from 192.168.1.50 contacting https://bad.com exploiting CVE-2023-23397")




{'ips': ['192.168.1.50'],
 'urls': ['https://bad.com'],
 'domains': ['bad.com'],
 'hashes': [],
 'cves': ['CVE-2023-23397'],
 'emails': []}

In [24]:
from src.nlp_engine.summariser import summarise_text

long_text = """
Multiple unauthorized login attempts were detected from IP 10.0.0.50.
After the brute force attempts, the attacker successfully authenticated
using compromised credentials. Shortly after, a suspicious PowerShell
script was executed which downloaded additional payloads from a remote server.
"""

summarise_text(long_text)


'multiple unauthorized login attempts were detected from IP 10.0.0.50. the attacker successfully authenticated using compromised credentials.'

In [26]:
from src.nlp_engine.fusion_engine import fusion_engine

test_log = """
Multiple unauthorized SSH login failures from 10.0.0.45.
User root was accessed after 57 attempts.
Downloaded file from http://evil-server.net/backdoor.exe
SHA256: 9f86d081884c7d659a2feaa0c55ad015
Exploit: CVE-2023-23397 detected.
"""

fusion_engine(test_log)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /Users/abdullahmehmood/Desktop/Project/ai-threat-intelligence-system/models/bert_threat_classifier and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'threat_type': 2,
 'iocs': {'ips': ['10.0.0.45'],
  'urls': ['http://evil-server.net/backdoor.exe'],
  'domains': ['backdoor.exe', 'evil-server.net'],
  'hashes': ['9f86d081884c7d659a2feaa0c55ad015'],
  'cves': ['CVE-2023-23397'],
  'emails': []},
 'summary': 'multiple unauthorized SSH login failures from 10.0.0.45. User root was accessed after 57 attempts.',
 'anomaly_score': 0.6590499271758975}