# 🛡️ Log Gözcüsü - AI Model Eğitimi

Bu notebook, log saldırı tespiti için DistilBERT modelini eğitir.

**Adımlar:**
1. Veri yükleme ve hazırlama
2. Model eğitimi
3. Değerlendirme
4. Model kaydetme (.pth)

In [None]:
# GPU check: shell out to nvidia-smi to show the NVIDIA GPU (if any) attached to this runtime
!nvidia-smi

In [None]:
# Install the libraries this notebook imports (-q keeps pip's output quiet)
!pip install -q transformers datasets torch scikit-learn

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import json
import numpy as np
from tqdm import tqdm

# Prefer the CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

## 1️⃣ Config

In [None]:
# Configuration: training hyperparameters
MODEL_NAME = "distilbert-base-uncased"  # pretrained checkpoint to fine-tune
MAX_SEQ_LENGTH = 256  # passed to the tokenizer as max_length
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 2e-5
DROPOUT = 0.3

# Class id -> class name; a name's position in the tuple is its class id.
LABEL_MAP = dict(enumerate((
    "benign",
    "sqli",
    "xss",
    "path_traversal",
    "command_injection",
    "bruteforce",
    "honeypot_trap",
    "other_attack",
)))
# Reverse lookup: class name -> class id.
LABEL_TO_ID = {name: class_id for class_id, name in LABEL_MAP.items()}
NUM_CLASSES = len(LABEL_MAP)

## 2️⃣ Veri Yükleme

**Veri dosyasını yükleyin:**
- Sol menüden 📁 Files'a tıklayın
- `train_data.jsonl` dosyasını sürükleyip bırakın

In [None]:
# Data-loading helper
def load_jsonl(file_path):
    """Read labelled log samples from a JSON Lines file.

    Each non-blank line is parsed as JSON. A line is kept only when it
    decodes to a JSON object that has both a ``'log'`` and a ``'label'`` key;
    blank lines, malformed JSON and any other JSON value (lists, strings,
    numbers, ...) are skipped without raising.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list of the accepted sample dicts, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    samples = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                sample = json.loads(line)
            except (json.JSONDecodeError, RecursionError):
                # Best-effort loader: a corrupt line must not abort the load.
                continue
            # Require a dict: `'log' in sample` is also true for a list or a
            # string that merely contains "log", which is not a usable sample.
            if isinstance(sample, dict) and 'log' in sample and 'label' in sample:
                samples.append(sample)
    return samples

# Veriyi yükle (dosya yolunu güncelleyin)
# data = load_jsonl('/content/train_data.jsonl')
# print(f"Toplam √∂rnek: {len(data)}")

In [None]:
# Demo: build a small synthetic dataset (for testing when no real data is available)
_DEMO_ROWS = (
    ('192.168.1.1 - - [10/Jan/2026:12:00:00] "GET /index.html HTTP/1.1" 200 5678', "benign"),
    ('10.0.0.5 - - [10/Jan/2026:12:01:00] "GET /api/users HTTP/1.1" 200 1234', "benign"),
    ('192.168.1.100 - - [10/Jan/2026:12:03:00] "GET /login?user=admin\' OR \'1\'=\'1 HTTP/1.1" 200 1234', "sqli"),
    ('10.0.0.50 - - [10/Jan/2026:12:04:00] "GET /product?id=1 UNION SELECT * FROM users HTTP/1.1" 200 3000', "sqli"),
    ('192.168.1.200 - - [10/Jan/2026:12:05:00] "GET /search?q=<script>alert(1)</script> HTTP/1.1" 200 500', "xss"),
    ('10.0.0.100 - - [10/Jan/2026:12:06:00] "GET /download?file=../../etc/passwd HTTP/1.1" 200 1500', "path_traversal"),
    ('172.16.0.50 - - [10/Jan/2026:12:07:00] "GET /ping?ip=127.0.0.1;cat /etc/shadow HTTP/1.1" 200 2000', "command_injection"),
    ('192.168.1.150 - - [10/Jan/2026:12:08:00] "POST /login HTTP/1.1" 401 100', "bruteforce"),
)
# Repeat the eight rows 100 times so the demo has 800 samples
demo_data = [
    {"log": log_line, "label": label_name}
    for log_line, label_name in _DEMO_ROWS
] * 100

data = demo_data  # Remove this line once real data has been loaded
print(f"Toplam √∂rnek: {len(data)}")

## 3️⃣ Dataset & DataLoader

In [None]:
class LogDataset(Dataset):
    """Dataset that tokenizes one labelled log line per item.

    Args:
        samples: Sequence of dicts with a ``'log'`` text and a ``'label'``
            class-name string.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            max_length=..., padding=..., return_tensors='pt')``; its result is
            indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, samples, tokenizer, max_length=256):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer output with its leading batch axis removed) and
            ``'label'`` (a scalar int64 tensor holding the class id).
        """
        sample = self.samples[idx]
        log_text = sample['log']
        # Label names are matched case-insensitively; names missing from
        # LABEL_TO_ID fall back to id 7, which LABEL_MAP maps to "other_attack".
        label = LABEL_TO_ID.get(sample['label'].lower(), 7)

        encoding = self.tokenizer(
            log_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # remove the sequence axis when max_length == 1, yielding 0-d tensors
        # that no longer batch into (batch, seq) inputs.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Train/validation split: hold out 20% with a fixed seed so the split is repeatable
train_data, val_data = train_test_split(data, random_state=42, test_size=0.2)
print(f"Train: {len(train_data)}, Val: {len(val_data)}")

# Wrap both splits in datasets that tokenize on access
train_dataset, val_dataset = (
    LogDataset(split, tokenizer, MAX_SEQ_LENGTH)
    for split in (train_data, val_data)
)

# Only the training loader shuffles; validation keeps the dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

## 4️⃣ Model Tanımı

In [None]:
class LogClassifier(nn.Module):
    """Pretrained DistilBERT encoder followed by dropout and a linear head.

    Args:
        num_classes: Number of output logits (one per class).
        dropout: Dropout probability applied to the pooled vector before the
            linear layer.
    """

    def __init__(self, num_classes=NUM_CLASSES, dropout=DROPOUT):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(dropout)
        # Take the head's input width from the encoder config instead of
        # hard-coding 768, so changing MODEL_NAME to a checkpoint with a
        # different hidden size keeps the shapes consistent.
        self.classifier = nn.Linear(self.bert.config.dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The linear head's output for each item in the batch.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled summary
        # (presumably the [CLS] token for this tokenizer).
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(pooled))

# Build the classifier, then move its weights to the selected device
model = LogClassifier()
model = model.to(device)
print(f"Model parametreleri: {sum(p.numel() for p in model.parameters()):,}")

## 5️⃣ Eğitim

In [None]:
# Optimizer, learning-rate schedule and loss
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_loader) * EPOCHS
# Warm up over 10% of the run, capped at 500 steps. A fixed 500-step warmup is
# longer than the whole run on small datasets (the 800-sample demo set trains
# for only 100 steps), so the learning rate would never reach LEARNING_RATE.
warmup_steps = min(500, total_steps // 10)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
criterion = nn.CrossEntropyLoss()

# Training loop
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; the evaluation cell loads 'log_classifier_best.pth' unconditionally.
best_val_acc = -1.0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # The schedule is defined per optimizer step, so advance it every batch
        scheduler.step()

        total_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the training loader is empty
    avg_loss = total_loss / max(len(train_loader), 1)

    # Validation: accuracy over the held-out split, without gradient tracking
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # An empty validation split reports 0.0 instead of dividing by zero
    val_acc = correct / total if total else 0.0
    print(f"üìä Epoch {epoch+1}: Loss={avg_loss:.4f}, Val Acc={val_acc:.4f}")

    # Keep only the weights of the best-scoring epoch so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'log_classifier_best.pth')
        print(f"‚úÖ En iyi model kaydedildi! Acc: {val_acc:.4f}")

## 6️⃣ Değerlendirme

In [None]:
# Load the best checkpoint written during training.
# map_location keeps the load working when the checkpoint was saved on a
# different device than the one currently in use.
model.load_state_dict(torch.load('log_classifier_best.pth', map_location=device))
model.eval()

all_preds = []
all_labels = []

# Collect predictions and ground-truth ids for the whole validation split
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

# Report
print("\nüìã Classification Report:")
# Pass `labels` explicitly: without it scikit-learn infers the classes from the
# data and raises ValueError when fewer classes occur than target_names has
# entries (the demo data contains only 6 of the 8 classes).
# zero_division=0 reports 0 for classes with no samples instead of warning.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(LABEL_MAP.keys()),
    target_names=list(LABEL_MAP.values()),
    zero_division=0,
))

## 7️⃣ Model Kaydetme

In [None]:
# Save the final model weights
torch.save(model.state_dict(), 'log_classifier.pth')
print("‚úÖ Model kaydedildi: log_classifier.pth")

# Offer the file as a browser download when running inside Google Colab.
# The google.colab package does not exist elsewhere; the weights are already
# on disk at that point, so skip the download instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    print("Colab ortamı bulunamadı; indirme atlandı.")
else:
    files.download('log_classifier.pth')

## 🎉 Tamamlandı!

**Sonraki adımlar:**
1. `log_classifier.pth` dosyasını indir
2. Log Gözcüsü projesine kopyala: `ai_model/log_classifier.pth`
3. `ajan.py`'daki API kodlarını kaldır
4. Test et!