## 1. Setup - Colab Kontrolü ve Proje Kurulumu

In [None]:
import sys
import os

# Colab kontrol√º
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("‚úÖ Google Colab ortamƒ± tespit edildi")

    # GPU kontrol√º
    import torch
    if torch.cuda.is_available():
        print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
        print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("‚ö†Ô∏è  GPU bulunamadƒ±! Runtime > Change runtime type > GPU se√ßin")

    # GitHub'dan projeyi klonla
    print("\nüì• Proje indiriliyor...")
    !git clone https://github.com/Aliekinozcetin/Mitre_Attack_TTP_Mapping.git

    # Proje dizinine ge√ß
    os.chdir('Mitre_Attack_TTP_Mapping')
    print(f"‚úÖ √áalƒ±≈üma dizini: {os.getcwd()}")

    # Gerekli paketleri y√ºkle (sadece temel ML paketleri, Jupyter paketleri hari√ß)
    print("\nüì¶ Paketler y√ºkleniyor...")
    !pip install -q torch transformers datasets scikit-learn pandas tqdm wandb matplotlib seaborn
    print("‚úÖ T√ºm paketler y√ºklendi")
else:
    print("‚ÑπÔ∏è  Yerel ortamda √ßalƒ±≈üƒ±yorsunuz")

## 2. Import Modüller

In [None]:
import torch
import numpy as np
import json
from datetime import datetime

from src.data_loader import prepare_data
from src.model import load_model
from src.train import train_model
from src.evaluate import evaluate_model

# Confirm that the imports above succeeded and report the PyTorch / CUDA setup.
print(
    "‚úÖ Mod√ºller y√ºklendi",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## 3. Konfigürasyon

In [None]:
# Training parameters shared by the data, model, training and evaluation cells.
CONFIG = {
    # Alternatives: 'jackaduma/SecBERT', 'distilbert-base-uncased'
    "model_name": "bert-base-uncased",
    "max_length": 512,
    # Can be raised to 32 when a GPU is available
    "batch_size": 16,
    "learning_rate": 2e-5,
    "num_epochs": 3,
    "warmup_steps": 500,
    "threshold": 0.5,
    # Prefer the GPU whenever PyTorch can see one
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "output_dir": "./outputs",
}

# Print the configuration as an aligned "key: value" table between two rulers.
print("\n" + "=" * 70, "TRAINING CONFIGURATION", "=" * 70, sep="\n")
print("\n".join(f"  {key:20s}: {value}" for key, value in CONFIG.items()))
print("=" * 70)

## 4. Veri Yükleme ve Hazırlama

In [None]:
# Step 1: build the train/test datasets for the configured model.
print("\n" + "=" * 70, "STEP 1: DATA PREPARATION", "=" * 70 + "\n", sep="\n")

data = prepare_data(model_name=CONFIG['model_name'], max_length=CONFIG['max_length'])

# Pull out the four entries of the result that the later cells rely on.
train_dataset, test_dataset, label_list, num_labels = (
    data[field]
    for field in ('train_dataset', 'test_dataset', 'label_list', 'num_labels')
)

# Short size report for a quick sanity check.
print(
    f"\n‚úÖ Veri hazƒ±rlama tamamlandƒ±!",
    f"   Train samples: {len(train_dataset)}",
    f"   Test samples: {len(test_dataset)}",
    f"   Number of labels: {num_labels}",
    sep="\n",
)

## 5. Model Yükleme

In [None]:
# Step 2: instantiate the classifier with one output per label.
print("\n" + "=" * 70, "STEP 2: MODEL INITIALIZATION", "=" * 70 + "\n", sep="\n")

# The target device is passed to load_model; the message below reports it.
model = load_model(
    device=CONFIG['device'],
    num_labels=num_labels,
    model_name=CONFIG['model_name'],
)

print(f"\n‚úÖ Model y√ºklendi ve {CONFIG['device']} cihazƒ±na ta≈üƒ±ndƒ±!")

## 6. Model Eğitimi

In [None]:
# Step 3: train the model and persist labels, history and the loss plot.
print("\n" + "=" * 70, "STEP 3: MODEL TRAINING", "=" * 70 + "\n", sep="\n")

# Every run writes into its own timestamped folder below CONFIG['output_dir'].
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# '/' in the model name (e.g. 'jackaduma/SecBERT') is replaced so the run
# name stays a single path component.
run_name = CONFIG['model_name'].replace('/', '_') + "_" + timestamp
output_dir = os.path.join(CONFIG['output_dir'], run_name)
os.makedirs(output_dir, exist_ok=True)

# Save the label list alongside the other run artifacts.
label_file = os.path.join(output_dir, "labels.json")
with open(label_file, 'w') as f:
    json.dump(label_list, f, indent=2)

# Start training; the hyper-parameters are forwarded from CONFIG under the
# same names they have there.
history = train_model(
    model=model,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    output_dir=output_dir,
    device=CONFIG['device'],
    **{
        option: CONFIG[option]
        for option in ('batch_size', 'learning_rate', 'num_epochs', 'warmup_steps')
    },
)

# Save the training history.
history_file = os.path.join(output_dir, "training_history.json")
with open(history_file, 'w') as f:
    json.dump(history, f, indent=2)

print(f"\n‚úÖ Eƒüitim tamamlandƒ±!")
print(f"   Final train loss: {history['train_loss'][-1]:.4f}")
if 'val_loss' in history:
    print(f"   Final val loss: {history['val_loss'][-1]:.4f}")

# Loss plot: the training curve always, the validation curve only if recorded.
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 5))
for series, legend_label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    if series in history:
        plt.plot(history[series], label=legend_label)
plt.title('Training History')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.savefig(os.path.join(output_dir, 'training_loss.png'))
plt.show()

print(f"\nüìä Loss grafiƒüi kaydedildi: training_loss.png")

## 7. Model Değerlendirme

In [None]:
# Step 4: inspect raw model outputs, sweep a few decision thresholds and
# save the metrics of the best one.
print("\n" + "="*70)
print("STEP 4: MODEL EVALUATION")
print("="*70 + "\n")

# First check the model outputs on a single batch.
print("üîç Model √ßƒ±ktƒ±larƒ±nƒ± kontrol ediyorum...")
model.eval()
sample_batch = next(iter(torch.utils.data.DataLoader(test_dataset, batch_size=16)))
with torch.no_grad():
    sample_out = model(
        input_ids=sample_batch['input_ids'].to(CONFIG['device']),
        attention_mask=sample_batch['attention_mask'].to(CONFIG['device'])
    )
    sample_probs = torch.sigmoid(sample_out['logits'])

print(f"\nSample sigmoid outputs:")
print(f"  Min: {sample_probs.min().item():.6f}")
print(f"  Max: {sample_probs.max().item():.6f}")
print(f"  Mean: {sample_probs.mean().item():.6f}")
print(f"  Median: {sample_probs.median().item():.6f}")

# Automatic threshold suggestion: the median sigmoid output of the sample batch.
optimal_threshold = float(sample_probs.median().item())
print(f"\nüí° √ñnerilen threshold: {optimal_threshold:.4f}")

# Evaluate with several threshold values.
# NOTE(review): the threshold is selected on test_dataset itself, so the
# reported best score is optimistic - confirm this is intended.
thresholds_to_test = [0.1, 0.2, 0.3, optimal_threshold, 0.5]
print(f"\nüìä Farklƒ± threshold deƒüerleri test ediliyor...")

best_f1 = 0
best_threshold = 0.5
best_metrics = None

for thresh in thresholds_to_test:
    metrics = evaluate_model(
        model=model,
        test_dataset=test_dataset,
        batch_size=CONFIG['batch_size'],
        device=CONFIG['device'],
        threshold=thresh,
        label_list=label_list
    )

    # Always accept the first result. If every threshold scores a micro-F1 of
    # 0, a strict ">" against the initial 0 would leave best_metrics as None
    # and the code below would crash on best_metrics.items().
    if best_metrics is None or metrics['micro_f1'] > best_f1:
        best_f1 = metrics['micro_f1']
        best_threshold = thresh
        best_metrics = metrics

print(f"\nüèÜ En iyi threshold: {best_threshold:.4f}")
print(f"   Micro F1: {best_f1:.4f}")

# Save the best metrics. Raw predictions/labels are left out, and numeric
# scalars are coerced to float. NumPy scalar types are included because e.g.
# np.float32 is not a Python float and json.dump cannot serialize it.
metrics_to_save = {k: float(v) if isinstance(v, (float, int, np.floating, np.integer)) else v
                   for k, v in best_metrics.items()
                   if k not in ['predictions', 'labels']}
metrics_to_save['best_threshold'] = best_threshold

metrics_file = os.path.join(output_dir, "evaluation_metrics.json")
with open(metrics_file, 'w') as f:
    json.dump(metrics_to_save, f, indent=2)

print(f"\n‚úÖ Deƒüerlendirme tamamlandƒ±!")
print(f"\nFinal Metrics (threshold={best_threshold:.4f}):")
print(f"  Micro F1:    {best_metrics['micro_f1']:.4f}")
print(f"  Macro F1:    {best_metrics['macro_f1']:.4f}")
print(f"  Samples F1:  {best_metrics['samples_f1']:.4f}")

## 8. Sonuçları Kaydet ve İndir

In [None]:
# Build the summary file: model, configuration, dataset sizes, final training
# loss and the saved evaluation metrics in one JSON document.
summary = {
    "model": CONFIG["model_name"],
    "timestamp": timestamp,
    "configuration": CONFIG,
    "data": {
        "num_labels": num_labels,
        "train_samples": len(train_dataset),
        "test_samples": len(test_dataset),
    },
    "training": {"final_loss": history["train_loss"][-1]},
    "evaluation": metrics_to_save,
}

summary_file = os.path.join(output_dir, "summary.json")
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

# Closing banner followed by the list of files reported for this run.
print("\n" + "=" * 70, "PIPELINE COMPLETE!", "=" * 70, sep="\n")
print(f"\nSonu√ßlar kaydedildi: {output_dir}")
print("\nDosyalar:")
print(
    f"  - labels.json",
    f"  - training_history.json",
    f"  - evaluation_metrics.json",
    f"  - summary.json",
    f"  - final_model.pt",
    f"  - checkpoint_epoch_*.pt",
    sep="\n",
)
print("=" * 70)

## 9. Colab'da Sonuçları İndir

In [None]:
# Offer the run folder as a ZIP download, but only when running on Colab.
if not IN_COLAB:
    print("‚ÑπÔ∏è  Yerel ortamdasƒ±nƒ±z, sonu√ßlar zaten bilgisayarƒ±nƒ±zda.")
else:
    import shutil
    from google.colab import files

    # Zip the results; the archive is then read back as "<run_name>.zip".
    shutil.make_archive(run_name, 'zip', output_dir)
    zip_name = f"{run_name}.zip"

    print(f"\nüì¶ Sonu√ßlar sƒ±kƒ±≈ütƒ±rƒ±lƒ±yor: {zip_name}")
    print(f"   Boyut: {os.path.getsize(zip_name) / (1024*1024):.2f} MB")

    # Download
    print("\n‚¨áÔ∏è  ƒ∞ndirme ba≈ülatƒ±lƒ±yor...")
    files.download(zip_name)
    print("‚úÖ ƒ∞ndirme tamamlandƒ±!")

## 10. (Opsiyonel) Farklı Modelleri Dene

In [None]:
# To try the SecBERT model, uncomment the line below and run this cell
# CONFIG['model_name'] = 'jackaduma/SecBERT'

# To try the DistilBERT model, uncomment the lines below and run this cell
# CONFIG['model_name'] = 'distilbert-base-uncased'
# CONFIG['batch_size'] = 32  # DistilBERT is smaller, so the batch size can be increased

# Then re-run the cells above

## 🔬 Advanced Evaluation - Top-K Strategy

In [None]:
# Advanced evaluation with a Top-K strategy: instead of thresholding, the K
# highest-probability labels of every sample are predicted, for several K.
print("üî¨ Testing Top-K prediction strategy...\n")

# Imported once here rather than on every iteration of the K loop.
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score

# The sigmoid probabilities do not depend on K, so inference over the test set
# is run a single time and the result is reused for every K below.
model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
all_probs = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(CONFIG['device']),
            attention_mask=batch['attention_mask'].to(CONFIG['device'])
        )
        all_probs.append(torch.sigmoid(outputs['logits']).cpu())
        all_labels.append(batch['labels'].cpu().numpy())

probs = torch.cat(all_probs, dim=0)
labels = np.vstack(all_labels)

# Calculate the average number of true labels per sample. The exact mean is
# printed; the integer used as a K candidate is truncated as before, but never
# drops below 1 so that no pass predicts zero labels.
true_labels_sum = torch.from_numpy(labels).sum(dim=1)
avg_true_labels = true_labels_sum.float().mean().item()
avg_k = max(1, int(avg_true_labels))
print(f"Average true labels per sample: {avg_true_labels:.1f}")

# Test different K values. Deduplicated and sorted: avg_k may coincide with a
# fixed candidate, and an unsorted list makes the plotted lines back-track.
k_values = sorted({1, 3, 5, avg_k, 10, 15, 20})
topk_results = []
num_classes = probs.size(1)

for k in k_values:
    print(f"\nTesting Top-K with k={k}...")

    # Select top-k: set the k most probable labels of each row to 1.
    # k is capped at the number of label columns.
    _, topk_indices = torch.topk(probs, k=min(k, num_classes), dim=1)
    predictions = torch.zeros_like(probs).scatter_(1, topk_indices, 1.0).numpy()

    # Compute metrics
    micro_f1 = f1_score(labels, predictions, average='micro', zero_division=0)
    macro_f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    samples_f1 = f1_score(labels, predictions, average='samples', zero_division=0)
    precision = precision_score(labels, predictions, average='micro', zero_division=0)
    recall = recall_score(labels, predictions, average='micro', zero_division=0)

    topk_results.append({
        'k': k,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'samples_f1': samples_f1,
        'precision': precision,
        'recall': recall
    })

    print(f"  Micro F1: {micro_f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}")

# Display results table
print(f"\n{'='*80}")
print("TOP-K STRATEGY RESULTS")
print(f"{'='*80}\n")
print(f"{'K':>5} {'Micro F1':>10} {'Macro F1':>10} {'Samples F1':>12} {'Precision':>11} {'Recall':>10}")
print("-" * 80)
for result in topk_results:
    print(f"{result['k']:>5} {result['micro_f1']:>10.4f} {result['macro_f1']:>10.4f} "
          f"{result['samples_f1']:>12.4f} {result['precision']:>11.4f} {result['recall']:>10.4f}")

# Find best K (by micro F1)
best_result = max(topk_results, key=lambda x: x['micro_f1'])
print(f"\nüèÜ Best K: {best_result['k']} with Micro F1: {best_result['micro_f1']:.4f}")

# Visualization: F1 scores on the left, precision/recall on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot F1 scores
k_vals = [r['k'] for r in topk_results]
axes[0].plot(k_vals, [r['micro_f1'] for r in topk_results], marker='o', label='Micro F1')
axes[0].plot(k_vals, [r['macro_f1'] for r in topk_results], marker='s', label='Macro F1')
axes[0].plot(k_vals, [r['samples_f1'] for r in topk_results], marker='^', label='Samples F1')
axes[0].axvline(x=best_result['k'], color='r', linestyle='--', alpha=0.5, label=f'Best K={best_result["k"]}')
axes[0].set_xlabel('K')
axes[0].set_ylabel('F1 Score')
axes[0].set_title('F1 Scores vs K')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot Precision-Recall
axes[1].plot(k_vals, [r['precision'] for r in topk_results], marker='o', label='Precision')
axes[1].plot(k_vals, [r['recall'] for r in topk_results], marker='s', label='Recall')
axes[1].axvline(x=best_result['k'], color='r', linestyle='--', alpha=0.5, label=f'Best K={best_result["k"]}')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Score')
axes[1].set_title('Precision & Recall vs K')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'topk_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

print(f"\nüìä Top-K analiz grafiƒüi kaydedildi: topk_analysis.png")