In [None]:
# Check GPU availability.
# The leading `!` hands the line to the system shell, so the output of the
# `nvidia-smi` command is displayed below this cell.
!nvidia-smi

In [None]:
# Install required packages
print("üì¶ Installing dependencies...")
!pip install -q transformers datasets accelerate peft tqdm pandas numpy scikit-learn matplotlib seaborn
print("‚úÖ Dependencies installed!")

In [None]:
# Configuration: input locations and evaluation settings used by the cells below.
import os

# Kaggle paths - UPDATE THESE so they match the names of your uploaded datasets
MODEL_PATH = "/kaggle/input/mitre-fine-tuned-model"  # dataset containing the fine-tuned model
DATA_PATH = "/kaggle/input/mitre-datset"  # dataset containing the test data
TEST_FILE = DATA_PATH + "/test.jsonl"

# Evaluation settings
EVAL_LIMIT = None  # None = evaluate every example; an integer (e.g. 100) = quick test on the first N
MAX_NEW_TOKENS = 256

print("‚úÖ Configuration loaded")
print("   Model: " + MODEL_PATH)
print("   Test data: " + TEST_FILE)

# Report whether the configured paths exist (informational only, nothing is raised).
if not os.path.exists(MODEL_PATH):
    print(f"‚ùå Model not found at {MODEL_PATH}")
    print("   Please upload your fine_tuned_model as a Kaggle dataset")
else:
    model_entry_count = len(os.listdir(MODEL_PATH))
    print(f"‚úÖ Model found: {model_entry_count} files")

if not os.path.exists(TEST_FILE):
    print(f"‚ùå Test file not found at {TEST_FILE}")
    print("   Please add your test dataset to Kaggle")
else:
    print("‚úÖ Test file found")

In [None]:
# Load the fine-tuned model and its tokenizer from MODEL_PATH.
print("üîÑ Loading fine-tuned model...\n")

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Tokenizer: the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Model: float16 weights, device placement left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()  # evaluation mode

allocated_gib = torch.cuda.memory_allocated() / 1024**3

print(f"‚úÖ Model loaded from: {MODEL_PATH}")
print(f"üìä GPU Memory: {allocated_gib:.2f} GB")
print(f"üìä Model device: {model.device}")

In [None]:
# Load the JSON-lines test file as a `datasets` split and preview its first record.
print("üîÑ Loading test dataset...\n")

from datasets import load_dataset

loaded_splits = load_dataset('json', data_files={'test': TEST_FILE})
test_dataset = loaded_splits['test']

print(f"‚úÖ Test dataset loaded: {len(test_dataset):,} examples")
print(f"\nüìã Dataset columns: {test_dataset.column_names}")
print("\nüìã Sample test entry:")

# Show at most the first 100 characters of each field of the first example.
first_example = test_dataset[0]
for field_title, column_name in (
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output", "output"),
):
    print(f"   {field_title}: {first_example[column_name][:100]}...")

In [None]:
# Define evaluation functions: the prompt-based generator and the text-overlap
# metrics (exact match, partial match, word-level F1) defined in the rest of this cell.
print("üîÑ Defining evaluation functions...\n")

def generate_response(model, tokenizer, instruction, input_text, max_new_tokens=256,
                      do_sample=True, temperature=0.7, top_p=0.9, max_input_length=512):
    """Generate the model's response for one instruction/input pair.

    The prompt is laid out as the instruction, an ``### Input:`` section and an
    empty ``### Response:`` section for the model to complete.

    Args:
        model: Causal language model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Matching tokenizer; called on the prompt and used to decode.
        instruction: Task instruction placed at the top of the prompt.
        input_text: Text placed under the ``### Input:`` header.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample from the distribution (True, the original behaviour)
            or decode greedily (False) for deterministic output.
        temperature: Sampling temperature; only forwarded when ``do_sample``.
        top_p: Nucleus-sampling threshold; only forwarded when ``do_sample``.
        max_input_length: Maximum number of prompt tokens kept by the tokenizer.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    prompt = f"{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"

    # NOTE(review): prompts longer than max_input_length tokens are truncated;
    # presumably from the right (tokenizer default), which would cut off the
    # "### Response:" marker - confirm against the tokenizer's truncation_side.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs['input_ids'].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only knobs are left out for greedy decoding, where they have no effect.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only the generated part (skip the echoed prompt tokens).
    generated_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
    return generated_text.strip()

def calculate_exact_match(pred, target):
    """Return 1.0 if both texts are equal ignoring case and surrounding whitespace, else 0.0."""
    normalized_pred = pred.strip().lower()
    normalized_target = target.strip().lower()
    if normalized_pred == normalized_target:
        return 1.0
    return 0.0

def calculate_partial_match(pred, target):
    """Return the fraction of the target's unique words that also occur in the prediction.

    Both texts are lower-cased and split on whitespace. A target without any
    words scores 0.0.
    """
    predicted_vocab = set(pred.strip().lower().split())
    expected_vocab = set(target.strip().lower().split())

    if not expected_vocab:
        return 0.0

    shared_vocab = expected_vocab & predicted_vocab
    return len(shared_vocab) / len(expected_vocab)

def calculate_f1_score(pred, target):
    """Compute a word-overlap F1 score between a prediction and a target.

    Both texts are lower-cased and split on whitespace; scoring is done on the
    sets of unique words, so repeated words count once.

    Args:
        pred: Predicted text.
        target: Reference text.

    Returns:
        Harmonic mean of precision (shared words / predicted words) and recall
        (shared words / target words); 0.0 when either text has no words or
        the two share none.
    """
    pred_words = set(pred.strip().lower().split())
    target_words = set(target.strip().lower().split())

    # From here on both sets are non-empty, so the divisions below are safe.
    if not pred_words or not target_words:
        return 0.0

    overlap = len(pred_words & target_words)
    if overlap == 0:
        # Precision and recall are both zero; avoid 0/0 in the harmonic mean.
        return 0.0

    precision = overlap / len(pred_words)
    recall = overlap / len(target_words)
    return 2 * (precision * recall) / (precision + recall)

# Confirm that the cell ran to the end, i.e. the helpers above are now defined.
print("‚úÖ Evaluation functions defined")

In [None]:
# Run evaluation: generate a prediction for every selected test example, score it
# with the text-overlap metrics and collect per-example records in `results`.
print("üöÄ Running evaluation on test set...\n")

from tqdm import tqdm
import time

# generate_response() samples tokens by default, so seed torch's random number
# generators to make repeated runs of this cell comparable.
EVAL_SEED = 42
torch.manual_seed(EVAL_SEED)

# Determine sample size
if EVAL_LIMIT is None:
    eval_samples = test_dataset
    print(f"Evaluating on FULL test set: {len(eval_samples):,} examples")
else:
    # Take the first EVAL_LIMIT examples (or all of them if the set is smaller).
    eval_samples = test_dataset.select(range(min(EVAL_LIMIT, len(test_dataset))))
    print(f"Evaluating on LIMITED test set: {len(eval_samples):,} examples (out of {len(test_dataset):,})")

print(f"This may take a while...\n")

results = []               # one dict per example, consumed by the reporting cells
exact_matches = 0          # running sum of exact-match scores
partial_match_scores = []  # per-example partial-match scores
f1_scores = []             # per-example word-level F1 scores

start_time = time.time()

for i, example in enumerate(tqdm(eval_samples, desc="Evaluating")):
    # Generate prediction
    prediction = generate_response(
        model,
        tokenizer,
        example['instruction'],
        example['input'],
        max_new_tokens=MAX_NEW_TOKENS
    )

    # Calculate metrics
    exact_match = calculate_exact_match(prediction, example['output'])
    partial_match = calculate_partial_match(prediction, example['output'])
    f1 = calculate_f1_score(prediction, example['output'])

    exact_matches += exact_match
    partial_match_scores.append(partial_match)
    f1_scores.append(f1)

    # Store result
    results.append({
        'index': i,
        'instruction': example['instruction'],
        'input': example['input'],
        'expected': example['output'],
        'predicted': prediction,
        'exact_match': exact_match,
        'partial_match': partial_match,
        'f1_score': f1
    })

    # Show first few examples
    if i < 5:
        print(f"\n{'='*80}")
        print(f"Example {i+1}:")
        print(f"Instruction: {example['instruction'][:80]}...")
        print(f"Input: {example['input'][:100]}...")
        print(f"Expected: {example['output'][:100]}...")
        print(f"Predicted: {prediction[:100]}...")
        print(f"Metrics: Exact={exact_match}, Partial={partial_match:.2f}, F1={f1:.2f}")

elapsed = time.time() - start_time

# Guard against an empty evaluation set (e.g. EVAL_LIMIT = 0) to avoid dividing by zero.
num_evaluated = len(eval_samples)
sec_per_example = elapsed / num_evaluated if num_evaluated else 0.0

print(f"\n{'='*80}")
print(f"‚úÖ Evaluation completed in {elapsed/60:.2f} minutes ({sec_per_example:.2f} sec/example)")

In [None]:
# Final Summary Report
# This cell only prints values produced elsewhere in the notebook: the evaluation
# loop (eval_samples, elapsed), the comprehensive-metrics cell (unique_labels,
# accuracy, precision_weighted, recall_weighted, f1_weighted) and the CSV-export
# cell (output_file, metrics_file). Fail with an actionable message when any of
# them has not been computed yet instead of a bare NameError.
_summary_inputs = (
    "eval_samples", "elapsed", "unique_labels", "accuracy",
    "precision_weighted", "recall_weighted", "f1_weighted",
    "output_file", "metrics_file",
)
_summary_missing = [name for name in _summary_inputs if name not in globals()]
if _summary_missing:
    raise NameError(
        "Final summary needs values that are not defined yet: "
        + ", ".join(_summary_missing)
        + ". Run the 'Run evaluation', 'Calculate comprehensive metrics' and "
        "'Save results to CSV' cells first."
    )

print("\n" + "="*80)
print("üéâ FINAL EVALUATION SUMMARY")
print("="*80 + "\n")

print(f"üìä Dataset Information:")
print(f"   Total samples evaluated: {len(eval_samples):,}")
print(f"   Unique labels: {len(unique_labels)}")
print(f"   Evaluation time: {elapsed/60:.2f} minutes")
print(f"   Time per sample: {elapsed/len(eval_samples):.2f} seconds")

print(f"\nüéØ Key Performance Metrics:")
print(f"   ‚úì Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"   ‚úì Weighted Precision: {precision_weighted:.4f}")
print(f"   ‚úì Weighted Recall: {recall_weighted:.4f}")
print(f"   ‚úì Weighted F1-Score: {f1_weighted:.4f}")

print(f"\nüìÅ Output Files:")
print(f"   ‚úì {output_file}")
print(f"   ‚úì {metrics_file}")

print(f"\n{'='*80}")
print("‚úÖ Evaluation Complete!")
print("="*80)

In [None]:
# Save results to CSV: per-example results go to evaluation_results.csv and the
# aggregate scores to metrics_summary.csv (both in the working directory).
# Prerequisites: `results` from the evaluation loop, and y_true / y_pred plus all
# aggregate metric variables from the 'Calculate comprehensive metrics' cell.
print("üíæ Saving detailed results...\n")

# Imported here so this cell does not rely on another cell having imported pandas.
import pandas as pd

# Create detailed results DataFrame
results_df = pd.DataFrame(results)
results_df['true_label'] = y_true
results_df['predicted_label'] = y_pred
results_df['correct'] = results_df['true_label'] == results_df['predicted_label']

# Save to CSV
output_file = 'evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"‚úÖ Detailed results saved to: {output_file}")

# Create metrics summary (the two lists are index-aligned: name -> score)
metrics_summary = {
    'Metric': [
        'Accuracy',
        'Precision (Macro)',
        'Precision (Weighted)',
        'Recall (Macro)',
        'Recall (Weighted)',
        'F1-Score (Macro)',
        'F1-Score (Weighted)',
        'Exact Match Accuracy',
        'Avg Partial Match',
        'Avg F1 (Word-level)'
    ],
    'Score': [
        accuracy,
        precision_macro,
        precision_weighted,
        recall_macro,
        recall_weighted,
        f1_macro,
        f1_weighted,
        exact_match_accuracy,
        avg_partial_match,
        avg_f1_word
    ]
}

metrics_df = pd.DataFrame(metrics_summary)
metrics_file = 'metrics_summary.csv'
metrics_df.to_csv(metrics_file, index=False)
print(f"‚úÖ Metrics summary saved to: {metrics_file}")

# Show sample of results
print("\nüìã Sample Results (First 10):")
display_cols = ['instruction', 'true_label', 'predicted_label', 'correct', 'f1_score']
print(results_df[display_cols].head(10).to_string(index=False))

print(f"\nüìä Correct Predictions: {results_df['correct'].sum()} / {len(results_df)} ({accuracy*100:.2f}%)")
print(f"üìä Incorrect Predictions: {(~results_df['correct']).sum()} / {len(results_df)} ({(1-accuracy)*100:.2f}%)")

In [None]:
# Visualize Metrics Comparison: two bar charts built from the aggregate metric
# variables (accuracy, precision_*, recall_*, f1_*) of the
# 'Calculate comprehensive metrics' cell, which must have been run first.
print("üé® Creating metrics visualization...\n")

import matplotlib.pyplot as plt
# Imported here so this cell does not rely on another cell having imported numpy.
import numpy as np

# Create metrics comparison bar chart
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Overall Metrics
metrics_names = ['Accuracy', 'Precision\n(Weighted)', 'Recall\n(Weighted)', 'F1-Score\n(Weighted)']
metrics_values = [accuracy, precision_weighted, recall_weighted, f1_weighted]

bars1 = ax1.bar(metrics_names, metrics_values, color=['#2ecc71', '#3498db', '#e74c3c', '#f39c12'], alpha=0.8)
ax1.set_ylabel('Score', fontsize=12, fontweight='bold')
ax1.set_title('Overall Performance Metrics', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 1])
ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.3, label='50% baseline')
ax1.grid(axis='y', alpha=0.3)
# The baseline's label is only visible once a legend is drawn.
ax1.legend()

# Add value labels on bars
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.3f}\n({height*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

# Plot 2: Macro vs Weighted Metrics (each value is [macro, weighted])
metrics_comparison = {
    'Precision': [precision_macro, precision_weighted],
    'Recall': [recall_macro, recall_weighted],
    'F1-Score': [f1_macro, f1_weighted]
}

x = np.arange(len(metrics_comparison))
width = 0.35  # bar width; the two series are offset by half of it on either side

bars2_1 = ax2.bar(x - width/2, [v[0] for v in metrics_comparison.values()],
                   width, label='Macro', color='#3498db', alpha=0.8)
bars2_2 = ax2.bar(x + width/2, [v[1] for v in metrics_comparison.values()],
                   width, label='Weighted', color='#e74c3c', alpha=0.8)

ax2.set_ylabel('Score', fontsize=12, fontweight='bold')
ax2.set_title('Macro vs Weighted Metrics Comparison', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(metrics_comparison.keys())
ax2.set_ylim([0, 1])
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Add value labels
for bars in [bars2_1, bars2_2]:
    for bar in bars:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height:.3f}',
                 ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("‚úÖ Metrics visualization created!")

In [None]:
# Detailed Classification Report: per-label precision/recall/F1 for the extracted
# technique IDs. Requires y_true, y_pred and unique_labels from the
# 'Calculate comprehensive metrics' cell.
print("üìä DETAILED CLASSIFICATION REPORT")
print("="*80 + "\n")

# Imported here so this cell does not rely on another cell's imports.
import pandas as pd
from sklearn.metrics import classification_report

# Generate classification report (dict form, turned into a DataFrame with one row per label)
report = classification_report(y_true, y_pred, labels=unique_labels, zero_division=0, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# Display full report (text form)
print(classification_report(y_true, y_pred, labels=unique_labels, zero_division=0))

# Convert to DataFrame for better visualization
print("\nüìà Per-Class Metrics Summary:")
print(report_df.round(4))

# Show best and worst performing classes
if len(unique_labels) > 5:
    print("\nüèÜ TOP 5 BEST PERFORMING CLASSES (by F1-score):")
    # Keep only rows whose label starts with 'T' (technique IDs); this drops the
    # averaged summary rows and the "UNKNOWN" label.
    class_metrics = report_df[report_df.index.str.startswith('T')].sort_values('f1-score', ascending=False)
    print(class_metrics.head(5)[['precision', 'recall', 'f1-score', 'support']].round(4))

    print("\n‚ö†Ô∏è TOP 5 WORST PERFORMING CLASSES (by F1-score):")
    print(class_metrics.tail(5)[['precision', 'recall', 'f1-score', 'support']].round(4))

In [None]:
# Visualize Confusion Matrix as a heatmap. Requires y_true, unique_labels and
# conf_matrix from the 'Calculate comprehensive metrics' cell.
print("üé® Creating confusion matrix visualization...\n")

from collections import Counter

import matplotlib.pyplot as plt
# Imported here so this cell does not rely on another cell having imported numpy.
import numpy as np
import seaborn as sns

# Decide what to plot BEFORE creating the figure, so the canvas is sized for the
# labels actually shown rather than for every label in the data.
if len(unique_labels) > 20:
    print(f"‚ö†Ô∏è Large number of labels ({len(unique_labels)}). Showing top 20 most frequent...")

    # Get the 20 labels that occur most often among the true labels
    label_counts = Counter(y_true)
    top_labels = [label for label, _ in label_counts.most_common(20)]

    # Select the matching rows/columns of the full confusion matrix
    label_indices = [unique_labels.index(label) for label in top_labels]
    conf_matrix_subset = conf_matrix[np.ix_(label_indices, label_indices)]

    matrix_to_plot = conf_matrix_subset
    shown_labels = top_labels
    plot_title = f'Confusion Matrix (Top 20 Labels)\nTotal Labels: {len(unique_labels)}'
else:
    matrix_to_plot = conf_matrix
    shown_labels = unique_labels
    plot_title = 'Confusion Matrix - All Labels'

# One inch per shown label, never smaller than 12 x 10 inches.
fig, ax = plt.subplots(figsize=(max(12, len(shown_labels)), max(10, len(shown_labels))))

sns.heatmap(matrix_to_plot,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=shown_labels,
            yticklabels=shown_labels,
            ax=ax,
            cbar_kws={'label': 'Count'})

ax.set_title(plot_title, fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
plt.tight_layout()
plt.show()

print("‚úÖ Confusion matrix visualization created!")

In [None]:
# Calculate comprehensive metrics with confusion matrix
# NOTE(review): this cell defines y_true, y_pred, unique_labels, conf_matrix and the
# accuracy / precision / recall / F1 variables (and imports np, pd and
# classification_report) that the summary, CSV-export, plotting and
# classification-report cells placed earlier in the notebook read. It has to be
# executed before them - ideally move it to directly after the evaluation loop so
# that a top-to-bottom run works.
print("\n" + "="*80)
print("üìä CALCULATING COMPREHENSIVE METRICS")
print("="*80 + "\n")

import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix,
    classification_report
)
import pandas as pd

# Extract labels from predictions and expected outputs
# This assumes the output contains MITRE technique IDs (e.g., T1234)
import re

def extract_technique_id(text):
    """Extract the first MITRE technique ID from text (e.g. T1234, T1234.001).

    Matching is case-insensitive (the text is upper-cased first). An ID is a
    'T' followed by exactly four digits and an optional three-digit
    sub-technique suffix; it must not be glued to a preceding letter/digit or
    to a following digit, so tokens such as "port1234" or "T10591" are not
    mistaken for technique IDs.

    Args:
        text: Text to search. Non-string values (e.g. None) are treated as
            containing no ID.

    Returns:
        The upper-cased ID of the first match, or "UNKNOWN" when there is none.
    """
    if not isinstance(text, str):
        return "UNKNOWN"
    # Lookbehind: no alphanumeric right before the 'T'. Lookahead: no further digit.
    match = re.search(r'(?<![A-Z0-9])T\d{4}(?:\.\d{3})?(?!\d)', text.upper())
    return match.group(0) if match else "UNKNOWN"

# Turn every expected / predicted text into a technique-ID label.
y_true, y_pred = [], []
for record in results:
    y_true.append(extract_technique_id(record['expected']))
    y_pred.append(extract_technique_id(record['predicted']))

# Sorted union of all labels seen on either side.
unique_labels = sorted(set(y_true) | set(y_pred))
label_preview = ', '.join(unique_labels[:10])
preview_suffix = '...' if len(unique_labels) > 10 else ''
print(f"üìã Unique labels found: {len(unique_labels)}")
print(f"   Labels: {label_preview}{preview_suffix}\n")

# Label-level metrics; zero_division=0 scores labels without predictions/support as 0.
macro_options = dict(average='macro', zero_division=0)
weighted_options = dict(average='weighted', zero_division=0)

accuracy = accuracy_score(y_true, y_pred)
precision_macro = precision_score(y_true, y_pred, **macro_options)
precision_weighted = precision_score(y_true, y_pred, **weighted_options)
recall_macro = recall_score(y_true, y_pred, **macro_options)
recall_weighted = recall_score(y_true, y_pred, **weighted_options)
f1_macro = f1_score(y_true, y_pred, **macro_options)
f1_weighted = f1_score(y_true, y_pred, **weighted_options)

# Report the label-level metrics.
print("üéØ OVERALL METRICS:")
print(f"   Accuracy:           {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\n   Precision (Macro):  {precision_macro:.4f}")
print(f"   Precision (Weighted): {precision_weighted:.4f}")
print(f"\n   Recall (Macro):     {recall_macro:.4f}")
print(f"   Recall (Weighted):  {recall_weighted:.4f}")
print(f"\n   F1-Score (Macro):   {f1_macro:.4f}")
print(f"   F1-Score (Weighted): {f1_weighted:.4f}")

# Averages of the per-example text-overlap scores gathered by the evaluation loop.
exact_match_accuracy = exact_matches / len(eval_samples)
avg_partial_match = np.mean(partial_match_scores)
avg_f1_word = np.mean(f1_scores)

print(f"\nüìù WORD-LEVEL METRICS:")
print(f"   Exact Match Accuracy: {exact_match_accuracy:.4f} ({exact_match_accuracy*100:.2f}%)")
print(f"   Avg Partial Match:    {avg_partial_match:.4f}")
print(f"   Avg F1 (Word-level):  {avg_f1_word:.4f}")

# Confusion matrix with rows/columns in the order of unique_labels.
conf_matrix = confusion_matrix(y_true, y_pred, labels=unique_labels)

print(f"\n‚úÖ Metrics calculated successfully!")