# EV Charging LLM Evaluation and Benchmarking
This notebook evaluates the fine-tuned model using domain-specific benchmarks and automated metrics.

In [None]:
# Core imports
import json
import pandas as pd
import torch
import os
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
def _ensure_nltk_resource(resource_path, package_name):
    """Download *package_name* when ``nltk.data.find(resource_path)`` raises LookupError."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


_ensure_nltk_resource('tokenizers/punkt', 'punkt')

In [None]:
# Evaluation settings read by the cells below
CONFIG = dict(
    base_model_name="microsoft/DialoGPT-small",
    fine_tuned_model_path="./fine_tuned_model/final_model",
    test_data_path="output_data/ev_training_alpaca.json",
    benchmark_size=50,  # Number of test examples
    max_length=200,
    temperature=0.7,
    results_dir="evaluation_results",
)

# Output folder for the result files; exist_ok makes re-running the cell safe
os.makedirs(CONFIG["results_dir"], exist_ok=True)

# Echo every setting so the run is self-describing
print("Evaluation configuration:")
for setting_name in CONFIG:
    print(f"  {setting_name}: {CONFIG[setting_name]}")

In [None]:
# Pick the compute device: CUDA when torch reports it available, CPU otherwise
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

# Embedding model used for the semantic-similarity metric
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence transformer loaded")

## Load Models and Create Benchmark Dataset

In [None]:
# Load the base model and its tokenizer from CONFIG["base_model_name"]
print("Loading base model...")
base_tokenizer = AutoTokenizer.from_pretrained(CONFIG["base_model_name"])

# float16 + automatic device placement only when CUDA is available
_base_load_kwargs = {
    "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
    "device_map": "auto" if torch.cuda.is_available() else None,
}
base_model = AutoModelForCausalLM.from_pretrained(
    CONFIG["base_model_name"],
    **_base_load_kwargs,
)

# Reuse the EOS token for padding when the tokenizer defines no pad token
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

print("Base model loaded")

In [None]:
# Load the fine-tuned model; on any failure fall back to the base model and
# record the outcome in `model_loaded`
print("Loading fine-tuned model...")
try:
    _ft_path = CONFIG["fine_tuned_model_path"]
    ft_tokenizer = AutoTokenizer.from_pretrained(_ft_path)

    # Same precision / placement policy as the base model load
    _ft_load_kwargs = {
        "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
        "device_map": "auto" if torch.cuda.is_available() else None,
    }
    ft_model = AutoModelForCausalLM.from_pretrained(_ft_path, **_ft_load_kwargs)

    # Reuse the EOS token for padding when the tokenizer defines no pad token
    if ft_tokenizer.pad_token is None:
        ft_tokenizer.pad_token = ft_tokenizer.eos_token

    print("Fine-tuned model loaded successfully")
    model_loaded = True
except Exception as load_error:
    # Best-effort fallback: the "fine-tuned" names become aliases of the base model
    print(f"Could not load fine-tuned model: {load_error}")
    print("Will use base model for comparison")
    ft_model, ft_tokenizer = base_model, base_tokenizer
    model_loaded = False

In [None]:
# Create domain-specific benchmark dataset
def create_ev_benchmark():
    """Build the hand-written EV charging benchmark.

    Returns:
        A new list of dicts, one per question, each holding a "question"
        string and an "expected_keywords" list of terms to look for in a
        generated answer.
    """
    # (question, expected keywords) pairs, in benchmark order
    question_specs = (
        (
            "What are the different types of EV charging connectors?",
            ("Type 1", "Type 2", "CHAdeMO", "CCS", "connector", "charging"),
        ),
        (
            "How long does it take to charge an electric vehicle?",
            ("time", "hours", "fast", "slow", "charging", "battery"),
        ),
        (
            "What is the difference between AC and DC charging?",
            ("AC", "DC", "alternating", "direct", "current", "charging"),
        ),
        (
            "Where can I find public charging stations?",
            ("public", "stations", "location", "map", "app", "network"),
        ),
        (
            "What is fast charging for electric vehicles?",
            ("fast", "rapid", "DC", "charging", "quick", "speed"),
        ),
        (
            "How much does it cost to charge an electric vehicle?",
            ("cost", "price", "money", "charging", "electricity", "rate"),
        ),
        (
            "Can I charge my EV at home?",
            ("home", "residential", "charging", "installation", "outlet"),
        ),
        (
            "What is charging infrastructure?",
            ("infrastructure", "network", "stations", "grid", "charging"),
        ),
        (
            "How do I install a home charging station?",
            ("install", "home", "electrician", "charging", "station", "setup"),
        ),
        (
            "What are the benefits of electric vehicle charging?",
            ("benefits", "advantages", "clean", "environment", "cost", "charging"),
        ),
    )

    # Fresh dicts and lists on every call so callers may mutate the result
    return [
        {"question": question_text, "expected_keywords": list(terms)}
        for question_text, terms in question_specs
    ]

# Load the records used for reference-based evaluation.
# NOTE(review): CONFIG["test_data_path"] points at the training file, so the
# tail slice taken below is only truly held out if training excluded it --
# confirm against the training notebook.
with open(CONFIG["test_data_path"], 'r', encoding='utf-8') as f:
    all_data = json.load(f)

if not all_data:
    raise ValueError(f"No examples found in {CONFIG['test_data_path']}")

# Use the last ~20% of the records as the test set, capped at
# CONFIG["benchmark_size"] and never rounded down to zero for small files.
test_size = min(CONFIG["benchmark_size"], max(1, len(all_data) // 5))
# Slice from an explicit start index: all_data[-0:] would return the WHOLE list.
test_data = all_data[len(all_data) - test_size:]

# Create domain benchmark
domain_benchmark = create_ev_benchmark()

print(f"Created benchmark with {len(domain_benchmark)} domain-specific questions")
print(f"Using {len(test_data)} examples from training data as test set")

## Text Generation Functions

In [None]:
def generate_response(model, tokenizer, prompt, max_length=200):
    """Sample a continuation of *prompt* from *model* and return only the new text.

    Args:
        model: Object exposing ``generate`` (called with input ids, an
            attention mask and sampling options).
        tokenizer: Tokenizer paired with *model*; used to encode the prompt
            and decode the generated ids.
        prompt: Text to continue.
        max_length: Upper bound on prompt tokens plus generated tokens. At
            most 100 new tokens are generated regardless of this value.

    Returns:
        The decoded continuation with surrounding whitespace stripped, or an
        empty string when the prompt already uses up ``max_length`` tokens.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoded["input_ids"]
    prompt_len = input_ids.shape[1]

    # Same budget as min(max_length, prompt_len + 100), expressed as new tokens
    # so a long prompt can never make the length limit fall below the input.
    new_token_budget = min(max_length - prompt_len, 100)
    if new_token_budget <= 0:
        return ""

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            # Explicit mask: pad and EOS share an id, so it cannot be inferred
            attention_mask=encoded.get("attention_mask"),
            max_new_tokens=new_token_budget,
            num_return_sequences=1,
            temperature=CONFIG["temperature"],
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt ids; drop them by position
    # instead of string-replacing the prompt, which breaks whenever decoding
    # does not reproduce the prompt text exactly.
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def format_prompt(question):
    """Wrap *question* in the instruction/response prompt template.

    Returns the template text with *question* placed under the
    "### Instruction:" header and an empty "### Response:" section after it.
    """
    template = "### Instruction:\n{}\n\n### Response:\n"
    return template.format(question)

print("Generation functions defined")

## Evaluation Metrics

In [None]:
def calculate_rouge_scores(reference, hypothesis):
    """Score *hypothesis* against *reference* with ROUGE-1, ROUGE-2 and ROUGE-L.

    Returns:
        Dict with keys 'rouge1', 'rouge2' and 'rougeL', each mapped to the
        ``fmeasure`` reported by a stemming-enabled ``RougeScorer``.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    per_type = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, hypothesis)

    return {rouge_type: per_type[rouge_type].fmeasure for rouge_type in rouge_types}

def calculate_bleu_score(reference, hypothesis):
    """Compute sentence-level BLEU of *hypothesis* against a single *reference*.

    Both texts are tokenized by whitespace splitting and the score is
    smoothed with ``SmoothingFunction().method1``.
    """
    return sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method1,
    )

def calculate_semantic_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in one call to the module-level ``sentence_model``.
    """
    first_vec, second_vec = sentence_model.encode([text1, text2])
    pairwise = cosine_similarity([first_vec], [second_vec])
    return pairwise[0][0]

def keyword_coverage(text, keywords):
    """Return the fraction of *keywords* that occur in *text*.

    Matching is a case-insensitive substring test, so a short keyword can
    also match inside a longer word (e.g. "AC" matches "place").
    Returns 0 when *keywords* is empty.
    """
    haystack = text.lower()
    if not keywords:
        return 0

    hits = [keyword for keyword in keywords if keyword.lower() in haystack]
    return len(hits) / len(keywords)

print("Evaluation metrics defined")

## Model Evaluation

In [None]:
# Run both models on every benchmark question and record keyword coverage
# plus the similarity between the two answers
print("Evaluating on domain-specific benchmark...")

domain_results = []

for question_id, entry in enumerate(domain_benchmark, start=1):
    question_text = entry["question"]
    keywords = entry["expected_keywords"]

    benchmark_prompt = format_prompt(question_text)

    # Base model first, then fine-tuned; generation samples, so call order matters
    base_answer = generate_response(base_model, base_tokenizer, benchmark_prompt)
    ft_answer = generate_response(ft_model, ft_tokenizer, benchmark_prompt)

    # Keyword coverage per model
    base_cov = keyword_coverage(base_answer, keywords)
    ft_cov = keyword_coverage(ft_answer, keywords)

    # Similarity between the two answers (there is no reference answer here)
    answer_similarity = calculate_semantic_similarity(base_answer, ft_answer)

    domain_results.append({
        'question_id': question_id,
        'question': question_text,
        'base_response': base_answer,
        'ft_response': ft_answer,
        'base_keyword_coverage': base_cov,
        'ft_keyword_coverage': ft_cov,
        'semantic_similarity': answer_similarity,
        'expected_keywords': keywords
    })

    # Progress message every fifth question
    if question_id % 5 == 0:
        print(f"Processed {question_id}/{len(domain_benchmark)} questions")

print("Domain benchmark evaluation completed")

In [None]:
# Score both models against the reference outputs of the test examples
print("Evaluating on held-out test set...")

test_results = []

# Limit to 20 for demo; may be fewer when test_data is small
eval_examples = test_data[:20]

for i, item in enumerate(eval_examples):
    # NOTE(review): only "instruction" is placed in the prompt. If the records
    # also carry a separate "input" field it is ignored here -- confirm this
    # matches the prompt format used for training.
    instruction = item["instruction"]
    reference = item["output"]

    prompt = format_prompt(instruction)

    # Base model first, then fine-tuned; generation samples, so call order matters
    base_response = generate_response(base_model, base_tokenizer, prompt)
    ft_response = generate_response(ft_model, ft_tokenizer, prompt)

    # Calculate metrics against reference
    base_rouge = calculate_rouge_scores(reference, base_response)
    ft_rouge = calculate_rouge_scores(reference, ft_response)

    base_bleu = calculate_bleu_score(reference, base_response)
    ft_bleu = calculate_bleu_score(reference, ft_response)

    base_semantic = calculate_semantic_similarity(reference, base_response)
    ft_semantic = calculate_semantic_similarity(reference, ft_response)

    # Key order defines the column order of the results DataFrame / CSV
    result = {
        'test_id': i + 1,
        'instruction': instruction,
        'reference': reference,
        'base_response': base_response,
        'ft_response': ft_response,
        'base_rouge1': base_rouge['rouge1'],
        'base_rouge2': base_rouge['rouge2'],
        'base_rougeL': base_rouge['rougeL'],
        'ft_rouge1': ft_rouge['rouge1'],
        'ft_rouge2': ft_rouge['rouge2'],
        'ft_rougeL': ft_rouge['rougeL'],
        'base_bleu': base_bleu,
        'ft_bleu': ft_bleu,
        'base_semantic': base_semantic,
        'ft_semantic': ft_semantic
    }

    test_results.append(result)

    # Report progress against the number of examples actually evaluated
    if (i + 1) % 5 == 0:
        print(f"Processed {i + 1}/{len(eval_examples)} test examples")

print("Test set evaluation completed")

## Results Analysis and Visualization

In [None]:
# Summarise the domain benchmark: mean keyword coverage per model and the
# mean similarity between the two models' answers
domain_df = pd.DataFrame(domain_results)

print("=== Domain Benchmark Results ===")
_base_cov_mean = domain_df['base_keyword_coverage'].mean()
print(f"Average Base Model Keyword Coverage: {_base_cov_mean:.3f}")
_ft_cov_mean = domain_df['ft_keyword_coverage'].mean()
print(f"Average Fine-tuned Model Keyword Coverage: {_ft_cov_mean:.3f}")
print(f"Average Semantic Similarity: {domain_df['semantic_similarity'].mean():.3f}")

# Fine-tuned minus base; positive means the fine-tuned model covers more keywords
coverage_improvement = _ft_cov_mean - _base_cov_mean
print(f"Keyword Coverage Improvement: {coverage_improvement:.3f}")

# Summarise the reference-based metrics of the test set
test_df = pd.DataFrame(test_results)

# (display label, column suffix) for every reference-based metric, in print order
_reported_metrics = (
    ('ROUGE-1', 'rouge1'),
    ('ROUGE-2', 'rouge2'),
    ('ROUGE-L', 'rougeL'),
    ('BLEU', 'bleu'),
    ('Semantic Similarity', 'semantic'),
)

print("\n=== Test Set Results ===")
print("Base Model Metrics:")
for _metric_label, _column_suffix in _reported_metrics:
    print(f"  {_metric_label}: {test_df['base_' + _column_suffix].mean():.3f}")

print("\nFine-tuned Model Metrics:")
for _metric_label, _column_suffix in _reported_metrics:
    print(f"  {_metric_label}: {test_df['ft_' + _column_suffix].mean():.3f}")

# Fine-tuned mean minus base mean per metric
print("\nImprovements:")
for _metric_label, _column_suffix in _reported_metrics:
    print(f"  {_metric_label}: {test_df['ft_' + _column_suffix].mean() - test_df['base_' + _column_suffix].mean():.3f}")

In [None]:
# Draw a 2x2 figure comparing the two models and save it to the results folder
plt.style.use('default')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
width = 0.35


def _draw_model_pair(ax, labels, base_values, ft_values, title, bar_width):
    """Draw side-by-side base / fine-tuned bars for each label on *ax*."""
    positions = np.arange(len(labels))
    ax.bar(positions - bar_width/2, base_values, bar_width, label='Base Model', alpha=0.8)
    ax.bar(positions + bar_width/2, ft_values, bar_width, label='Fine-tuned Model', alpha=0.8)
    ax.set_xlabel('Metrics')
    ax.set_ylabel('Score')
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.legend()


def _metric_gain(column_suffix):
    """Fine-tuned mean minus base mean for one test-set metric column pair."""
    return test_df['ft_' + column_suffix].mean() - test_df['base_' + column_suffix].mean()


# 1. Keyword coverage distribution per model (top-left)
coverage_df = pd.DataFrame({
    'Base Model': domain_df['base_keyword_coverage'].tolist(),
    'Fine-tuned Model': domain_df['ft_keyword_coverage'].tolist()
})
coverage_df.boxplot(ax=axes[0,0])
axes[0,0].set_title('Keyword Coverage Comparison')
axes[0,0].set_ylabel('Coverage Score')

# 2. Mean ROUGE scores (top-right)
_draw_model_pair(
    axes[0,1],
    ['ROUGE-1', 'ROUGE-2', 'ROUGE-L'],
    [test_df['base_rouge1'].mean(), test_df['base_rouge2'].mean(), test_df['base_rougeL'].mean()],
    [test_df['ft_rouge1'].mean(), test_df['ft_rouge2'].mean(), test_df['ft_rougeL'].mean()],
    'ROUGE Scores Comparison',
    width,
)

# 3. Mean BLEU and semantic similarity (bottom-left)
_draw_model_pair(
    axes[1,0],
    ['BLEU', 'Semantic Similarity'],
    [test_df['base_bleu'].mean(), test_df['base_semantic'].mean()],
    [test_df['ft_bleu'].mean(), test_df['ft_semantic'].mean()],
    'BLEU and Semantic Similarity',
    width,
)

# 4. Fine-tuned minus base per metric (bottom-right)
improvements = {
    'Keyword Coverage': coverage_improvement,
    'ROUGE-1': _metric_gain('rouge1'),
    'ROUGE-L': _metric_gain('rougeL'),
    'BLEU': _metric_gain('bleu'),
    'Semantic Sim': _metric_gain('semantic')
}

# Green for gains, red for zero or negative change
bar_colors = []
for gain in improvements.values():
    bar_colors.append('green' if gain > 0 else 'red')

axes[1,1].bar(list(improvements.keys()), list(improvements.values()), color=bar_colors, alpha=0.7)
axes[1,1].set_title('Performance Improvements')
axes[1,1].set_ylabel('Improvement Score')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].axhline(y=0, color='black', linestyle='-', alpha=0.3)

plt.tight_layout()
_figure_path = os.path.join(CONFIG["results_dir"], 'evaluation_results.png')
plt.savefig(_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"Visualization saved to {CONFIG['results_dir']}/evaluation_results.png")

## Example Responses Comparison

In [None]:
# Print the first (up to) three benchmark questions with both models' answers
print("=== Example Response Comparisons ===")

for example_no, example in enumerate(domain_results[:3], start=1):
    print(f"\nExample {example_no}:")
    print(f"Question: {example['question']}")
    print(f"Expected Keywords: {', '.join(example['expected_keywords'])}")

    # Same three lines per model: heading, answer cut to 200 chars, coverage
    for model_title, key_prefix in (('Base Model', 'base'), ('Fine-tuned Model', 'ft')):
        print(f"\n{model_title} Response:")
        print(f"{example[key_prefix + '_response'][:200]}...")
        print(f"Keyword Coverage: {example[key_prefix + '_keyword_coverage']:.2f}")

    print("-" * 80)

## Save Results

In [None]:
# Write the per-example tables as CSV and the aggregate numbers as JSON
_out_dir = CONFIG["results_dir"]
domain_df.to_csv(os.path.join(_out_dir, 'domain_benchmark_results.csv'), index=False)
test_df.to_csv(os.path.join(_out_dir, 'test_set_results.csv'), index=False)


def _mean_gain(metric):
    """Fine-tuned mean minus base mean for one test-set metric."""
    return test_df['ft_' + metric].mean() - test_df['base_' + metric].mean()


def _model_performance(prefix):
    """Mean of every metric for the model whose columns start with *prefix*."""
    return {
        'avg_keyword_coverage': domain_df[prefix + '_keyword_coverage'].mean(),
        'avg_rouge1': test_df[prefix + '_rouge1'].mean(),
        'avg_rouge2': test_df[prefix + '_rouge2'].mean(),
        'avg_rougeL': test_df[prefix + '_rougeL'].mean(),
        'avg_bleu': test_df[prefix + '_bleu'].mean(),
        'avg_semantic': test_df[prefix + '_semantic'].mean()
    }


# Aggregate metrics plus enough metadata to identify the run
summary = {
    'evaluation_date': datetime.now().isoformat(),
    'base_model': CONFIG['base_model_name'],
    'fine_tuned_model': CONFIG['fine_tuned_model_path'],
    'domain_benchmark_size': len(domain_results),
    'test_set_size': len(test_results),
    'metrics': {
        'keyword_coverage_improvement': coverage_improvement,
        'rouge1_improvement': _mean_gain('rouge1'),
        'rouge2_improvement': _mean_gain('rouge2'),
        'rougeL_improvement': _mean_gain('rougeL'),
        'bleu_improvement': _mean_gain('bleu'),
        'semantic_similarity_improvement': _mean_gain('semantic')
    },
    'base_model_performance': _model_performance('base'),
    'fine_tuned_model_performance': _model_performance('ft')
}

with open(os.path.join(_out_dir, 'evaluation_summary.json'), 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Results saved to {_out_dir}/")
print("Files created:")
for _artifact_name in (
    'domain_benchmark_results.csv',
    'test_set_results.csv',
    'evaluation_summary.json',
    'evaluation_results.png',
):
    print(f"- {_artifact_name}")

## Evaluation Summary

In [None]:
# Print the final human-readable summary of the evaluation run
print("=" * 60)
print("           EVALUATION SUMMARY")
print("=" * 60)

print(f"\n📊 EVALUATION OVERVIEW:")
print(f"   • Base Model: {CONFIG['base_model_name']}")
print(f"   • Fine-tuned Model: {CONFIG['fine_tuned_model_path']}")
print(f"   • Domain Benchmark: {len(domain_results)} questions")
print(f"   • Test Set: {len(test_results)} examples")
if not model_loaded:
    # The loading cell aliased ft_model to base_model, and generation samples,
    # so every difference reported below is sampling noise.
    print(f"   ⚠️  Fine-tuned model was NOT loaded - the base model was compared with itself")

# Fine-tuned mean minus base mean for the headline test-set metrics
rouge_improvement = test_df['ft_rouge1'].mean() - test_df['base_rouge1'].mean()
semantic_improvement = test_df['ft_semantic'].mean() - test_df['base_semantic'].mean()

print(f"\n🎯 KEY IMPROVEMENTS:")
print(f"   • Keyword Coverage: {coverage_improvement:+.3f}")
print(f"   • ROUGE-1: {rouge_improvement:+.3f}")
print(f"   • ROUGE-L: {test_df['ft_rougeL'].mean() - test_df['base_rougeL'].mean():+.3f}")
print(f"   • BLEU Score: {test_df['ft_bleu'].mean() - test_df['base_bleu'].mean():+.3f}")
print(f"   • Semantic Similarity: {semantic_improvement:+.3f}")

print(f"\n📈 PERFORMANCE ANALYSIS:")
if model_loaded:
    # (measured gain, message when positive, message otherwise)
    verdicts = (
        (
            coverage_improvement,
            "   ✅ Fine-tuning improved domain-specific keyword coverage",
            "   ⚠️  Fine-tuning did not improve keyword coverage",
        ),
        (
            rouge_improvement,
            "   ✅ Fine-tuning improved ROUGE scores",
            "   ⚠️  Fine-tuning did not improve ROUGE scores",
        ),
        (
            semantic_improvement,
            "   ✅ Fine-tuning improved semantic similarity",
            "   ⚠️  Fine-tuning did not improve semantic similarity",
        ),
    )
    for gain, improved_message, not_improved_message in verdicts:
        print(improved_message if gain > 0 else not_improved_message)
else:
    # Do not claim an improvement (or regression) that was never measured
    print(f"   ⚠️  Skipped: no fine-tuned model was available, so the numbers above say nothing about fine-tuning")

print(f"\n💾 RESULTS SAVED TO: {CONFIG['results_dir']}/")
if model_loaded:
    print("\n✅ EVALUATION COMPLETED SUCCESSFULLY!")
else:
    print("\n⚠️  EVALUATION COMPLETED WITHOUT A FINE-TUNED MODEL")
print("=" * 60)