# EVALUATION

In [None]:
from transformers import AutoTokenizer
import os
import sys
import pandas as pd
from datasets.arrow_dataset import Dataset
from peft import PeftModel, PeftConfig
import os
import json

from utils.prompts import (
    prompt_EUR_BASE,
    prompt_EUR_COT,
    prompt_EUR_COD,
    prompt_EUR_FEW_SHOT,
    prompt_LDD_BASE,
    prompt_LDD_COT,
    prompt_LDD_COD,
    prompt_LDD_FEW_SHOT,
    prompt_IE_BASE,
    prompt_IE_COT,
    prompt_IE_COD,
    prompt_IE_FEW_SHOT,
    prompt_SELF_CONSIS,
)

# Per-model cost figures with one "train" and one "inference" entry each.
# evaluate_multiple multiplies them by measured wall-clock hours.
GPU_HOURS_DICT = {
    name: {"train": train_cost, "inference": inference_cost}
    for name, train_cost, inference_cost in (
        ("Llama-3.2-1B-Instruct", 27.360, 25.782),
        ("gemma-2-2b-it", 51.496, 32.664),
        ("Llama-3.2-3B-Instruct", 65.522, 39.556),
        ("ModernBERT-base", 27.006, 1.528),
    )
}

# Turn off parallelism inside the tokenizers library for this process.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# ========================== Constants ==========================
# Training hyper-parameters of the run being evaluated.
per_device_train_batch_size = per_device_eval_batch_size = 1
num_train_epochs = 10
learning_rate = 1e-6

# Locations: project checkout, dataset (relative to the project root),
# base model weights, and the training output directory with the checkpoints.
project_root = "/home/snt/projects_lujun/agentCLS"
training_dataset_path = "assets/training_dataset/EURLEX57K_split_equal_train_1000_val_300.jsonl"
model_path = "/home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct"
output_dir = "/home/snt/projects_lujun/agentCLS/assets/logs/prompt_tuning/EURLEX57K_split_equal_train_1000_val_300_0.005_Llama-3.2-1B-Instruct_output_03_17_17_53_45_PROMPT_TUNING_128"

def find_file(filename, search_dir):
    """Return the path of the first file named `filename` under `search_dir`.

    The tree is traversed with `os.walk`; the first visited directory whose
    file list contains `filename` determines the result.

    Args:
        filename (str): Exact file name to look for.
        search_dir (str): Root directory of the search.

    Returns:
        str | None: Joined path of the match, or None when nothing matches.
    """
    matches = (
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, file_names in os.walk(search_dir)
        if filename in file_names
    )
    return next(matches, None)

# Token length used for padding/truncation when tokenizing.
max_length = 4096

# Locate the label-mapping file (read later for label2id / id2label).
# find_file returns None when nothing matches; stop here with a clear message
# instead of letting a later open(None) fail with an unrelated TypeError.
json_file_path = find_file("adapter_labels.json", output_dir)
if json_file_path is None:
    raise FileNotFoundError(
        f"adapter_labels.json not found under output_dir: {output_dir}"
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration. These flags are not read by the visible evaluation code.
is_prefix_tuning, is_prompt_tuning = True, False
num_virtual_tokens = 128

# Make project-local packages importable and resolve the dataset location.
sys.path.append(os.path.abspath(project_root))
train_dataset_path = os.path.abspath(
    os.path.join(project_root, training_dataset_path)
)

peft_config = None

# Fraction of every (label, split) group that is kept when filtering.
train_ratio = 1.0

# Load the tokenizer of the base model and reuse its EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Short names derived from the paths: the dataset file name up to its first
# ".", and the last "/"-separated component of the model path.
input_dataset_name = train_dataset_path.rsplit("/", 1)[-1].partition(".")[0]
model_name = model_path.rsplit("/", 1)[-1]


# Select the prompt templates for the dataset family named in the path.
# All three flags are bound up front (exactly one ends up True) so that any
# later check of is_EURLEX / is_LDD / is_IE cannot raise a NameError.
is_EURLEX = is_LDD = is_IE = False

if "EURLEX" in training_dataset_path:
    prompt_templates = [prompt_EUR_BASE, prompt_EUR_COT, prompt_EUR_COD, prompt_EUR_FEW_SHOT, prompt_SELF_CONSIS]
    is_EURLEX = True
elif "LDD" in training_dataset_path:
    prompt_templates = [prompt_LDD_BASE, prompt_LDD_COT, prompt_LDD_COD, prompt_LDD_FEW_SHOT, prompt_SELF_CONSIS]
    is_LDD = True
elif "FOYER" in training_dataset_path:
    prompt_templates = [prompt_IE_BASE, prompt_IE_COT, prompt_IE_COD, prompt_IE_FEW_SHOT, prompt_SELF_CONSIS]
    is_IE = True
else:
    raise ValueError(f"Unknown dataset: {training_dataset_path}")
    
# Load the JSON-lines dataset and expose the class label as "labels".
dataset = pd.read_json(train_dataset_path, lines=True)
dataset.rename(columns={"cls_label": "labels"}, inplace=True)

# Compute sample counts for each group based on 'labels' and 'split'
sample_counts = dataset.groupby(['labels', 'split']).size() * train_ratio


def _head_per_label(frame, split_name, ratio):
    """Keep the first `int(n * ratio)` rows of every label within one split.

    Rows come back grouped by label in sorted label order, with the original
    row order preserved inside each label. Iterating the groups explicitly
    avoids `DataFrameGroupBy.apply` operating on the grouping column (which
    pandas has deprecated) and tolerates labels that have no rows in
    `split_name`.

    Args:
        frame (pd.DataFrame): Data with "labels" and "split" columns.
        split_name (str): Value of the "split" column to select.
        ratio (float): Fraction of each label's rows to keep.

    Returns:
        pd.DataFrame: The selected rows, still carrying their original index.
    """
    subset = frame[frame["split"] == split_name]
    kept = [
        rows.iloc[: int(len(rows) * ratio)]
        for _, rows in subset.groupby("labels")
    ]
    # pd.concat rejects an empty list; return an empty frame with the same columns.
    return pd.concat(kept) if kept else subset.iloc[0:0]


filtered_train_data = _head_per_label(dataset, "train", train_ratio)
filtered_validation_data = _head_per_label(dataset, "validation", train_ratio)

filtered_train = filtered_train_data.reset_index(drop=True)
filtered_validation = filtered_validation_data.reset_index(drop=True)


train_dataset = Dataset.from_pandas(filtered_train)
val_dataset = Dataset.from_pandas(filtered_validation)

# Tokenization
def tokenize(examples):
    """Tokenize the "content" field of `examples`.

    Every sequence is padded to, and truncated at, the module-level
    `max_length` using the module-level `tokenizer`.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(examples["content"], **tokenizer_options)

# Read the label <-> id mappings from the JSON file located earlier.
with open(json_file_path, "r") as json_file:
    loaded_data = json.load(json_file)

# Normalise the ids to int on both sides of the mapping.
label2id = {label: int(idx) for label, idx in loaded_data["label2id"].items()}
id2label = {int(idx): label for idx, label in loaded_data["id2label"].items()}


def _encode_label(example):
    """Return the integer id for the string label of one example."""
    return {"labels": label2id[example["labels"]]}


train_dataset = train_dataset.map(_encode_label)
val_dataset = val_dataset.map(_encode_label)


# Columns that survive tokenization; everything else is dropped.
keep_columns = ["labels", "input_ids", "attention_mask"]


def _columns_to_drop(split_dataset):
    """List the columns of `split_dataset` that are not in `keep_columns`."""
    return [col for col in split_dataset.column_names if col not in keep_columns]


tokenized_train_dataset = train_dataset.map(
    tokenize, batched=True, remove_columns=_columns_to_drop(train_dataset)
)
tokenized_val_dataset = val_dataset.map(
    tokenize, batched=True, remove_columns=_columns_to_drop(val_dataset)
)
train_dataset.features.keys()

  filtered_train_data = dataset.groupby('labels', group_keys=False).apply(
  filtered_validation_data = dataset.groupby('labels', group_keys=False).apply(
Map: 100%|██████████| 3000/3000 [00:00<00:00, 8810.34 examples/s]
Map: 100%|██████████| 900/900 [00:00<00:00, 9258.74 examples/s]
Map: 100%|██████████| 3000/3000 [00:08<00:00, 339.08 examples/s]
Map: 100%|██████████| 900/900 [00:02<00:00, 361.36 examples/s]


dict_keys(['celex_id', 'document_type', 'title', 'header', 'recitals', 'main_body', 'eurovoc_concepts', 'split', 'content', 'content_length', 'labels', 'description'])

In [3]:
def get_last_checkpoints(output_dir):
    """Return the path of the checkpoint with the highest step number.

    Only entries of `output_dir` named "checkpoint-<digits>" are considered.
    Other names that merely contain "checkpoint" (for example
    "checkpoints.txt") are ignored instead of breaking the integer
    conversion.

    Args:
        output_dir (str): Directory containing "checkpoint-<step>" entries.

    Returns:
        str: f"{output_dir}/checkpoint-<highest step>".

    Raises:
        ValueError: If `output_dir` contains no "checkpoint-<step>" entry.
    """
    steps = []
    for entry in os.listdir(output_dir):
        prefix, _, suffix = entry.rpartition("-")
        # isascii() guards against digit-like characters that int() rejects.
        if prefix == "checkpoint" and suffix.isascii() and suffix.isdigit():
            steps.append(int(suffix))

    if not steps:
        raise ValueError(f"No 'checkpoint-<step>' entries found in {output_dir}")

    return f"{output_dir}/checkpoint-{max(steps)}"

def get_all_checkpoints(output_dir):
    """Return the paths of all checkpoint directories in `output_dir`.

    An entry qualifies when its name contains "checkpoint" and it is a
    directory, so stray files such as notes or logs are skipped. The result
    is ordered by the numeric step after the last "-" (ascending), which
    makes the evaluation order reproducible; names without a numeric suffix
    come last, sorted alphabetically.

    Args:
        output_dir (str): Directory to scan.

    Returns:
        list[str]: Full paths of the checkpoint directories (possibly empty).
    """

    def _order(name):
        # Sort "checkpoint-20" before "checkpoint-100" by comparing steps as ints.
        suffix = name.rsplit("-", 1)[-1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), name)
        return (1, 0, name)

    names = [
        name
        for name in os.listdir(output_dir)
        if "checkpoint" in name and os.path.isdir(os.path.join(output_dir, name))
    ]
    return [os.path.join(output_dir, name) for name in sorted(names, key=_order)]

In [4]:
import os
import time
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tensorboard.backend.event_processing import event_accumulator
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import math 
# Per-model cost figures ("train" / "inference") used by evaluate_multiple,
# which multiplies them by the measured wall-clock hours.
GPU_HOURS_DICT = {
    "Llama-3.2-1B-Instruct": dict(train=27.360, inference=25.782),
    "gemma-2-2b-it": dict(train=51.496, inference=32.664),
    "Llama-3.2-3B-Instruct": dict(train=65.522, inference=39.556),
    "ModernBERT-base": dict(train=27.006, inference=1.528),
}

# Weights of the training (ALPHA) and inference (BETA) terms in the
# composite efficiency metrics.
ALPHA, BETA = 0.5, 0.5

def extract_training_time(log_dir, scalar_name="train/loss"):
    """
    Extract training duration from TensorBoard event files.

    The duration is the wall-clock span between the first and the last
    recorded value of `scalar_name` in one event file. When several event
    files exist under `log_dir`, the one whose path sorts first is used, so
    the choice no longer depends on directory traversal order.

    Args:
        log_dir (str): Directory containing TensorBoard logs (searched recursively)
        scalar_name (str): Name of scalar to track for timing

    Returns:
        tuple: Start time, end time, and total duration in seconds

    Raises:
        FileNotFoundError: If no file with "events" in its name is found.
        ValueError: If `scalar_name` is not logged in the selected file, or
            has no recorded events.
    """
    # Collect every event file under log_dir in a deterministic order.
    event_files = sorted(
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(log_dir)
        for file_name in file_names
        if "events" in file_name
    )

    if not event_files:
        raise FileNotFoundError(f"No TensorBoard event files found in {log_dir}")

    event_file = event_files[0]

    # A size guidance of 0 keeps every scalar event. The default reservoir
    # down-samples long series, which could discard the earliest event and
    # shorten the measured duration.
    ea = event_accumulator.EventAccumulator(
        event_file, size_guidance={event_accumulator.SCALARS: 0}
    )
    ea.Reload()

    # Verify the scalar exists
    available_keys = ea.scalars.Keys()
    if scalar_name not in available_keys:
        raise ValueError(f"Scalar '{scalar_name}' not found. Available keys: {available_keys}")

    # Extract wall times from the scalar events
    scalar_events = ea.Scalars(scalar_name)
    if not scalar_events:
        raise ValueError(f"Scalar '{scalar_name}' has no recorded events in {event_file}")

    start_time = scalar_events[0].wall_time
    end_time = scalar_events[-1].wall_time

    return start_time, end_time, end_time - start_time



def _lookup_gpu_cost(output_dir):
    """Return `(model_name, train_cost, inference_cost)` for `output_dir`.

    The model is the first key of `GPU_HOURS_DICT` that occurs as a substring
    of `output_dir`.

    Raises:
        ValueError: If no key of `GPU_HOURS_DICT` occurs in `output_dir`.
    """
    for name, cost in GPU_HOURS_DICT.items():
        if name in output_dir:
            return name, cost["train"], cost["inference"]
    raise ValueError(f"Model name not found in GPU_HOURS_DICT: {output_dir}")


def _score_per_log_cost(score, cost):
    """Return `score / ln(cost + 1)`, or NaN when the denominator is not positive."""
    denominator = math.log(cost + 1)
    if denominator <= 0:
        return float("nan")
    return score / denominator


def _share(part, total):
    """Return `part / total`, or NaN when `total` is zero."""
    return part / total if total else float("nan")


def evaluate_multiple(output_dir, val_dataset):
    """
    Evaluate model performance and efficiency metrics across multiple checkpoints.

    For every checkpoint returned by `get_all_checkpoints(output_dir)` the
    base model plus PEFT adapter is loaded, each validation sample is
    classified one at a time, per-sample results are written to
    `validation_results_<timestamp>.jsonl` inside the checkpoint directory,
    and a report (accuracy, weighted F1, efficiency metrics) is printed.

    Relies on module-level `tokenizer`, `max_length`, `label2id`, `id2label`,
    `GPU_HOURS_DICT`, `ALPHA` and `BETA`.

    Args:
        output_dir (str): Directory containing model checkpoints
        val_dataset: Validation dataset for evaluation; each item must provide
            'content' (text) and 'labels' (integer class id)

    Raises:
        ValueError: If no key of `GPU_HOURS_DICT` occurs in `output_dir`.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Get all checkpoint paths
    checkpoint_paths = get_all_checkpoints(output_dir)
    if not checkpoint_paths:
        print(f"No checkpoints found in: {output_dir}")
        return

    # Both the cost lookup and the training time depend only on output_dir,
    # so resolve them once, before any expensive inference, and fail fast.
    model_name, gpu_ram_train, gpu_ram_inference = _lookup_gpu_cost(output_dir)
    _, _, training_duration = extract_training_time(output_dir)
    training_time_hours = round(training_duration / 3600, 4)
    training_gpu_hours_ram = gpu_ram_train * training_time_hours

    for checkpoint_path in checkpoint_paths:
        config = PeftConfig.from_pretrained(checkpoint_path)
        model = AutoModelForSequenceClassification.from_pretrained(
            config.base_model_name_or_path,
            label2id=label2id,
            id2label=id2label,
        ).to(device)
        model = PeftModel.from_pretrained(model, checkpoint_path)
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.use_cache = False
        model.config.pretraining_tp = 1
        # NOTE(review): gradient checkpointing is a training-time memory
        # optimisation; it is presumably unnecessary under eval()/no_grad.
        # Kept to preserve the original model setup - confirm before removing.
        model.gradient_checkpointing_enable()
        model.eval()

        # Initialize data structures for evaluation
        validation_results = []
        true_labels = []
        predicted_labels = []

        # Start inference timing (tokenization is included in the measurement)
        inference_start = time.time()

        # Process validation data
        for input_sample in tqdm(val_dataset, desc="Processing validation data", unit="sample"):
            sample = input_sample['content']
            true_label_idx = int(input_sample['labels'])

            # Tokenize and predict
            tokenized_input = tokenizer(
                sample,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                model_output = model(**tokenized_input)

            # Process prediction results
            logits = model_output.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class_idx = torch.argmax(probabilities, dim=-1).item()

            # Convert indices to labels
            predicted_label = id2label[predicted_class_idx]
            true_label = id2label[true_label_idx]

            # Create one-hot encoding of true label
            true_label_one_hot = np.zeros(probabilities.size(-1))
            true_label_one_hot[true_label_idx] = 1

            # Store results
            true_labels.append(true_label)
            predicted_labels.append(predicted_label)

            # Store detailed results for each sample
            validation_results.append({
                'content': sample,
                'true_label': true_label,
                'predicted_label': predicted_label,
                'true_label_one_hot': true_label_one_hot.tolist(),
                'predicted_class_idx': predicted_class_idx,
                'probabilities': probabilities.detach().cpu().numpy().tolist()
            })

        # Calculate inference time
        inference_time = time.time() - inference_start
        inference_time_hours = round(inference_time / 3600, 4)

        # Save validation results
        timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
        df_validation_results = pd.DataFrame(validation_results)
        jsonl_file_path = os.path.join(checkpoint_path, f'validation_results_{timestamp}.jsonl')
        df_validation_results.to_json(jsonl_file_path, orient='records', lines=True)

        # Calculate performance metrics
        accuracy = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='weighted')

        # Calculate efficiency metrics
        inference_gpu_hours_ram = gpu_ram_inference * inference_time_hours
        total_gpu_hours_ram = training_gpu_hours_ram + inference_gpu_hours_ram

        # Calculate composite metrics: F1 divided by ln(weighted cost + 1).
        # The helpers return NaN instead of dividing by zero when the rounded
        # times (and hence the costs) are 0.
        resource_m = _score_per_log_cost(
            f1, ALPHA * training_gpu_hours_ram + BETA * inference_gpu_hours_ram
        )
        time_m = _score_per_log_cost(
            f1, ALPHA * training_time_hours + BETA * inference_time_hours
        )

        # Calculate time and resource ratios
        ratio_time = _share(inference_time_hours, inference_time_hours + training_time_hours)
        ratio_time_ram = _share(inference_gpu_hours_ram, total_gpu_hours_ram)

        # Print evaluation results
        print("\n" + "="*50)
        print(f"Evaluation Results for {model_name}")
        print("="*50)
        print("Performance Metrics:")
        print(f"  Accuracy:    {accuracy:.4f}")
        print(f"  F1 Score:    {f1:.4f}")

        print("\nEfficiency Metrics:")
        print(f"  Resource_M:  {resource_m:.4f}")
        print(f"  Time_M:      {time_m:.4f}")

        print("\nTime Distribution:")
        print(f"  Inference/Total Time Ratio: {ratio_time:.4f}")
        print(f"  Inference/Total RAM Ratio:  {ratio_time_ram:.4f}")

        print("\nGPU Resource Usage (GPU hours × RAM):")
        print(f"  Total:       {total_gpu_hours_ram:.4f}")
        print(f"  Training:    {training_gpu_hours_ram:.4f}")
        print(f"  Inference:   {inference_gpu_hours_ram:.4f}")
        print("="*50 + "\n")

        # Release this checkpoint's model before the next one is loaded so
        # that memory use does not grow with the number of checkpoints.
        del model
        if device == "cuda":
            torch.cuda.empty_cache()

# Run the evaluation over every checkpoint stored under `output_dir`,
# announcing the target directory first.
print(f"Evaluating models in: {output_dir}")
evaluate_multiple(output_dir, val_dataset)


Evaluating models in: /home/snt/projects_lujun/agentCLS/assets/logs/prompt_tuning/EURLEX57K_split_equal_train_1000_val_300_0.005_Llama-3.2-1B-Instruct_output_03_17_17_53_45_PROMPT_TUNING_128


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing validation data:   0%|          | 0/900 [00:00<?, ?sample/s]LlamaForSequenceClassification will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`
Processing validation data:  13%|█▎        | 117/900 [01:33<10:26,  1.25sample/s]


KeyboardInterrupt: 