In [2]:
import pandas as pd

# Directory holding the prompt-testing result files, and the specific
# JSON-lines file analysed in this notebook.
folder_path = "/home/snt/projects_lujun/agentCLS/assets/prompt_testing/"
results_file = "EURLEX57K_split_proportional_train_1500_val_300_03_17_13_32_is_base_Llama-3.2-1B-Instruct.jsonl"

# Kept as a plain string: compute_metrics searches it for a model name
# using substring matching.
results_path = "".join((folder_path, results_file))

# One JSON object per line -> one DataFrame row.
df = pd.read_json(path_or_buf=results_path, lines=True)

In [24]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import math

# Module-level encoder instance. compute_metrics below creates its own local
# LabelEncoder under the same name, so that function does not use this object.
label_encoder = LabelEncoder()
# Per-model cost factors, keyed by the model name as it appears in result
# file names. compute_metrics multiplies the "inference" factor by the
# inference time in hours.
# NOTE(review): the values are presumably GPU memory figures (the consumer
# names them gpu_ram_*) -- confirm the unit with the author.
gpu_per_hours_dict = {
    model: {"train": train_factor, "inference": inference_factor}
    for model, train_factor, inference_factor in (
        ("Llama-3.2-1B-Instruct", 27.360, 25.782),
        ("gemma-2-2b-it", 51.496, 32.664),
        ("Llama-3.2-3B-Instruct", 65.522, 39.556),
        ("ModernBERT-base", 27.006, 1.528),
    )
}

def compute_metrics(df, results_path, alpha = 0.5, beta = 0.5, training_time_hours = 0.0):
    """Print and return quality and cost-efficiency metrics for one results file.

    Args:
        df: DataFrame with the columns 'Ground_Truth', 'LLM_Prediciton'
            (spelled as in the data) and 'Time_Passed'. 'Time_Passed' is
            summed and divided by 3600, i.e. it is treated as seconds.
        results_path: Path of the results file. It must contain one of the
            keys of ``gpu_per_hours_dict``; that key selects the per-model
            cost factors.
        alpha: Weight of the training cost inside the log denominators.
        beta: Weight of the inference cost inside the log denominators.
        training_time_hours: Training time in hours. Defaults to 0, which
            reproduces the previous hard-coded "no training" behaviour.

    Returns:
        dict with the keys 'accuracy', 'f1', 'Resource_M', 'Time_M',
        'ratio_time', 'ratio_time_ram', 'total_gpu_hours_ram',
        'training_gpu_hours_ram' and 'inference_gpu_hours_ram'. The same
        values are also printed. Ratios and *_M scores are NaN when their
        denominator is not positive.

    Raises:
        ValueError: If no known model name occurs in ``results_path``.
    """
    # Identify the model from the file path. When several known names occur
    # in the path, the last one in dict order wins (same as before).
    path_text = str(results_path)
    matching_models = [name for name in gpu_per_hours_dict if name in path_text]
    if not matching_models:
        # Report the offending path; the old message could only ever print "None".
        raise ValueError(
            f"Unknown model name: none of {list(gpu_per_hours_dict)} found in {path_text!r}"
        )
    model_name = matching_models[-1]

    # NOTE(review): these factors are presumably GPU RAM figures -- confirm the unit.
    gpu_ram_train = gpu_per_hours_dict[model_name]["train"]
    gpu_ram_inference = gpu_per_hours_dict[model_name]["inference"]

    # Encode both columns with one encoder so y_true and y_pred share the
    # same class numbering. The encoder is fitted on the union of both
    # columns: fitting on the ground truth alone makes transform() raise as
    # soon as the model predicts a label that never occurs in the ground truth.
    # Casting to str keeps missing/non-string predictions sortable.
    truth_labels = df['Ground_Truth'].astype(str)
    predicted_labels = df['LLM_Prediciton'].astype(str)
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([truth_labels, predicted_labels], ignore_index=True))
    y_true = label_encoder.transform(truth_labels)
    y_pred = label_encoder.transform(predicted_labels)

    # Evaluation metrics.
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Time: convert the summed seconds to hours, rounded to four decimals
    # (the rounded value is what all later formulas use, as before).
    inference_time_hours = round(df['Time_Passed'].sum() / 3600, 4)
    total_time_hours = inference_time_hours + training_time_hours

    # Cost: time in hours multiplied by the per-model factor.
    training_gpu_hours_ram = gpu_ram_train * training_time_hours
    inference_gpu_hours_ram = gpu_ram_inference * inference_time_hours
    total_gpu_hours_ram = inference_gpu_hours_ram + training_gpu_hours_ram

    # Resource_M and Time_M: F1 divided by log(1 + weighted cost). The guard
    # checks the weighted cost itself, so a zero weight (e.g. beta=0) yields
    # NaN instead of dividing by log(1) == 0.
    weighted_resource_cost = alpha * training_gpu_hours_ram + beta * inference_gpu_hours_ram
    weighted_time_cost = alpha * training_time_hours + beta * inference_time_hours
    Resource_M = f1 / math.log(weighted_resource_cost + 1) if weighted_resource_cost > 0 else np.nan
    Time_M = f1 / math.log(weighted_time_cost + 1) if weighted_time_cost > 0 else np.nan

    # Share of inference in the total time / total cost.
    ratio_time = inference_time_hours / total_time_hours if total_time_hours > 0 else np.nan
    ratio_time_ram = inference_gpu_hours_ram / total_gpu_hours_ram if total_gpu_hours_ram > 0 else np.nan

    # Print the report.
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Resource_M: {Resource_M:.4f}")
    print(f"Time_M: {Time_M:.4f}")
    print(f"ratio_time: {ratio_time:.4f}")
    print(f"ratio_time_ram: {ratio_time_ram:.4f}")
    print(f"total GPU hours (RAM): {total_gpu_hours_ram:.4f}")
    print(f"training GPU hours (RAM): {training_gpu_hours_ram:.4f}")
    print(f"inference GPU hours (RAM): {inference_gpu_hours_ram:.4f}")

    # Also hand the numbers back so callers can tabulate several runs
    # instead of copying them out of the printed text.
    return {
        'accuracy': accuracy,
        'f1': f1,
        'Resource_M': Resource_M,
        'Time_M': Time_M,
        'ratio_time': ratio_time,
        'ratio_time_ram': ratio_time_ram,
        'total_gpu_hours_ram': total_gpu_hours_ram,
        'training_gpu_hours_ram': training_gpu_hours_ram,
        'inference_gpu_hours_ram': inference_gpu_hours_ram,
    }

# Report the metrics for the loaded results file. The trailing semicolon
# keeps the cell output limited to the printed report.
compute_metrics(df=df, results_path=results_path);

Accuracy: 0.3278
F1 Score: 0.2669
Resource_M: 1.0163
Time_M: 23.0453
ratio_time: 1.0000
ratio_time_ram: 1.0000
total GPU hours (RAM): 0.6007
training GPU hours (RAM): 0.0000
inference GPU hours (RAM): 0.6007


In [6]:
# Display the results DataFrame loaded from results_path.
df

Unnamed: 0,LLM_Input,LLM_Output,LLM_Prediciton,Ground_Truth,Iscorrect,Time_Passed
0,"[{'role': 'system', 'content': 'You are a help...",**Decision**\nClassification: Decision,Decision,Decision,True,0.092182
1,"[{'role': 'system', 'content': 'You are a help...",Decision \nClassification: Decision,Decision,Directive,False,0.066667
2,"[{'role': 'system', 'content': 'You are a help...",Classification: Directive,Directive,Regulation,False,0.054889
3,"[{'role': 'system', 'content': 'You are a help...",## Decision\n### Directive \n### Regulation,Decision,Regulation,False,0.079577
4,"[{'role': 'system', 'content': 'You are a help...",Decision \nDirective \nRegulation,Decision,Directive,False,0.113159
...,...,...,...,...,...,...
895,"[{'role': 'system', 'content': 'You are a help...",Decision\nClassification: Directive,Decision,Regulation,False,0.099411
896,"[{'role': 'system', 'content': 'You are a help...",Decision \nThe input text is a directive.,Decision,Directive,False,0.139500
897,"[{'role': 'system', 'content': 'You are a help...",**Decision**\n**Regulation**,Decision,Decision,True,0.077681
898,"[{'role': 'system', 'content': 'You are a help...",Decision \n1. **Decision** - Choose this categ...,Decision,Decision,True,0.357093


In [7]:
# List the column names of the results DataFrame.
df.columns

Index(['LLM_Input', 'LLM_Output', 'LLM_Prediciton', 'Ground_Truth',
       'Iscorrect', 'Time_Passed'],
      dtype='object')