In [1]:
from datasets.arrow_dataset import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, BitsAndBytesConfig
import torch
import os
import numpy as np
from datetime import datetime
import sys
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from tqdm import tqdm
import time
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import EarlyStoppingCallback
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# 
# ========================== CMD Argument Parser ==========================
# def parse_args():
#     parser = argparse.ArgumentParser(description="Train a model using CPT (Continual Pretraining Training)")
#     parser.add_argument("--per_device_train_batch_size", type=int, default=8, help="Batch size per device during training")
#     parser.add_argument("--per_device_eval_batch_size", type=int, default=8, help="Batch size per device during evaluation")
#     parser.add_argument("--num_train_epochs", type=int, default=1, help="Number of training epochs")
#     parser.add_argument("--learning_rate", type=float, default=1e-6, help="Learning rate for training")
#     parser.add_argument("--project_root", type=str, default="/Users/lujun.li/projects/mt_luxembourgish", help="Path to project root")
#     parser.add_argument("--training_dataset_path", type=str, default="data/processed/dataset_merged_llama_fake_targets.jsonl", help="Path to training dataset")
#     parser.add_argument("--model_path", type=str, default="/home/llama/Personal_Directories/srb/binary_classfication/Llama-3.2-3B-Instruct", help="Path to model")
#     parser.add_argument("--resume_from_checkpoint", type=bool, default=False, help="Resume training from checkpoint")
#     parser.add_argument("--resume_checkpoint_path", type=str, default=None, help="Path to checkpoint to resume training from")
#     parser.add_argument("--qlora", type=bool, default=False, help="Use QLoRA")
#     parser.add_argument("--r", type=int, default=16, help="Rank for LoRA")
#     return parser.parse_args()

# args = parse_args()


# print("Arguments passed:")
# print(f"Train Batch Size: {args.per_device_train_batch_size}")
# print(f"Eval Batch Size: {args.per_device_eval_batch_size}")
# print(f"Number of Epochs: {args.num_train_epochs}")
# print(f"Learning Rate: {args.learning_rate}")
# print(f"Project Root: {args.project_root}")
# print(f"Training Dataset Path: {args.training_dataset_path}")
# print(f"Model path: {args.model_path}")
# print(f"Resume from checkpoint: {args.resume_from_checkpoint}")
# print(f"Resume checkpoint path: {args.resume_checkpoint_path}")
# print(f"Qlora: {args.qlora}")
# print(f"Rank: {args.r}")

# per_device_train_batch_size = args.per_device_train_batch_size  # Batch size for training per device
# per_device_eval_batch_size = args.per_device_eval_batch_size  # Batch size for evaluation per device
# num_train_epochs = args.num_train_epochs  # Number of epochs for training
# learning_rate = args.learning_rate # Learning rate for the optimizer
# project_root = args.project_root
# training_dataset_path = args.training_dataset_path
# model_path = args.model_path
# resume_from_checkpoint = args.resume_from_checkpoint
# resume_checkpoint_path = args.resume_checkpoint_path
# qlora = args.qlora
# r = args.r


## Data preparation
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
num_train_epochs = 10
learning_rate = 1e-6
project_root = "/home/snt/projects_lujun/agentCLS"
training_dataset_path = "assets/training_dataset/EURLEX57K_split_proportional_train_1500_val_300.jsonl"
model_path = "/home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct"
resume_from_checkpoint = False
resume_checkpoint_path = None
qlora = False
r = 16

train_dataset_path = os.path.abspath(os.path.join(project_root, training_dataset_path))
sys.path.append(project_root)


# Default Parameters
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

train_seed = 3407
train_ratio = 0.1
logging_steps = 10
eval_steps = 100
eval_strategy = "epoch"
save_strategy = "epoch"
save_total_limit = 2
logging_strategy = "steps"
max_grad_norm = 0.3
input_dataset_name = train_dataset_path.split("/")[-1].split(".")[0]
model_name = model_path.split("/")[-1]
max_length = 4096
load_in_4bit = True
bnb_4bit_quant_type = 'nf4'
quantization_config = None


if qlora:
# Quantization with Lora
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = load_in_4bit, # enable 4-bit quantization
        bnb_4bit_quant_type = bnb_4bit_quant_type, # information theoretically optimal dtype for normally distributed weights
        bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
        bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
    )

    # Lora
    lora_config = LoraConfig(
        r = r, # the dimension of the low-rank matrices
        lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
        target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
        lora_dropout = 0.05, # dropout probability of the LoRA layers
        bias = 'none', # wether to train bias weights, set to 'none' for attention layers
        task_type = 'SEQ_CLS',
    )


if resume_from_checkpoint and resume_checkpoint_path is None:
    raise ValueError("Please provide a checkpoint path to resume training from")

if resume_from_checkpoint:
    output_dir = resume_checkpoint_path
else:
    current_time = datetime.now().strftime("%m_%d_%H_%M_%S")
    output_dir = f"{project_root}/assets/logs/SFT/{input_dataset_name}_{train_ratio}_{model_name}_output_{current_time}"

dataset = pd.read_json(train_dataset_path, lines=True)
dataset.rename(columns={"cls_label": "labels"}, inplace=True)

# Compute sample counts for each group based on 'labels' and 'split'
sample_counts = dataset.groupby(['labels', 'split']).size() * train_ratio

filtered_train_data = dataset.groupby('labels', group_keys=False).apply(
    lambda x: x[x['split'] == 'train'].iloc[:int(sample_counts.loc[x.name, 'train'])]
)

filtered_validation_data = dataset.groupby('labels', group_keys=False).apply(
    lambda x: x[x['split'] == 'validation'].iloc[:int(sample_counts.loc[x.name, 'validation'])]
)

filtered_train = filtered_train_data.reset_index(drop=True)
filtered_validation = filtered_validation_data.reset_index(drop=True)


train_dataset = Dataset.from_pandas(filtered_train)
val_dataset = Dataset.from_pandas(filtered_validation)


# Tokenization
def tokenize(examples):
    return tokenizer(examples["content"], padding="max_length", truncation=True, max_length=max_length)

labels =  set(train_dataset['labels'])
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label


train_dataset = train_dataset.map( lambda x: {"labels":label2id[x["labels"]]} )
val_dataset = val_dataset.map( lambda x: {"labels": label2id[x["labels"]]})

keep_columns = ["labels", "input_ids", "attention_mask"]
tokenized_train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=[col for col in train_dataset.column_names if col not in keep_columns])
tokenized_val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=[col for col in val_dataset.column_names if col not in keep_columns])
train_dataset.features.keys()

  from .autonotebook import tqdm as notebook_tqdm
  filtered_train_data = dataset.groupby('labels', group_keys=False).apply(
  filtered_validation_data = dataset.groupby('labels', group_keys=False).apply(


In [2]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, label2id=label2id, id2label=id2label, quantization_config=quantization_config,)

if qlora:
    model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
train_dataset.features.keys()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    acc = accuracy_score(labels, np.argmax(predictions, axis=-1))
    f1 = f1_score(labels, np.argmax(predictions, axis=-1), average="weighted")
    return {"accuracy": acc, "f1": f1}

def train():
    # Define training args
    training_args = TrainingArguments(
        output_dir= output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        bf16=True,
        optim="adamw_torch_fused", 
        logging_strategy=logging_strategy,
        logging_steps=logging_steps,
        eval_strategy=eval_strategy,
        eval_steps=eval_steps,
        save_strategy=save_strategy,
        save_total_limit=save_total_limit,
        load_best_model_at_end=True,
        max_grad_norm=max_grad_norm,
        # group_by_length=True,
        # use_mps_device=True,
        metric_for_best_model="eval_loss",
        # push to hub parameters
        # push_to_hub=True,
        # hub_strategy="every_save",
        # hub_token=HfFolder.get_token(),
        report_to="tensorboard",
        disable_tqdm=False,
        seed = train_seed,
    )
    
    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    trainer_stats = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    print("Finished training SFT.")
    return trainer_stats


def evaluate():
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    def get_last_checkpoints(output_dir):
        checkpoints = os.listdir(output_dir)
        checkpoints = [c for c in checkpoints if "checkpoint" in c]
        checkpoints = [int(c.split("-")[-1]) for c in checkpoints]
        last_checkpoint = max(checkpoints)
        return f"{output_dir}/checkpoint-{last_checkpoint}"
    
    checkpoints_path  = get_last_checkpoints(output_dir)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoints_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path) 
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    validation_results = []

    # Initialize lists to store true and predicted labels
    true_label_one_hot_list = []
    true_labels = []
    predicted_labels = []
    all_probs = []

    # Start time for measuring inference efficiency
    start_time = time.time()

    # Iterate through validation dataset and make predictions
    for input in tqdm(val_dataset, desc="Processing validation data", unit="sample"):
        sample = input['content']
        true_label_idx = int(input['labels'])
        tokenized_input = tokenizer(sample, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt").to(device)
        model_output = model(**tokenized_input)
        logits = model_output.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
        predicted_label = id2label[str(predicted_class_idx)]
        
        true_label = id2label[str(true_label_idx)]
        true_label_one_hot = np.zeros(probabilities.size(-1))
        true_label_one_hot[true_label_idx] = 1

        true_labels.append(true_label)
        true_label_one_hot_list.append(true_label_one_hot)
        predicted_labels.append(predicted_label)
        all_probs.append(probabilities.detach().cpu().numpy())  # Store the raw probabilities for AUC
        result = {
            'content': sample,
            'true_label': true_label,
            'predicted_label': predicted_label,
            'true_label_one_hot': true_label_one_hot.tolist(),
            'predicted_class_idx': predicted_class_idx,
            'probabilities': probabilities.detach().cpu().numpy().tolist()  # Convert to list for JSON serialization
        }
        validation_results.append(result)

    timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
    df_validation_results = pd.DataFrame(validation_results)
    jsonl_file_path = os.path.join(checkpoints_path, f'validation_results_{timestamp}.jsonl')
    df_validation_results.to_json(jsonl_file_path, orient='records', lines=True)


    # Calculate accuracy, F1 score, and AUC
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='weighted')  # weighted F1 score
    auc = roc_auc_score(np.array(true_label_one_hot_list),  np.squeeze(np.array(all_probs), axis=1), multi_class='ovr', average='weighted')  # for multi-class AUC

    # Calculate inference time (average time per sample)
    end_time = time.time()
    inference_time = (end_time - start_time) / len(train_dataset)

    # Print the results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Average Inference Time per Sample: {inference_time:.4f} seconds")
    return 
        



# python llama3_FT.py \
# --per_device_train_batch_size 8 \
# --per_device_eval_batch_size 8 \
# --num_train_epochs 10 \
# --learning_rate 1e-6 \
# --project_root /home/llama/Personal_Directories/srb/agentCLS \
# --training_dataset_path assets/training_dataset/LDD_split_equal_train_1000_val_300.jsonl \
# --model_path /home/llama/Personal_Directories/srb/binary_classfication/Llama-3.2-3B-Instruct \
# --resume_from_checkpoint "False" \
# --resume_checkpoint_path "" \
# --qlora False \
# --r 16

Map:   0%|          | 0/303 [00:00<?, ? examples/s]

Map: 100%|██████████| 303/303 [00:00<00:00, 7546.85 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 6727.75 examples/s]
Map: 100%|██████████| 303/303 [00:00<00:00, 373.14 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 344.35 examples/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer_stats = None

def main():
    trainer_stats = train()
    return trainer_stats

if __name__ == "__main__":
    trainer_stats = main()
    eval_results = evaluate()

    print("Finished training and evaluation.")

## EVALUATION

In [1]:
from datasets.arrow_dataset import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import sys
import time
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

os.environ["TOKENIZERS_PARALLELISM"] = "false"


project_root = "/home/snt/projects_lujun/agentCLS"
model_path = "/home/snt/llm_models/ModernBERT-base"
# model_path = "/home/snt/projects_lujun/base_models/Llama-3.2-1B-Instruct"
# training_dataset_path = "/home/snt/projects_lujun/agentCLS/assets/training_dataset/EURLEX57K_split_proportional_train_1500_val_300.jsonl"
training_dataset_path = "/home/snt/projects_lujun/agentCLS/assets/training_dataset/LDD_split_proportional_train_1500_val_300.jsonl"

output_dir = "/home/snt/projects_lujun/agentCLS/assets/logs/SFT/LDD_split_equal_train_1000_val_300_1.0_ModernBERT-base_output_03_14_23_08_21"
max_length = 4096

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from collections import Counter, defaultdict
from tqdm import tqdm
import random

def find_right_label2id(model, tokenizer, train_dataset, max_length=512, device='cuda'):
    train_ratio = 0.01
    description_counts = defaultdict(int)
    for sample in train_dataset:
        description_counts[sample['labels']] += 1
    
    label2id = {}
    id2label = {}
    model.to(device)
    for description, count in description_counts.items():
        output_ids = []
        
        subset = [sample for sample in train_dataset if sample['labels'] == description]
        sampled_subset = random.sample(subset, max(1, int(len(subset) * train_ratio)))
        
        for sample in tqdm(sampled_subset, desc=f"Processing {description}", unit="sample"):
            tokenized_input = tokenizer(sample['content'], padding="max_length", max_length = max_length, truncation=True, return_tensors="pt").to(device)
            with torch.no_grad():
                model_output = model(**tokenized_input)
            
            logits = model_output.logits
            predicted_class_idx = torch.argmax(torch.nn.functional.softmax(logits, dim=-1), dim=-1).item()
            output_ids.append(predicted_class_idx)
        
        most_common_id = Counter(output_ids).most_common(1)[0][0]
        label2id[description] = most_common_id
        id2label[most_common_id] = description
    
    return label2id, id2label

def get_last_checkpoints(output_dir):
    checkpoints = os.listdir(output_dir)
    checkpoints = [c for c in checkpoints if "checkpoint" in c]
    checkpoints = [int(c.split("-")[-1]) for c in checkpoints]
    last_checkpoint = max(checkpoints)
    return f"{output_dir}/checkpoint-{last_checkpoint}"

def get_all_checkpoints(output_dir):
    checkpoints = os.listdir(output_dir)
    checkpoints = [c for c in checkpoints if c.startswith("checkpoint-") and c.split("-")[-1].isdigit()]
    checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))
    return [os.path.join(output_dir, c) for c in checkpoints]



In [4]:
checkpoint_path = get_all_checkpoints(output_dir)[0]
train_dataset_path = os.path.abspath(os.path.join(project_root, training_dataset_path))
sys.path.append(project_root)
tokenizer = AutoTokenizer.from_pretrained(model_path)
if "BERT" not in model_path:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

dataset = pd.read_json(train_dataset_path, lines=True)
dataset.rename(columns={"cls_label": "labels"}, inplace=True)
train_ratio = 1.0
# Compute sample counts for each group based on 'labels' and 'split'
sample_counts = dataset.groupby(['labels', 'split']).size() 
filtered_train_data = dataset.groupby('labels', group_keys=False).apply(lambda x: x[x['split'] == 'train'].iloc[:int(sample_counts.loc[x.name, 'train'])])
filtered_validation_data = dataset.groupby('labels', group_keys=False).apply(lambda x: x[x['split'] == 'validation'].iloc[:int(sample_counts.loc[x.name, 'validation'])])
filtered_train = filtered_train_data.reset_index(drop=True)
filtered_validation = filtered_validation_data.reset_index(drop=True)
train_dataset = Dataset.from_pandas(filtered_train)
val_dataset = Dataset.from_pandas(filtered_validation)
input_dataset_name = train_dataset_path.split("/")[-1].split(".")[0]
model_name = model_path.split("/")[-1]

label2id_n, id2label_n = find_right_label2id(model, tokenizer, train_dataset, max_length=4096, device='cuda')
num_labels = len(label2id_n)


  filtered_train_data = dataset.groupby('labels', group_keys=False).apply(lambda x: x[x['split'] == 'train'].iloc[:int(sample_counts.loc[x.name, 'train'])])
  filtered_validation_data = dataset.groupby('labels', group_keys=False).apply(lambda x: x[x['split'] == 'validation'].iloc[:int(sample_counts.loc[x.name, 'validation'])])
Processing cs.AI: 100%|██████████| 14/14 [00:07<00:00,  1.93sample/s]
Processing cs.CE: 100%|██████████| 12/12 [00:03<00:00,  3.71sample/s]
Processing cs.CV: 100%|██████████| 13/13 [00:03<00:00,  3.75sample/s]
Processing cs.DS: 100%|██████████| 15/15 [00:04<00:00,  3.14sample/s]
Processing cs.IT: 100%|██████████| 15/15 [00:04<00:00,  3.53sample/s]
Processing cs.NE: 100%|██████████| 13/13 [00:03<00:00,  3.53sample/s]
Processing cs.PL: 100%|██████████| 13/13 [00:03<00:00,  3.58sample/s]
Processing cs.SY: 100%|██████████| 15/15 [00:04<00:00,  3.64sample/s]
Processing math.AC: 100%|██████████| 14/14 [00:04<00:00,  3.43sample/s]
Processing math.GR: 100%|██████████| 15

In [20]:
train_dataset = train_dataset.map( lambda x: {"labels":label2id_n[x["labels"]]} )
val_dataset = val_dataset.map( lambda x: {"labels": label2id_n[x["labels"]]})

Map:   0%|          | 0/15682 [00:00<?, ? examples/s]

Map: 100%|██████████| 15682/15682 [00:03<00:00, 5161.85 examples/s]
Map: 100%|██████████| 3300/3300 [00:00<00:00, 3546.23 examples/s]


In [None]:
from tensorboard.backend.event_processing import event_accumulator
import os
import math 

max_length = 4096
gpu_per_hours_dict = {
    "Llama-3.2-1B-Instruct":{
        "train": 27.360,
        "inference": 25.782
    },
    "gemma-2-2b-it":{
        "train": 51.496,
        "inference": 32.664
    },
    "Llama-3.2-3B-Instruct":{
        "train": 65.522,
        "inference": 39.556
    },
    "ModernBERT-base":{
        "train": 27.006,
        "inference": 1.528
    }
}

alpha = 0.5
beta = 0.5

def extract_training_time(log_dir, scalar_name="train/loss"):

    event_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(log_dir)
        for file in files
        if "events" in file
    ]

    if not event_files:
        raise FileNotFoundError("Directory does not contain any event files.")
    
    event_file = event_files[0] 

    ea = event_accumulator.EventAccumulator(event_file)
    ea.Reload() 

    available_keys = ea.scalars.Keys()
    if scalar_name not in available_keys:
        raise ValueError(f"Scalar '{scalar_name}' doesn't exist: {available_keys}")
    

    wall_times = ea.Scalars(scalar_name)
    start_time = wall_times[0].wall_time
    end_time = wall_times[-1].wall_time 

    training_duration = end_time - start_time
    return start_time, end_time, training_duration

def evaluate_multiple(output_dir, val_dataset):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    checkpoints_paths= get_all_checkpoints(output_dir)
    for checkpoints_path in checkpoints_paths:
        model = AutoModelForSequenceClassification.from_pretrained(checkpoints_path).to(device)
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.use_cache = False
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model.eval()
        validation_results = []

        # Initialize lists to store true and predicted labels
        true_label_one_hot_list = []
        true_labels = []
        predicted_labels = []
        all_probs = []

        # Start time for measuring inference efficiency
        start_time = time.time()

        # Iterate through validation dataset and make predictions
        for input in tqdm(val_dataset, desc="Processing validation data", unit="sample"):
            sample = input['content']
            true_label_idx = int(input['labels'])
            tokenized_input = tokenizer(sample, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt").to(device)
            model_output = model(**tokenized_input)
            logits = model_output.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
            predicted_label = id2label_n[predicted_class_idx]
            
            true_label = id2label_n[true_label_idx]
            true_label_one_hot = np.zeros(probabilities.size(-1))
            true_label_one_hot[true_label_idx] = 1

            true_labels.append(true_label)
            true_label_one_hot_list.append(true_label_one_hot)
            predicted_labels.append(predicted_label)
            all_probs.append(probabilities.detach().cpu().numpy())  # Store the raw probabilities for AUC
            result = {
                'content': sample,
                'true_label': true_label,
                'predicted_label': predicted_label,
                'true_label_one_hot': true_label_one_hot.tolist(),
                'predicted_class_idx': predicted_class_idx,
                'probabilities': probabilities.detach().cpu().numpy().tolist()  # Convert to list for JSON serialization
            }
            validation_results.append(result)

        timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
        df_validation_results = pd.DataFrame(validation_results)
        jsonl_file_path = os.path.join(checkpoints_path, f'validation_results_{timestamp}.jsonl')
        df_validation_results.to_json(jsonl_file_path, orient='records', lines=True)


        # Calculate accuracy, F1 score, and AUC
        accuracy = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='weighted')  # weighted F1 score
        auc = roc_auc_score(np.array(true_label_one_hot_list),  np.squeeze(np.array(all_probs), axis=1), multi_class='ovr', average='weighted')  # for multi-class AUC

        # Calculate inference time (average time per sample)
        end_time = time.time()
        inference_time = (end_time - start_time) 
        inference_time_hours = round(inference_time / 3600, 4)
        start_time, end_time, training_duration = extract_training_time(output_dir)
        training_time_hours = round(training_duration / 3600, 4)

        for k,v in gpu_per_hours_dict.items():
            if k in output_dir:
                gpu_ram_train = v["train"]
                gpu_ram_inference = v["inference"]
                break


        training_gpu_hours_ram = gpu_ram_train * training_time_hours
        inference_gpu_hours_ram = gpu_ram_inference * inference_time_hours
        total_gpu_hours_ram = training_gpu_hours_ram + inference_gpu_hours_ram
        Resource_M = f1 / math.log((alpha * training_gpu_hours_ram + beta * inference_gpu_hours_ram)+1)
        Time_M = f1 / math.log((alpha * training_time_hours + beta * inference_time_hours)+!)
        ratio_time = inference_time_hours/(inference_time_hours+training_time_hours)
        ratio_time_ram = inference_gpu_hours_ram/(inference_gpu_hours_ram+training_gpu_hours_ram)

        # Print the results
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        # print(f"AUC: {auc:.4f}")
        print(f"Resource_M: {Resource_M:.4f}")
        print(f"Time_M: {Time_M:.4f}")
        print(f"""ratio_time: {ratio_time:.4f}""")
        print(f"""ratio_time_ram: {ratio_time_ram:.4f}""")
        print(f"total GPU hours (RAM): {total_gpu_hours_ram:.4f}")
        print(f"training GPU hours (RAM): {training_gpu_hours_ram:.4f}")
        print(f"inference GPU hours (RAM): {inference_gpu_hours_ram:.4f}")


        # print(f"Training started at: {round(start_time / 3600, 2)} hours")
        # print(f"Training ended at: {round(end_time / 3600, 2)} hours")
        # print(f"Training duration: {round(training_duration / 3600, 2)} hours")


    return 

evaluate_multiple(output_dir=output_dir, val_dataset=val_dataset)

Processing validation data: 100%|██████████| 3300/3300 [15:31<00:00,  3.54sample/s]


Accuracy: 0.8197
F1 Score: 0.8181
AUC: 0.9788
Resource_M: 0.0058
Time_M: 0.1525
ratio_time: 0.0242
ratio_time_ram: 0.0014
total GPU hours (RAM): 283.2029
training GPU hours (RAM): 282.8068
inference GPU hours (RAM): 0.3961


Processing validation data: 100%|██████████| 3300/3300 [15:29<00:00,  3.55sample/s]


Accuracy: 0.8170
F1 Score: 0.8144
AUC: 0.9790
Resource_M: 0.0058
Time_M: 0.1518
ratio_time: 0.0241
ratio_time_ram: 0.0014
total GPU hours (RAM): 283.2021
training GPU hours (RAM): 282.8068
inference GPU hours (RAM): 0.3953


In [19]:
import os
import time
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tensorboard.backend.event_processing import event_accumulator
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# GPU resource consumption dictionary for different models
GPU_HOURS_DICT = {
    "Llama-3.2-1B-Instruct": {
        "train": 27.360,
        "inference": 25.782
    },
    "gemma-2-2b-it": {
        "train": 51.496,
        "inference": 32.664
    },
    "Llama-3.2-3B-Instruct": {
        "train": 65.522,
        "inference": 39.556
    },
    "ModernBERT-base": {
        "train": 27.006,
        "inference": 1.528
    }
}

# Weighting parameters for efficiency metrics
ALPHA = 0.5  # Training weight
BETA = 0.5   # Inference weight

def extract_training_time(log_dir, scalar_name="train/loss"):
    """
    Extract training duration from TensorBoard event files.
    
    Args:
        log_dir (str): Directory containing TensorBoard logs
        scalar_name (str): Name of scalar to track for timing
        
    Returns:
        tuple: Start time, end time, and total duration in seconds
    """
    # Find all event files in the directory
    event_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(log_dir)
        for file in files
        if "events" in file
    ]

    if not event_files:
        raise FileNotFoundError(f"No TensorBoard event files found in {log_dir}")
    
    # Use the first event file found
    event_file = event_files[0] 
    
    # Load the event file
    ea = event_accumulator.EventAccumulator(event_file)
    ea.Reload()
    
    # Verify the scalar exists
    available_keys = ea.scalars.Keys()
    if scalar_name not in available_keys:
        raise ValueError(f"Scalar '{scalar_name}' not found. Available keys: {available_keys}")
    
    # Extract wall times from the scalar events
    wall_times = ea.Scalars(scalar_name)
    start_time = wall_times[0].wall_time
    end_time = wall_times[-1].wall_time
    
    return start_time, end_time, end_time - start_time



def evaluate_multiple(output_dir, val_dataset):
    """
    Evaluate model performance and efficiency metrics across multiple checkpoints.
    
    Args:
        output_dir (str): Directory containing model checkpoints
        val_dataset: Validation dataset for evaluation
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Get all checkpoint paths
    checkpoint_paths = get_all_checkpoints(output_dir)
    
    for checkpoint_path in checkpoint_paths:
        model = AutoModelForSequenceClassification.from_pretrained(checkpoints_path).to(device)
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.use_cache = False
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model.eval()
        
        # Initialize data structures for evaluation
        validation_results = []
        true_label_one_hot_list = []
        true_labels = []
        predicted_labels = []
        all_probs = []
        
        # Start inference timing
        start_time = time.time()
        
        # Process validation data
        for input_sample in tqdm(val_dataset, desc="Processing validation data", unit="sample"):
            sample = input_sample['content']
            true_label_idx = int(input_sample['labels'])
            
            # Tokenize and predict
            tokenized_input = tokenizer(
                sample, 
                padding="max_length", 
                max_length=max_length, 
                truncation=True, 
                return_tensors="pt"
            ).to(device)
            
            with torch.no_grad():
                model_output = model(**tokenized_input)
                
            # Process prediction results
            logits = model_output.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class_idx = torch.argmax(probabilities, dim=-1).item()
            
            # Convert indices to labels
            predicted_label = id2label_n[predicted_class_idx]
            true_label = id2label_n[true_label_idx]
            
            # Create one-hot encoding of true label
            true_label_one_hot = np.zeros(probabilities.size(-1))
            true_label_one_hot[true_label_idx] = 1
            
            # Store results
            true_labels.append(true_label)
            true_label_one_hot_list.append(true_label_one_hot)
            predicted_labels.append(predicted_label)
            all_probs.append(probabilities.detach().cpu().numpy())
            
            # Store detailed results for each sample
            result = {
                'content': sample,
                'true_label': true_label,
                'predicted_label': predicted_label,
                'true_label_one_hot': true_label_one_hot.tolist(),
                'predicted_class_idx': predicted_class_idx,
                'probabilities': probabilities.detach().cpu().numpy().tolist()
            }
            validation_results.append(result)
        
        # Calculate inference time
        end_time = time.time()
        inference_time = end_time - start_time
        inference_time_hours = round(inference_time / 3600, 4)
        
        # Get training time
        start_time, end_time, training_duration = extract_training_time(output_dir)
        training_time_hours = round(training_duration / 3600, 4)
        
        # Save validation results
        timestamp = datetime.now().strftime("%m_%d_%H_%M_%S")
        df_validation_results = pd.DataFrame(validation_results)
        jsonl_file_path = os.path.join(checkpoint_path, f'validation_results_{timestamp}.jsonl')
        df_validation_results.to_json(jsonl_file_path, orient='records', lines=True)
        
        # Calculate performance metrics
        accuracy = accuracy_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels, average='weighted')
        auc = roc_auc_score(
            np.array(true_label_one_hot_list),
            np.squeeze(np.array(all_probs), axis=1),
            multi_class='ovr',
            average='weighted'
        )
        
        # Find GPU resource consumption for the model
        model_name = None
        for name in GPU_HOURS_DICT:
            if name in output_dir:
                model_name = name
                gpu_ram_train = GPU_HOURS_DICT[name]["train"]
                gpu_ram_inference = GPU_HOURS_DICT[name]["inference"]
                break
        
        if not model_name:
            print(f"Warning: Model not found in GPU_HOURS_DICT. Using default values.")
            gpu_ram_train = 30.0
            gpu_ram_inference = 20.0
        
        # Calculate efficiency metrics
        training_gpu_hours_ram = gpu_ram_train * training_time_hours
        inference_gpu_hours_ram = gpu_ram_inference * inference_time_hours
        total_gpu_hours_ram = training_gpu_hours_ram + inference_gpu_hours_ram
        
        # Calculate composite metrics
        resource_m = f1 / (ALPHA * training_gpu_hours_ram + BETA * inference_gpu_hours_ram)
        time_m = f1 / (ALPHA * training_time_hours + BETA * inference_time_hours)
        
        # Calculate time and resource ratios
        ratio_time = inference_time_hours / (inference_time_hours + training_time_hours)
        ratio_time_ram = inference_gpu_hours_ram / (inference_gpu_hours_ram + training_gpu_hours_ram)
        
        # Print evaluation results
        print("\n" + "="*50)
        print(f"Evaluation Results for {model_name or 'Unknown Model'}")
        print("="*50)
        print(f"Performance Metrics:")
        print(f"  Accuracy:    {accuracy:.4f}")
        print(f"  F1 Score:    {f1:.4f}")
        print(f"  AUC:         {auc:.4f}")
        
        print("\nEfficiency Metrics:")
        print(f"  Resource_M:  {resource_m:.4f}")
        print(f"  Time_M:      {time_m:.4f}")
        
        print("\nTime Distribution:")
        print(f"  Inference/Total Time Ratio: {ratio_time:.4f}")
        print(f"  Inference/Total RAM Ratio:  {ratio_time_ram:.4f}")
        
        print("\nGPU Resource Usage (GPU hours × RAM):")
        print(f"  Total:       {total_gpu_hours_ram:.4f}")
        print(f"  Training:    {training_gpu_hours_ram:.4f}")
        print(f"  Inference:   {inference_gpu_hours_ram:.4f}")
        print("="*50 + "\n")

# Execute evaluation
print(f"Evaluating models in: {output_dir}")
evaluate_multiple(output_dir=output_dir, val_dataset=val_dataset)


Evaluating models in: /home/snt/projects_lujun/agentCLS/assets/logs/SFT/LDD_split_proportional_train_1500_val_300_1.0_ModernBERT-base_output_03_15_15_03_05


Processing validation data: 100%|██████████| 330/330 [01:37<00:00,  3.40sample/s]



Evaluation Results for ModernBERT-base
Performance Metrics:
  Accuracy:    0.1182
  F1 Score:    0.1204
  AUC:         0.4936

Efficiency Metrics:
  Resource_M:  0.0011
  Time_M:      0.0296

Time Distribution:
  Inference/Total Time Ratio: 0.0033
  Inference/Total RAM Ratio:  0.0002

GPU Resource Usage (GPU hours × RAM):
  Total:       218.9735
  Training:    218.9322
  Inference:   0.0413



Processing validation data: 100%|██████████| 330/330 [01:36<00:00,  3.41sample/s]



Evaluation Results for ModernBERT-base
Performance Metrics:
  Accuracy:    0.1212
  F1 Score:    0.1230
  AUC:         0.4950

Efficiency Metrics:
  Resource_M:  0.0011
  Time_M:      0.0302

Time Distribution:
  Inference/Total Time Ratio: 0.0033
  Inference/Total RAM Ratio:  0.0002

GPU Resource Usage (GPU hours × RAM):
  Total:       218.9733
  Training:    218.9322
  Inference:   0.0411

