## Imports and configuration

In [1]:
import os
import gc
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
import random 
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTConfig, SFTTrainer
import wandb


# =========================
# CONFIGURATION
# =========================
# All tunable values for the fine-tuning run live here; main() reads them as
# module-level globals.

# Base model identifier passed to AutoTokenizer / AutoModelForCausalLM.
# NOTE(review): the id is written in upper case; the canonical Hub spelling is
# presumably "meta-llama/Llama-3.2-1B" — confirm that this form resolves (and
# caches) as intended.
MODEL_NAME_OR_PATH = "META-LLAMA/LLAMA-3.2-1B"
# Root folder of the task data. main() reads
# DATA_ROOT/<task>/<task>_train.csv and DATA_ROOT/<task>/<task>_dev.csv.
DATA_ROOT = "Task_II/_csv/"
# Output root. main() writes checkpoints, the final model and the tokenizer to
# BASE_OUTPUT_DIR/<task>.
# NOTE(review): the folder name says "high" but TASKS also contains "_low"
# entries — confirm the name is intentional.
BASE_OUTPUT_DIR = "models_high_r8/"

# Tasks to train, one independent run per entry. Commented-out entries are
# skipped for this run.
TASKS = (
    # "openness_high",
    # "openness_low",
    # "conscientiousness_high", 
    # "conscientiousness_low",
    # "extraversion_high",
    # "extraversion_low",
    "agreeableness_high",
    "agreeableness_low",
#     "neuroticism_high",
#     "neuroticism_low" 
 )

# Hyperparameters
# Forwarded to SFTConfig in main() unless noted otherwise.
NUM_TRAIN_EPOCHS = 5.0
PER_DEVICE_TRAIN_BATCH_SIZE = 8
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 0.01
MAX_LENGTH = 512  # SFTConfig(max_length=...)
LORA_R = 8        # LoraConfig(r=...)
LORA_ALPHA = 8    # LoraConfig(lora_alpha=...)
LOGGING_STEPS = 100
SAVE_TOTAL_LIMIT = 2
WARMUP_RATIO = 0.03
# Used to re-seed the RNGs at the start of every task and passed to SFTConfig
# as both seed and data_seed.
GLOBAL_SEED = 42



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import  set_seed


def _to_prompt_completion(example):
    """Convert one CSV row into the prompt/completion pair given to SFTTrainer.

    Reads the row's 'prompt' and 'answer' columns. The prompt gets a trailing
    newline and the answer is followed by the literal end-of-text marker.
    """
    return {
        "prompt": f"{example['prompt']}\n",
        "completion": f"{example['answer']}<|end_of_text|>",
    }


def _train_task(task):
    """Fine-tune one LoRA adapter for `task` and save it to BASE_OUTPUT_DIR/<task>.

    Args:
        task: Task name; selects the data folder DATA_ROOT/<task> and the
            output folder BASE_OUTPUT_DIR/<task>.

    Returns:
        True if training ran and the model was saved, False if the task was
        skipped because one of its CSV files does not exist.
    """
    # Re-seed at the start of every task so each run starts from the same RNG
    # state (important for task vectors). transformers.set_seed seeds Python's
    # `random`, NumPy and torch (CPU and CUDA), so no separate calls are needed.
    set_seed(GLOBAL_SEED)

    # Per-task run name / tags so the runs can be told apart in W&B.
    os.environ["WANDB_NAME"] = f"sft_{task}"
    os.environ["WANDB_TAGS"] = task

    task_dir = os.path.join(BASE_OUTPUT_DIR, task)
    dataset_path = os.path.join(DATA_ROOT, task)

    # 1. Load data
    data_files = {
        "train": os.path.join(dataset_path, f"{task}_train.csv"),
        "validation": os.path.join(dataset_path, f"{task}_dev.csv"),
    }
    # Check both splits: a missing dev file would otherwise fail inside
    # load_dataset instead of skipping the task like a missing train file does.
    missing = [path for path in data_files.values() if not os.path.exists(path)]
    if missing:
        print(f"Skipping {task}, file not found: {', '.join(missing)}")
        return False

    dataset = load_dataset("csv", data_files=data_files)

    # Apply the prompt/completion transformation to both splits.
    dataset["train"] = dataset["train"].map(_to_prompt_completion)
    dataset["validation"] = dataset["validation"].map(_to_prompt_completion)

    # 2. Tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
    # Reuse EOS as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME_OR_PATH,
        dtype=torch.bfloat16,
        use_cache=False,
        attn_implementation="sdpa",
    )

    # 3. Configs
    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )

    sft_config = SFTConfig(
        output_dir=task_dir,
        run_name=f"sft_{task}",
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        max_length=MAX_LENGTH,
        logging_steps=LOGGING_STEPS,
        save_strategy="epoch",
        save_total_limit=SAVE_TOTAL_LIMIT,
        eval_strategy="epoch",
        warmup_ratio=WARMUP_RATIO,
        bf16=True,
        report_to=["wandb"],
        packing=False,
        dataset_text_field=None,
        completion_only_loss=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        seed=GLOBAL_SEED,
        data_seed=GLOBAL_SEED,
    )

    # 4. Trainer
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        peft_config=peft_config,
        processing_class=tokenizer,
    )

    trainer.train()

    trainer.save_model(task_dir)
    tokenizer.save_pretrained(task_dir)
    return True


def main():
    """Run supervised LoRA fine-tuning once for every task in TASKS.

    Each task is trained independently from the same seed. Tasks whose CSV
    files are missing are skipped with a message. An exception raised while
    training a task propagates (and stops the loop) after that task's cleanup
    has run.
    """
    for task in TASKS:
        print(f"=== STARTING TASK: {task} ===")
        try:
            _train_task(task)
        finally:
            # Close the W&B run so the next task opens its own run instead of
            # continuing to log into this one. No-op when no run is active.
            wandb.finish()
            # On the normal path the helper's model/trainer are out of scope
            # here; collect cycles and hand cached GPU memory back before the
            # next model is loaded.
            gc.collect()
            torch.cuda.empty_cache()

In [3]:
# Launch fine-tuning for every task listed in TASKS (long-running; needs a GPU).
main()

=== STARTING TASK: agreeableness_high ===


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
[34m[1mwandb[0m: Currently logged in as: [33mchr-hau1[0m ([33munitrier[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.0,0.006984,2.455149,3327381.0,0.999
2,0.0031,0.003076,2.786441,6654762.0,0.99875
3,0.0018,4.5e-05,2.782121,9982143.0,1.0
4,0.0,0.000391,2.792431,13309524.0,0.9995
5,0.0,0.000323,2.76445,16636905.0,0.99975


=== STARTING TASK: agreeableness_low ===


Generating train split: 15951 examples [00:00, 94514.00 examples/s]
Generating validation split: 1993 examples [00:00, 71761.21 examples/s]
Map: 100%|██████████| 15951/15951 [00:00<00:00, 20639.41 examples/s]
Map: 100%|██████████| 1993/1993 [00:00<00:00, 20872.71 examples/s]
Adding EOS to train dataset: 100%|██████████| 15951/15951 [00:00<00:00, 18209.17 examples/s]
Tokenizing train dataset: 100%|██████████| 15951/15951 [00:16<00:00, 958.22 examples/s] 
Truncating train dataset: 100%|██████████| 15951/15951 [00:00<00:00, 195002.31 examples/s]
Adding EOS to eval dataset: 100%|██████████| 1993/1993 [00:00<00:00, 17603.16 examples/s]
Tokenizing eval dataset: 100%|██████████| 1993/1993 [00:02<00:00, 971.29 examples/s] 
Truncating eval dataset: 100%|██████████| 1993/1993 [00:00<00:00, 157241.03 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the t

Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.0001,0.003299,2.459791,3327381.0,0.99925
2,0.0023,0.000453,2.539026,6654762.0,0.99975
3,0.0043,0.002067,2.656106,9982143.0,0.99975
4,0.0001,0.000278,2.553114,13309524.0,1.0
5,0.0,0.000437,2.521757,16636905.0,0.99975
