In [1]:
import sys
sys.path.append("..")
from dataset import load_polaris_dataset, validate_dataset
from train import get_dataset
import numpy as np
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    PreTrainedTokenizer
)
import torch
from peft import PeftModel
from peft import prepare_model_for_kbit_training
from trl import ModelConfig
from munch import Munch
import json
from pathlib import Path
from functools import partial
import hashlib
from collections import defaultdict

from train import GRPOTrainer2
import os
from trl import (
    GRPOConfig, 
    GRPOTrainer,
    get_peft_config
)
from dataclasses import field, dataclass

  from .autonotebook import tqdm as notebook_tqdm


INFO 03-02 03:25:52 __init__.py:190] Automatically detected platform cuda.


2025-03-02 03:25:52,942	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

In [3]:
def compute_mae(completions, ground_truth=None, log_normalize=False, model_name="pred", **kwargs):
    
    smiles = kwargs.get("smiles")
    num_generations = len(completions) / len(set(smiles))
    
    Path(f"./test/completions/{model_name}_{num_generations}/").mkdir(parents=True, exist_ok=True)
    
    contents = [completion[0]["content"] for completion in completions]
    rewards = []

    solutions = kwargs.get("solution") # Get solutions from kwargs
    

    if solutions is None:
        return [0.5] * len(completions) # Return neutral reward if no solution
    smiles2conts = defaultdict(list)
    #TODO: not parsed empty list
    #TODO: mae mean per group, take meadian and calculate
    #TODO: make more generations, save separately
    for content, gold_val, smiles_i in zip(contents, solutions, smiles):
        
        if gold_val is not None:  # Check if parsing was successful
            # Parse the model's answer with relaxed normalization
            answer_parsed = parse(
                content,
                extraction_config=[
                    LatexExtractionConfig(
                        normalization_config=NormalizationConfig(
                            nits=False,
                            malformed_operators=False,
                            basic_latex=True,
                            equations=True,
                            boxed="all",
                            units=True,
                        ),
                        boxed_match_priority=0,
                        try_extract_without_anchor=False,
                    )
                ],
                extraction_mode="first_match",
            )

            try:
                answer_val = float(answer_parsed[0])
                mae = np.mean(np.abs(gold_val - answer_val))
                print("parsed correctly", answer_val, gold_val)
            except Exception as e:
                answer_val = None
                mae = 6
                if len(answer_parsed) > 0:
                    print(e, answer_parsed)
        else:
            answer_val = None
            mae = 6
            print("Warning: Gold solution is None:", gold_val)
        if answer_val is not None:
            post = "parsed_"
        else:
            post = ""
        smiles_hash = hashlib.blake2b(smiles_i.encode('utf-8'), digest_size=4).hexdigest()
        rewards.append(mae)
        smiles2conts[smiles_hash].append({"completion": content, 
                       "gold_val": str(gold_val), 
                       "answer_parsed": str(answer_parsed), 
                       "smiles": smiles_i,
                       "answer_val": answer_val,
                       "mae": mae
                       }) 
    median_maes = []
    for k, v in smiles2conts.items():
        with open(f"./test/completions/{model_name}_{num_generations}/{post}{k}.json", "w") as f:
                answers_g = [v_i["answer_val"] for v_i in v]
                answers_g = [float(v_i) for v_i in answers_g if v_i is not None]
                answer_median = np.median(answers_g)
                mae_median = np.median(np.abs(float(v[0]["gold_val"]) - answer_median))
                median_maes.append(mae_median)
                json.dump({"completion": [v_i["completion"] for v_i in v], 
                        "gold_val": v[0]["gold_val"], 
                        "answer_parsed": [v_i["answer_parsed"] for v_i in v], 
                        "smiles": v[0]["smiles"],
                        "answer_val": [v_i["answer_val"] for v_i in v],
                        "mae": [v_i["mae"] for v_i in v],
                        "mae_median": str(mae_median)
                        }, f, indent=2)
    return median_maes

def get_tokenizer(
    model_args: ModelConfig, training_args, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Get the tokenizer for the model."""
    # https://github.com/huggingface/open-r1/blob/eeca246b078457bc0f69ba2e8297b799df0e2bda/src/open_r1/utils/model_utils.py#L11
    print("loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=False, # model_args.trust_remote_code
    )
    print("tokenizer loaded")

    if training_args.chat_template is not None:
        tokenizer.chat_template = training_args.chat_template
    elif auto_set_chat_template and tokenizer.get_chat_template() is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
    print("chat template")
    # if processing_class is None:
    #     processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left")
    return tokenizer

def get_reward_functions(script_args, model_name):
    """
    Returns a list of reward functions based on the script arguments.
    """
    reward_funcs_list = []

    fnc = partial(compute_mae, model_name=model_name)
    fnc.__name__ = compute_mae.__name__
    reward_funcs_registry = {
        "mae": fnc,  # Assuming accuracy_reward is defined in previous steps
    }

    for func_name in script_args.reward_funcs:
        if func_name not in reward_funcs_registry:
            raise ValueError(f"Reward function '{func_name}' not found in registry.")
        reward_funcs_list.append(reward_funcs_registry[func_name])

    return reward_funcs_list

@dataclass
class GRPOScriptArguments:
    """
    Script arguments for GRPO training, specifically related to reward functions.
    """

    reward_funcs: list[str] = field(
        default_factory=lambda: ["mae"], 
        metadata={
            "help": "List of reward functions. Possible values: 'accuracy', 'format', 'reasoning_steps', 'repetition_penalty'"        },
    )

    repetition_n_grams: int = field(
        default=3,
        metadata={"help": "Number of n-grams for repetition penalty reward"},
    )
    repetition_max_penalty: float = field(
        default=-0.1,
        metadata={"help": "Maximum (negative) penalty for for repetition penalty reward"},
    )

In [4]:
dataset = get_dataset(params=["LogD"], subset_train=50)

Map: 100%|██████████| 221/221 [00:00<00:00, 5385.97 examples/s]
Map: 100%|██████████| 49/49 [00:00<00:00, 4380.61 examples/s]
Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map: 100%|██████████| 52/52 [00:00<00:00, 5059.00 examples/s]

Train set size: 221
Test set size: 49

Validating train split:
✓ All required fields present
✓ Prompt format is correct

Validating test split:
✓ All required fields present
✓ Prompt format is correct





In [6]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,    # if required
    torch_dtype=torch.bfloat16,  # if you used bf16
    device_map="auto"           # or "cuda:0", depending on your environment
)

model_args_i = Munch.fromDict({
        "model_name_or_path": MODEL_NAME,
        "model_revision": "main",
        "trust_remote_code": False # TODO: everyboudy sets to True and default is True
        })

training_args_i = Munch.fromDict({"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}"})

tokenizer = get_tokenizer(model_args_i, training_args_i)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

    
script_args = GRPOScriptArguments()

reward_functions = get_reward_functions(script_args, model_name="tuned_v3_correct_format_v2") #TODO: check trl they had someshere gpro example and used different rewards including lenght reward

training_args = TrainingArguments(
        logging_dir="./logs/wandb/",
        num_train_epochs=12,             # Total number of training epochs
        per_device_train_batch_size=16,  # Batch size per device during training
        per_device_eval_batch_size=16,   # Batch size for evaluation TODO: why it says this   File "/home/alisavin/AgenticADMET/train.py", line 534, in <module>
        gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
        learning_rate=1e-6,            # Initial learning rate for AdamW optimizer
        warmup_ratio=0.1,              # Linear warmup over warmup_ratio fraction of training steps
        weight_decay=0.01,             # Apply weight decay to all layers except bias and LayerNorm weights
        logging_steps=1,              # Log every X updates steps
        logging_strategy="steps",
        logging_first_step=True,
        evaluation_strategy="epoch",    # Evaluate every `eval_steps`
        save_strategy="no",      # Disables regular checkpoints
        save_total_limit=0,      # Makes sure no checkpoints are kept
        load_best_model_at_end=False,  # Disables saving the best model
        dataloader_num_workers=4,      # Number of subprocesses to use for data loading
        seed=42,                       # Random seed for reproducibility
        bf16=True,                     # Use mixed precision BFP16 training #TODO: ??????
        push_to_hub=False,             # Whether to push the final model to Hugging Face Hub
        report_to=["wandb"],              # Reporting to no one
        run_name="test",
        disable_tqdm=False,
        gradient_checkpointing=True,   # Enable gradient checkpointing        
        remove_unused_columns=False,
        do_train=True,
        # do_eval=True, #TODO: use
        gradient_checkpointing_kwargs={"use_reentrant": False}, # TODO: use
        lr_scheduler_type="cosine_with_min_lr",
        lr_scheduler_kwargs={"min_lr_rate": 0.1},
        max_steps=-1, #TODO: change to -1
    )

grpo_config = GRPOConfig(
    **training_args.to_dict(), # Convert TrainingArguments to dictionary and unpack
    **{ 
    # REMOVED model_init_kwargs here 
    # We are passing the instantiated 'model' object, so GRPOTrainer doesn't need model_init_kwargs
    },
    num_generations=8, #TODO: 16
    use_vllm=True, #TODO: use True
    vllm_device="cuda:0",
    vllm_gpu_memory_utilization=0.25, # TODO: 0.25 0.7
    vllm_max_model_len=2048, #TODO: 2048
    max_prompt_length=800, #TODO: 800+
    max_completion_length=1024, #TODO: 1024+ (better 2048/4048 and more)
    temperature=0.7,
    reward_weights=[1.0]
    )

model_args = ModelConfig(model_name_or_path=MODEL_NAME, use_peft=True)

grpo_trainer = GRPOTrainer2(
    model=model,                      # Our initialized Qwen model
    reward_funcs=reward_functions,    # List of reward functions from previous step
    args=grpo_config,                # GRPOConfig (created from TrainingArguments)
    train_dataset=dataset['train'],   # Training dataset
    eval_dataset=dataset['validation'],    # Evaluation dataset
    processing_class=tokenizer, #TODO: check callback from config
    peft_config=get_peft_config(model_args) #TODO: check # label_names
    # peft_config=None
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.35s/it]


loading tokenizer
tokenizer loaded
chat template


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
train_result = grpo_trainer.train(resume_from_checkpoint="/home/alisavin/AgenticADMET/outputs/2025-02-26/22-18-57/checkpoint-60/")

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
	logging_steps: 1 (from args) != 10 (from trainer_state.json)


  checkpoint_rng_state = torch.load(rng_file)


parsed correctly 3.8 2.1
parsed correctly 1.85 2.1
parsed correctly 2.5 2.1
parsed correctly 2.8 2.1
parsed correctly 3.5 2.1
parsed correctly 3.2 2.1
parsed correctly 3.5 2.1
parsed correctly 2.8 2.1
parsed correctly 2.5 1.9
parsed correctly 1.25 1.9
parsed correctly 3.5 1.9
parsed correctly 1.3 1.9
parsed correctly 3.8 1.9
parsed correctly 4.7 1.9
parsed correctly 2.5 1.9


RuntimeError: The expanded size of the tensor (16) must match the existing size (2) at non-singleton dimension 0.  Target sizes: [16].  Tensor sizes: [2]