In [1]:
import sys
sys.path.append("..")
from dataset import load_polaris_dataset, validate_dataset
from train import get_dataset
import numpy as np
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    PreTrainedTokenizer
)
import torch
from peft import PeftModel
from peft import prepare_model_for_kbit_training
from trl import ModelConfig
from munch import Munch
import json
from pathlib import Path
from functools import partial
import hashlib
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


INFO 02-26 00:20:00 __init__.py:190] Automatically detected platform cuda.


2025-02-26 00:20:01,082	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvladvin111[0m ([33mvladvin-org[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Fallback Jinja chat template. get_tokenizer() assigns it to the tokenizer only when
# training_args supplies no chat_template and the tokenizer has none of its own.
# Each message is rendered as a '<|role|>' header line followed by its content and eos_token.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"

In [3]:
def _extract_latex_answer(content):
    """Run math_verify's `parse` on one completion and return the list of extracted answers."""
    # NOTE(review): the library logs "equations is deprecated" for `equations=True`;
    # the flag is kept so the extraction settings stay exactly as before.
    return parse(
        content,
        extraction_config=[
            LatexExtractionConfig(
                normalization_config=NormalizationConfig(
                    nits=False,
                    malformed_operators=False,
                    basic_latex=True,
                    equations=True,
                    boxed="all",
                    units=True,
                ),
                boxed_match_priority=0,
                try_extract_without_anchor=False,
            )
        ],
        extraction_mode="first_match",
    )


def compute_mae(completions, ground_truth=None, log_normalize=False, model_name="pred", **kwargs):
    """Reward completions by absolute error to the gold value and dump them to JSON.

    Args:
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]`` of each item.
        ground_truth: unused; kept for interface compatibility.
        log_normalize: unused; kept for interface compatibility.
        model_name: label used in the output directory name.
        **kwargs: must contain ``smiles`` (one string per completion) and may
            contain ``solution`` (one gold value or ``None`` per completion).

    Returns:
        One reward per completion:
        * ``0.5`` for every completion when ``solution`` is missing, and for
          any completion whose gold value is ``None``;
        * ``clip(1 - |gold - answer| / 6, 0, 1)`` when a finite number could be
          extracted from the completion;
        * ``0`` when nothing numeric could be extracted.

    Side effects:
        Writes one JSON file per distinct SMILES string into
        ``./test/completions/{model_name}_{num_generations}/``. The file name
        is an 8-hex-digit blake2b hash of the SMILES, prefixed with
        ``parsed_`` when at least one completion of that molecule yielded a
        numeric answer.
    """
    solutions = kwargs.get("solution")  # Get solutions from kwargs
    if solutions is None:
        return [0.5] * len(completions)  # Return neutral reward if no solution
    if len(completions) == 0:
        return []

    smiles = kwargs.get("smiles")
    if smiles is None:
        raise TypeError("compute_mae() requires a 'smiles' keyword argument")

    # True division is intentional: it yields names such as "tuned_8.0", which is
    # the directory layout the comparison cell of this notebook reads back.
    num_generations = len(completions) / len(set(smiles))
    out_dir = Path(f"./test/completions/{model_name}_{num_generations}/")
    out_dir.mkdir(parents=True, exist_ok=True)

    contents = [completion[0]["content"] for completion in completions]
    rewards = []
    smiles2conts = defaultdict(list)

    for content, gold_val, smiles_i in zip(contents, solutions, smiles):
        # Reset per completion so nothing leaks over from the previous iteration.
        answer_parsed = []
        answer_val = None
        mae = None

        if gold_val is not None:
            # Parse the model's answer with relaxed normalization
            answer_parsed = _extract_latex_answer(content)
            try:
                answer_val = float(answer_parsed[0])
                if not np.isfinite(answer_val):
                    # A NaN/inf answer would otherwise propagate into a NaN reward.
                    raise ValueError(f"non-finite answer: {answer_val}")
                mae = float(np.mean(np.abs(gold_val - answer_val)))
                # Linear reward: 1 at zero error, reaching 0 at an error of 6.
                reward = float(np.clip(1 - (1 / 6) * mae, 0, 1))
                print("parsed correctly", answer_val, gold_val)
            except Exception as e:
                # Deliberate best effort: any extraction/conversion failure scores 0.
                answer_val = None
                mae = None
                reward = 0
                if len(answer_parsed) > 0:
                    print(e, answer_parsed)
        else:
            # No gold value to compare against: assign neutral reward (0.5)
            reward = 0.5
            print("Warning: Gold solution is None:", gold_val)

        smiles_hash = hashlib.blake2b(smiles_i.encode('utf-8'), digest_size=4).hexdigest()
        rewards.append(reward)
        smiles2conts[smiles_hash].append({
            "completion": content,
            "gold_val": str(gold_val),
            "answer_parsed": str(answer_parsed),
            "smiles": smiles_i,
            "answer_val": answer_val,
            "reward": reward,
            "mae": mae,
        })

    for smiles_hash, records in smiles2conts.items():
        parsed_vals = [float(r["answer_val"]) for r in records if r["answer_val"] is not None]

        # Error of the group's median answer; NaN when there is no parsed
        # answer or the gold value is not numeric.
        mae_median = float("nan")
        if parsed_vals:
            try:
                mae_median = abs(float(records[0]["gold_val"]) - float(np.median(parsed_vals)))
            except ValueError:
                pass

        # The prefix is decided per molecule, not by whichever completion came last.
        prefix = "parsed_" if parsed_vals else ""
        payload = {
            "completion": [r["completion"] for r in records],
            "gold_val": records[0]["gold_val"],
            "answer_parsed": [r["answer_parsed"] for r in records],
            "smiles": records[0]["smiles"],
            "answer_val": [r["answer_val"] for r in records],
            "reward": [r["reward"] for r in records],
            "mae": [r["mae"] for r in records],
            "mae_median": str(mae_median),
        }
        with open(out_dir / f"{prefix}{smiles_hash}.json", "w") as f:
            json.dump(payload, f, indent=2)
    return rewards

def get_tokenizer(
    model_args: ModelConfig, training_args, auto_set_chat_template: bool = True
) -> PreTrainedTokenizer:
    """Load the tokenizer for `model_args` and make sure it has a chat template.

    Adapted from open-r1:
    https://github.com/huggingface/open-r1/blob/eeca246b078457bc0f69ba2e8297b799df0e2bda/src/open_r1/utils/model_utils.py#L11

    Args:
        model_args: object exposing `model_name_or_path` and `model_revision`.
        training_args: object that may expose `chat_template`; when it is set
            (not None) it overrides whatever template the tokenizer ships with.
        auto_set_chat_template: when True and neither `training_args` nor the
            tokenizer provides a template, fall back to DEFAULT_CHAT_TEMPLATE.

    Returns:
        The loaded tokenizer, possibly with `chat_template` replaced.
    """
    print("loading tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        # Hard-coded on purpose: model_args.trust_remote_code is ignored so no
        # remote code is executed while loading the tokenizer.
        trust_remote_code=False,
    )
    print("tokenizer loaded")

    requested_template = getattr(training_args, "chat_template", None)
    if requested_template is not None:
        tokenizer.chat_template = requested_template
    # Read the attribute directly: get_chat_template() raises when no template
    # is set, so it can never be used to test for "no template".
    elif auto_set_chat_template and getattr(tokenizer, "chat_template", None) is None:
        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
    print("chat template")
    return tokenizer

In [4]:
# Build the dataset splits restricted to the "LogD" parameter via train.get_dataset.
# NOTE(review): subset_train=50 is passed, yet the recorded log reports a train set of 206 rows —
# confirm what subset_train actually controls.
dataset = get_dataset(params=["LogD"], subset_train=50)

Map: 100%|██████████| 206/206 [00:00<00:00, 11582.75 examples/s]
Map: 100%|██████████| 41/41 [00:00<00:00, 6923.52 examples/s]
Map: 100%|██████████| 48/48 [00:00<00:00, 7581.78 examples/s]

Train set size: 206
Test set size: 41

Validating train split:
✓ All required fields present
✓ Prompt format is correct

Validating test split:
✓ All required fields present
✓ Prompt format is correct





In [5]:
# Hub id of the base checkpoint; reused below for the tokenizer and for ModelConfig.
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# 1) Load the base causal LM.
# NOTE(review): trust_remote_code=True here, while get_tokenizer() hard-codes False — confirm which is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,    # if required
    torch_dtype=torch.bfloat16,  # if you used bf16
    device_map="auto"           # or "cuda:0", depending on your environment
)

# 2) Load LoRA adapter weights onto the base model.
# NOTE(review): absolute, user-specific checkpoint path — the notebook is not portable as written.
model = PeftModel.from_pretrained(base_model, "/home/alisavin/AgenticADMET/outputs/2025-02-25/20-38-55/checkpoint-60/")
# NOTE(review): prepare_model_for_kbit_training targets quantized models that are about to be trained;
# this model is loaded in bf16 and only evaluated below — confirm the call is needed here.
model = prepare_model_for_kbit_training(model)
# Uncomment to evaluate the untuned base model instead:
# model = base_model
model.eval()

# Lightweight attribute-style stand-in for the model arguments passed to get_tokenizer(),
# which reads `model_name_or_path` and `model_revision` from it.
model_args_i = Munch.fromDict({
        "model_name_or_path": MODEL_NAME,
        "model_revision": "main",
        "trust_remote_code": False # TODO: everybody sets this to True and the default is True; get_tokenizer() currently ignores this key
        })

# Explicit chat template handed to get_tokenizer(); because it is not None it replaces the tokenizer's own template.
# NOTE(review): presumably copied from the DeepSeek-R1 tokenizer config — confirm it matches the template used in training.
training_args_i = Munch.fromDict({"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt 
and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}"})

# Tokenizer for MODEL_NAME with the chat template from training_args_i applied.
tokenizer = get_tokenizer(model_args_i, training_args_i)

# Move model to GPU if available
# NOTE(review): the base model was loaded with device_map="auto", so it may already be placed on
# devices; confirm this explicit .to(device) is needed and safe.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# # 4) Generate text from a prompt
# prompt = "Explain the concept of molecular solubility in simple terms."

# input = {
#         "ground_truth": example["solution"],
#         "prompt": [
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {"role": "user", "content": example["problem"]},
#         ],
#     }
# inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# with torch.no_grad():
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=128,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9
#     )

# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print("Generated:", generated_text)
from train import GRPOTrainer2
import os
from trl import (
    GRPOConfig, 
    GRPOTrainer,
    get_peft_config
)
from dataclasses import field, dataclass

def get_reward_functions(script_args, model_name):
    """Resolve the reward-function names in `script_args.reward_funcs` to callables.

    Args:
        script_args: object exposing `reward_funcs`, an iterable of registry names.
        model_name: label bound into `compute_mae` as its `model_name` argument.

    Returns:
        A list of callables in the same order as the requested names.

    Raises:
        ValueError: on the first requested name that is not in the registry.
    """
    # partial objects carry no __name__ of their own, so copy it over from compute_mae.
    mae_reward = partial(compute_mae, model_name=model_name)
    mae_reward.__name__ = compute_mae.__name__

    # "mae" is the only reward currently registered.
    available = {"mae": mae_reward}

    selected = []
    for requested in script_args.reward_funcs:
        reward_fn = available.get(requested)
        if reward_fn is None:
            raise ValueError(f"Reward function '{requested}' not found in registry.")
        selected.append(reward_fn)
    return selected

@dataclass
class GRPOScriptArguments:
    """
    Script arguments for GRPO training, specifically related to reward functions.
    """

    # Names looked up by get_reward_functions(); its registry currently holds only "mae".
    reward_funcs: list[str] = field(
        default_factory=lambda: ["mae"],
        metadata={
            "help": "List of reward functions. Possible values: 'mae'"
        },
    )

    # NOTE(review): the two repetition_* fields are not read by any code visible in this notebook.
    repetition_n_grams: int = field(
        default=3,
        metadata={"help": "Number of n-grams for repetition penalty reward"},
    )
    repetition_max_penalty: float = field(
        default=-0.1,
        metadata={"help": "Maximum (negative) penalty for repetition penalty reward"},
    )
    
# Defaults only: reward_funcs == ["mae"].
script_args = GRPOScriptArguments()

# model_name is forwarded to compute_mae, where it becomes part of the output directory name
# ("./test/completions/{model_name}_{num_generations}/").
# TODO: check TRL — they had a GRPO example somewhere that used different rewards, including a length reward.
reward_functions = get_reward_functions(script_args, model_name="tuned_v2")

# Base Hugging Face training configuration; it is unpacked into GRPOConfig below.
# do_train=False and only evaluate() is called later in this notebook, so the
# optimisation-related values are presumably carried over from the training script.
training_args = TrainingArguments(
    logging_dir="./logs/wandb/",
    num_train_epochs=10,             # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=16,   # Batch size per device for evaluation. TODO: investigate the error reported from train.py line 534 (the original note was cut off)
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    learning_rate=1e-6,            # Initial learning rate for AdamW optimizer
    warmup_ratio=0.1,              # Linear warmup over warmup_ratio fraction of training steps
    weight_decay=0.01,             # Apply weight decay to all layers except bias and LayerNorm weights
    logging_steps=1,              # Log every update step
    logging_strategy="steps",
    logging_first_step=True,
    evaluation_strategy="epoch",    # Evaluate at the end of every epoch. NOTE(review): newer transformers releases name this `eval_strategy` — confirm against the installed version
    save_strategy="epoch",      # Save a checkpoint at the end of every epoch
    save_total_limit=1,      # Keep at most one checkpoint on disk
    load_best_model_at_end=False,  # Do not reload the best checkpoint when training ends
    dataloader_num_workers=4,      # Number of subprocesses to use for data loading
    seed=42,                       # Random seed for reproducibility
    bf16=True,                     # Use bf16 mixed precision. TODO: confirm this is wanted
    push_to_hub=False,             # Whether to push the final model to Hugging Face Hub
    report_to=["wandb"],              # Report metrics to Weights & Biases
    run_name="test",
    do_train=False,
    disable_tqdm=False,
    gradient_checkpointing=True,   # Enable gradient checkpointing        
    remove_unused_columns=False,
    do_eval=False, # TODO: use
    gradient_checkpointing_kwargs={"use_reentrant": False}, # TODO: use
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate": 0.1},
    max_steps=-1, # -1 leaves the run length to num_train_epochs
    # NOTE(review): absolute, user-specific path; same checkpoint directory as the LoRA adapter loaded above.
    resume_from_checkpoint="/home/alisavin/AgenticADMET/outputs/2025-02-25/20-38-55/checkpoint-60/"
)

# GRPO-specific configuration layered on top of every TrainingArguments value.
# NOTE(review): num_generations=16 here, while the comparison cell reads directories suffixed "_8.0" —
# confirm which generation count produced the files being compared.
grpo_config = GRPOConfig(
    **training_args.to_dict(), # Convert TrainingArguments to dictionary and unpack
    **{ 
    # Intentionally empty: model_init_kwargs was removed because an instantiated
    # 'model' object is passed to the trainer, so it does not need model_init_kwargs.
    },
    num_generations=16, # TODO: 16
    use_vllm=True, # TODO: use True
    vllm_device="cuda:0",
    vllm_gpu_memory_utilization=0.25, # TODO: 0.25 0.7
    vllm_max_model_len=2048, # TODO: 2048
    max_prompt_length=800, # TODO: 800+
    max_completion_length=1024, # TODO: 1024+ (better 2048/4096 and more)
    temperature=0.7,
    reward_weights=[1.0]  # one weight per reward function; only "mae" is registered
    )

# Only referenced by the commented-out get_peft_config(...) call below.
model_args = ModelConfig(model_name_or_path=MODEL_NAME, use_peft=False)

# GRPOTrainer2 comes from the project's train module.
grpo_trainer = GRPOTrainer2(
    model=model,                      # PeftModel built above (base model + LoRA adapter)
    reward_funcs=reward_functions,    # List of reward functions from previous step
    args=grpo_config,                # GRPOConfig (created from TrainingArguments)
    train_dataset=dataset['train'],   # Training dataset
    eval_dataset=dataset['validation'],    # Evaluation dataset. NOTE(review): the dataset log calls the second split "Test" — confirm the 'validation' key is the intended split
    processing_class=tokenizer, # TODO: check callback from config
    # peft_config=get_peft_config(model_args) # TODO: check # label_names
    peft_config=None                  # the adapter is already attached to `model`
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.59s/it]


loading tokenizer


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


tokenizer loaded
chat template




INFO 02-26 00:20:16 config.py:542] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 02-26 00:20:16 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=deep

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.75s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  2.19s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  2.12s/it]



INFO 02-26 00:20:22 model_runner.py:1115] Loading model weights took 14.2712 GB
INFO 02-26 00:20:24 worker.py:267] Memory profiling takes 0.91 seconds
INFO 02-26 00:20:24 worker.py:267] the current vLLM instance can use total_gpu_memory (79.14GiB) x gpu_memory_utilization (0.25) = 19.78GiB
INFO 02-26 00:20:24 worker.py:267] model weights take 14.27GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 1.40GiB; the rest of the memory reserved for KV Cache is 4.09GiB.
INFO 02-26 00:20:24 executor_base.py:110] # CUDA blocks: 4791, # CPU blocks: 4681
INFO 02-26 00:20:24 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 37.43x
INFO 02-26 00:20:28 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_ut

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:22<00:00,  1.54it/s]

INFO 02-26 00:20:50 model_runner.py:1562] Graph capturing finished in 23 secs, took 0.22 GiB
INFO 02-26 00:20:50 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 27.80 seconds





In [6]:
# Sanity check after the trainer has been built: report which PEFT adapter the
# trainer's model is using. The attribute test and the reads now target the same
# object (grpo_trainer.model), so the check cannot pass on `model` and then fail
# on the trainer's own reference.
if hasattr(grpo_trainer.model, "active_adapter"):
    print(f"Active adapter: {grpo_trainer.model.active_adapter}")
    print("Adapter names:", grpo_trainer.model.peft_config.keys())
else:
    print("No adapter is active. This might be just the base model.")

Active adapter: default
Adapter names: dict_keys(['default'])


In [7]:
def are_models_identical(model1, model2, atol=1e-5):
    """Return True when both models have the same named parameters with (nearly) equal values.

    Args:
        model1, model2: objects exposing `named_parameters()`.
        atol: absolute tolerance passed to `torch.allclose`.

    Returns:
        False as soon as the parameter names differ, a shape differs, or a
        value differs by more than the tolerance; True otherwise. The first
        difference found is printed.
    """
    params1 = dict(model1.named_parameters())
    params2 = dict(model2.named_parameters())

    # Check if they have the same parameter names
    if params1.keys() != params2.keys():
        print("Models have different parameter structures")
        return False

    with torch.no_grad():
        for name, p1 in params1.items():
            p2 = params2[name]

            # Compare shapes explicitly: torch.allclose broadcasts, so e.g. a
            # (1, 1) tensor could otherwise "match" a (1, 3) tensor.
            if p1.shape != p2.shape:
                print(f"Parameters differ at: {name}")
                print(f"  Model1: shape={tuple(p1.shape)}")
                print(f"  Model2: shape={tuple(p2.shape)}")
                return False

            # Bring both tensors to a common dtype/device; torch.allclose
            # raises on mismatches instead of comparing.
            common_dtype = torch.promote_types(p1.dtype, p2.dtype)
            left = p1.to(dtype=common_dtype)
            right = p2.to(device=left.device, dtype=common_dtype)

            if not torch.allclose(left, right, atol=atol):
                print(f"Parameters differ at: {name}")
                print(f"  Model1: min={left.min().item()}, max={left.max().item()}, mean={left.mean().item()}")
                print(f"  Model2: min={right.min().item()}, max={right.max().item()}, mean={right.mean().item()}")
                # Stop at the first difference to avoid overwhelming output
                return False

    return True

# Usage:
# NOTE(review): in the recorded run this stops at the key comparison ("Models have different
# parameter structures"), presumably because the PEFT wrapper changes the parameter names —
# so no parameter values were actually compared.
are_identical = are_models_identical(base_model, model)
print(f"Models are identical: {are_identical}")

Models have different parameter structures
Models are identical: False


In [8]:
# Run evaluation only (no training). Judging by the recorded output, this drives the "mae" reward
# function, which prints each parsed prediction next to its gold value.
# NOTE(review): the variable is called train_result but holds the return value of evaluate().
train_result = grpo_trainer.evaluate()

equations is deprecated, as it handled by the parser now


parsed correctly 3.8 2.4


equations is deprecated, as it handled by the parser now


parsed correctly 5.2 1.8


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
equations is deprecated, as it handled by the parser now


parsed correctly 4.2 -0.4


equations is deprecated, as it handled by the parser now


parsed correctly 5.2 2.0


equations is deprecated, as it handled by the parser now
equations is deprecated, as it handled by the parser now


parsed correctly 4.5 0.4
parsed correctly 4.8 0.4


equations is deprecated, as it handled by the parser now


could not convert string to float: 'LogD value is high due to aromatic rings , chlorine, but precise value requires a LogD calculator.' ['LogD value is high due to aromatic rings , chlorine, but precise value requires a LogD calculator.']


equations is deprecated, as it handled by the parser now


parsed correctly 2.5 0.0


equations is deprecated, as it handled by the parser now
equations is deprecated, as it handled by the parser now


parsed correctly 3.2 0.7
parsed correctly 3.0 0.7


equations is deprecated, as it handled by the parser now
equations is deprecated, as it handled by the parser now


parsed correctly -2.5 2.8
parsed correctly 2.7 2.8


equations is deprecated, as it handled by the parser now


parsed correctly 4.2 2.2


equations is deprecated, as it handled by the parser now


parsed correctly 5.2 4.3


equations is deprecated, as it handled by the parser now


parsed correctly -0.5 1.5


equations is deprecated, as it handled by the parser now
equations is deprecated, as it handled by the parser now


parsed correctly 2.7 0.6
parsed correctly 4.8 0.6


equations is deprecated, as it handled by the parser now


parsed correctly 2.8 3.8


In [8]:
# Repeated evaluation pass: this cell carries the same execution count as the previous one, and
# sampling at temperature 0.7 means the outputs differ between runs.
# NOTE(review): the variable is called train_result but holds the return value of evaluate().
train_result = grpo_trainer.evaluate()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


equations is deprecated, as it handled by the parser now


parsed correctly 4.2 2.0


equations is deprecated, as it handled by the parser now
equations is deprecated, as it handled by the parser now


parsed correctly 5.8 0.0
parsed correctly 4.2 0.0


equations is deprecated, as it handled by the parser now


parsed correctly 4.2 2.5


equations is deprecated, as it handled by the parser now


parsed correctly 4.0 4.4


equations is deprecated, as it handled by the parser now


parsed correctly 4.7 1.8


In [9]:
import glob

# Compare the per-molecule JSON files written by compute_mae for the tuned model
# against the matching files of the initial (untuned) model.
pred_path = "test/completions/tuned_8.0/*.json"
# Run directories end in "_<num_generations>" (e.g. "tuned_8.0"); reuse that
# suffix to locate the baseline directory "init_<num_generations>".
num_generations = Path(pred_path).parts[-2].split("_")[-1]
pths_1 = glob.glob(pred_path)
# pth_2 = glob.glob("test/not_tuned_pred/*json")

# MAE charged for a completion whose answer could not be parsed.
UNPARSED_MAE_PENALTY = 10

dict_all = {}

# Flat per-completion MAE values (1 = tuned, 2 = initial model).
mean_mae_1 = []
mean_mae_2 = []

# One "MAE of the median answer" value per molecule.
mean_mae_1_median = []
mean_mae_2_median = []


for pth_i in pths_1:
    with open(pth_i, "r") as f:
        dict_i = json.load(f)
    smiles = dict_i["smiles"]
    smiles_hash = hashlib.blake2b(smiles.encode('utf-8'), digest_size=4).hexdigest()

    # The baseline file carries a "parsed_" prefix only when an answer was parsed.
    pth = f"test/completions/init_{num_generations}/parsed_{smiles_hash}.json" if Path(f"test/completions/init_{num_generations}/parsed_{smiles_hash}.json").exists() else f"test/completions/init_{num_generations}/{smiles_hash}.json"
    with open(pth, "r") as f:
        dict_i_2 = json.load(f)

    dict_all[smiles] = {
        "7BQwen": {
            "completion": dict_i_2["completion"],
            "answer_val": dict_i_2["answer_val"],
            "mae": dict_i_2["mae"],
            "mae_median": dict_i_2["mae_median"]
        },
        "7BQwenTuned": {
            "completion": dict_i["completion"],
            "answer_val": dict_i["answer_val"],
            "mae": dict_i["mae"],
            "mae_median": dict_i["mae_median"]
        },
        "gold_val": dict_i_2["gold_val"],
    }
    # "answer_val" is a list with one entry per completion, so look for missing
    # entries inside it; keep the raw parse output for those molecules.
    if any(v_i is None for v_i in dict_i_2["answer_val"]):
        dict_all[smiles]["7BQwen"]["answer_parsed"] = dict_i_2["answer_parsed"]
    if any(v_i is None for v_i in dict_i["answer_val"]):
        # The tuned entry takes the tuned model's own parse output.
        dict_all[smiles]["7BQwenTuned"]["answer_parsed"] = dict_i["answer_parsed"]

    if dict_i["mae"] is not None:
        mean_mae_1.extend([float(v_i) if v_i is not None else UNPARSED_MAE_PENALTY for v_i in dict_i["mae"]])
    if dict_i_2["mae"] is not None:
        # extend (not append) so both lists are flat and averaged the same way.
        mean_mae_2.extend([float(v_i) if v_i is not None else UNPARSED_MAE_PENALTY for v_i in dict_i_2["mae"]])

    if dict_i["mae_median"] is not None:
        mean_mae_1_median.append(float(dict_i["mae_median"]))
    if dict_i_2["mae_median"] is not None:
        mean_mae_2_median.append(float(dict_i_2["mae_median"]))
print(f"mean mae tuned - {np.mean(mean_mae_1)}, mean mae - {np.mean(mean_mae_2)}")
# "nan" medians mark molecules without any parsed answer; they are left out of the average.
print(f"median: mean mae tuned - {np.mean([v_i for v_i in mean_mae_1_median if not np.isnan(v_i)])}, mean mae - {np.mean([v_i for v_i in mean_mae_2_median if not np.isnan(v_i)])}")

with open(f"./test/completions/all_results_{num_generations}.json", "w") as f:
    json.dump(dict_all, f, indent=2)
    


mean mae tuned - 9.888541666666667, mean mae - 9.888541666666667
median: mean mae tuned - 2.4400000000000004, mean mae - 2.4400000000000004


In [8]:
# import os

# directory = "./test/completions/pred/"  # Replace with your directory path

# for filename in os.listdir(directory):
#     if filename.startswith("parsed"):
#         # Remove the underscore and add "parsed_" if needed
#         new_name = "parsed_" + filename[7:]
        
#         # Full paths for renaming
#         old_path = os.path.join(directory, filename)
#         new_path = os.path.join(directory, new_name)
        
#         # Rename the file
#         os.rename(old_path, new_path)
#         print(f"Renamed: {filename} → {new_name}")

Renamed: parsed3afaf4aa.json → parsed_afaf4aa.json
Renamed: parsedbeff7dfa.json → parsed_eff7dfa.json
Renamed: parsed159fc2ec.json → parsed_59fc2ec.json
Renamed: parsed1a554c5c.json → parsed_a554c5c.json
Renamed: parsedc94a83f1.json → parsed_94a83f1.json
Renamed: parsed189835d9.json → parsed_89835d9.json
Renamed: parsed2bed14c1.json → parsed_bed14c1.json
Renamed: parsed16bad6bb.json → parsed_6bad6bb.json
Renamed: parsedb940e7a8.json → parsed_940e7a8.json
Renamed: parsed21a08268.json → parsed_1a08268.json
Renamed: parsedec93f253.json → parsed_c93f253.json
Renamed: parsed9ef3031d.json → parsed_ef3031d.json
Renamed: parsedfc6cfa4d.json → parsed_c6cfa4d.json
Renamed: parsede8858049.json → parsed_8858049.json
Renamed: parsed03476906.json → parsed_3476906.json
Renamed: parsedd26c74eb.json → parsed_26c74eb.json
Renamed: parsed9698e723.json → parsed_698e723.json
Renamed: parsedbb9f2209.json → parsed_b9f2209.json
Renamed: parsed822656fc.json → parsed_22656fc.json
Renamed: parsedf5f8aded.json → 