In [None]:
# Setup cell: pin the visible GPU, patch Unsloth for GRPO, load the base model
# with vLLM-backed inference, and wrap it with LoRA adapters.
import os
# Expose only GPU index 6 to this process. This is set before unsloth/torch are
# imported — presumably so the CUDA runtime never sees the other devices.
# NOTE(review): hard-coded device index; confirm it exists on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '6'
from unsloth import FastLanguageModel, PatchFastRL
# Called before the trl import further down the notebook; presumably this
# patches the GRPO trainer classes — TODO confirm against the Unsloth docs.
PatchFastRL("GRPO", FastLanguageModel)
# NOTE(review): `is_bfloat16_supported` and `torch` are not referenced in the
# cells visible here.
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 8192 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

# NOTE(review): `model_name` is an absolute, machine-specific snapshot path;
# the notebook will not run elsewhere without editing it.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/cephfs/qiuwentao/models/models--Qwen--Qwen2.5-7B/snapshots/d149729398750b98c0af14eb82c78cfe92750796",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.8, # Reduce if out of memory
    max_logprobs         = 50 # The sampling params cell below requests logprobs=20, which is within this limit
)
# Attach LoRA adapters to the attention and MLP projection layers listed below.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank, # alpha is set equal to the rank
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)


In [None]:
# System message that get_IE places in front of every user prompt.
# NOTE(review): the f-prefix is redundant (the string has no placeholders), and
# the prompt asks the model to think in Bahasa Indonesia while the dataset
# loaded below is the English ("en") split — confirm this is intentional.
SYSTEM_PROMPT = \
f"""You are given a problem.
Think about the problem and provide your working out.
You must think in Bahasa Indonesia."""
from datasets import load_dataset,Dataset
# Open /data/home/user0/llm-nas/panzy/llm/project/prorl-verl/recipe/prorl/data/math/dapo-math-17k.parquet
# NOTE(review): the path in the comment above differs from the file actually
# loaded on the next line; it looks like a leftover from another environment.
# Load the local parquet file and keep its 'train' split as the raw dataset.
dataset = load_dataset("parquet", data_files = "./datasets/open-r1/DAPO-Math-17k-Processed/en/train-00000-of-00001.parquet")['train']
def get_IE(split = "train") -> Dataset:
    """Convert the module-level ``dataset`` into chat-style GRPO examples.

    Every row is mapped to a record with two fields:

    * ``prompt`` - a two-turn conversation: the ``SYSTEM_PROMPT`` system turn
      followed by a user turn holding the content of the row's first
      ``source_prompt`` message.
    * ``answer`` - the row's ``solution`` value.

    All original columns are removed from the result.

    Args:
        split: Accepted for signature compatibility; it is not read by the body.

    Returns:
        The mapped dataset.
    """

    def _to_chat_example(row):
        # Build the conversation turn by turn, then pair it with the reference answer.
        system_turn = {"role": "system", "content": SYSTEM_PROMPT}
        user_turn = {"role": "user", "content": row["source_prompt"][0]['content']}
        return {
            "prompt": [system_turn, user_turn],
            "answer": row['solution'],
        }

    return dataset.map(_to_chat_example, remove_columns=dataset.column_names)
# dataset = get_gsm8k_questions()  (leftover alternative loader; not defined in the visible cells)
# Rebind `dataset` to the chat-formatted view. get_IE reads the module-level
# `dataset` and drops its original columns (including `source_prompt`), so this
# line cannot be re-run on its own: re-run the whole cell so the raw parquet
# data is reloaded first.
dataset=get_IE()


In [3]:
# Inspect one mapped example to check the prompt/answer structure.
dataset[1]

{'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nYou must think in Bahasa Indonesia.',
   'role': 'system'},
  {'content': 'Solve the following math problem step by step. The last line of your response should be of the form Answer: $Answer (without quotes) where $Answer is the answer to the problem.\n\nLet $ABCD$ be a unit square in the plane. Points $X$ and $Y$ are chosen independently and uniformly at random on the perimeter of $ABCD$. If the expected value of the area of triangle $\\triangle AXY$ can be expressed as $\\frac{m}{n}$ for relatively prime positive integers $m$ and $n$, compute $m+n$.\n\nRemember to put your answer on its own line after "Answer:".',
   'role': 'user'}],
 'answer': '113'}

In [13]:
import re
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that open with a well-formed ``<think>`` block.

    A completion earns 0.5 when its text starts with ``<think>``, a newline,
    any reasoning text, a newline and ``</think>``; otherwise it earns 0.0.

    The pattern is compiled with ``re.DOTALL`` so that the reasoning may span
    several lines. Without that flag ``.`` stops at the first newline and every
    multi-line reasoning block would be scored 0.0.

    Args:
        completions: One entry per sample; each entry is a list whose first
            element is a message dict with a ``"content"`` string.
        **kwargs: Ignored; accepted so the trainer can pass extra columns.

    Returns:
        One reward (0.5 or 0.0) per completion, in input order.
    """
    pattern = re.compile(r"^<think>\n.*?\n</think>", re.DOTALL)
    rewards = []
    for completion in completions:
        response = completion[0]["content"]
        # `match` anchors at the start of the text, so leading junk fails.
        rewards.append(0.5 if pattern.match(response) else 0.0)
    return rewards
def extract_xml_answer(text: str) -> str:
    """Return the stripped text that follows the last ``Answer`` marker.

    When ``text`` contains no ``Answer`` marker an empty string is returned,
    so that numbers appearing in free-form reasoning are never mistaken for a
    final answer.
    """
    _, marker, tail = text.rpartition("Answer")
    if not marker:
        return ""
    return tail.strip()


def _answer_in_text(expected: str, text: str) -> bool:
    """Tell whether ``expected`` occurs in ``text`` as a standalone token.

    A plain substring test would accept "1" inside "113" or "2" inside "2.5".
    The match is therefore rejected when it is directly preceded by a word
    character, "." or "-", or directly followed by a word character or by a
    decimal part (a "." followed by a digit).
    """
    if not expected or not text:
        return False
    pattern = r"(?<![\w.\-])" + re.escape(expected) + r"(?!\w|\.\d)"
    return re.search(pattern, text) is not None


def correctness_ie_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward completions whose final ``Answer`` section contains the reference answer.

    Args:
        prompts: Conversations for the batch; the last message of the first
            one is printed for logging.
        completions: One entry per sample; each entry is a list whose first
            element is a message dict with a ``"content"`` string.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored; accepted so the trainer can pass extra columns.

    Returns:
        2.0 for each completion whose extracted answer section contains the
        reference answer as a standalone token, otherwise 0.0.
    """
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Log the first sample of the batch so training progress can be inspected.
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    rewards = [
        2.0 if _answer_in_text(str(a).strip(), r) else 0.0
        for r, a in zip(extracted_responses, answer)
    ]
    return rewards

In [None]:
import json
import pandas as pd
import numpy as np
from typing import List, Dict
from tqdm import tqdm

# ==============================================================================
# New, extensible scoring function
# ==============================================================================
def _calculate_cluster_score(group_df: pd.DataFrame) -> pd.Series:
    """
    Calculates cluster_score for a group of samples under the same prompt_idx.

    Args:
        group_df (pd.DataFrame): DataFrame containing all samples for a single prompt_idx.

    Returns:
        pd.Series: A Series containing the cluster_score for each sample, with an index matching the input.
        
    Scoring Logic:
    - Ranks samples by reward_score in descending order.
    - The top-ranked sample gets a score of 1.0.
    - The second-ranked gets 0.9, and so on, decreasing by 0.1 each time.
    - If there are more than 10 samples, the minimum score is 0.0.
    
    Extensibility:
    You can copy this function and modify the internal 'scores' generation logic
    (e.g., to use exponential decay, normalization, etc.), then call your new function 
    in the main process.
    """
    # 1. Sort by reward_score in descending order to determine rank
    sorted_group = group_df.sort_values(by="reward_score", ascending=False)
    
    # 2. Generate the list of scores [1.0, 0.9, 0.8, ...]
    num_samples = len(sorted_group)
    scores = [max(0.0, 1.0 - i * 0.1) for i in range(num_samples)]
    
    # 3. Create a Series of scores with an index corresponding to the sorted group
    score_series = pd.Series(scores, index=sorted_group.index)
    
    # 4. Sort the score Series by the original index to ensure correct alignment with the original DataFrame
    return score_series.sort_index()


# ==============================================================================
# Other helper functions (unchanged)
# ==============================================================================
def _calculate_cluster_pos_distribution(df: pd.DataFrame, num_bins: int = 10) -> List[float]:
    """Calculates the distribution ratio of high-entropy tokens in different position bins within the text sequence."""
    high_df = df[df['high_entropy'] == 1].copy()
    total_high_entropy_tokens = len(high_df)

    if total_high_entropy_tokens == 0:
        return [0.0] * num_bins

    total_len = len(df)
    high_df['pos_norm_internal'] = (high_df.index + 1) / total_len
    
    bins = np.linspace(0.0, 1.0, num_bins + 1)
    bin_assignments = pd.cut(high_df['pos_norm_internal'], bins=bins, labels=False, include_lowest=True, right=True)
    
    bin_counts = bin_assignments.value_counts(sort=False)
    proportions = bin_counts.reindex(range(num_bins), fill_value=0) / total_high_entropy_tokens
    
    return [round(p, 4) for p in proportions.tolist()]

# ==============================================================================
# Main processing function (refactored to support grouped scoring)
# ==============================================================================
def process_jsonl(input_files: List[str], output_file: str, top_percent: float = None, top_k: int = None):
    """
    Processes JSONL files, adding grouped ranking and scoring functionality.

    Workflow:
    1. Read all data into an in-memory DataFrame (blank lines are skipped).
    2. Group by 'prompt_idx' and call _calculate_cluster_score to compute 'cluster_score'.
    3. Iterate through each record to calculate 'high_entropy_pos_norm' and 'cluster_entropy_pos'.
    4. Write the complete records, including all new fields, to the output file.

    Args:
        input_files: Paths of the JSONL files to read. Each record is expected
            to carry 'prompt_idx', 'sample_idx', 'reward_score' and an
            'entropy' list.
        output_file: Path of the JSONL file to write (overwritten).
        top_percent: Fraction of tokens per record treated as high-entropy
            (at least one token is always selected). Used only when 'top_k'
            is None.
        top_k: Number of highest-entropy tokens per record treated as
            high-entropy. Takes precedence over 'top_percent'.

    Raises:
        ValueError: If neither 'top_k' nor 'top_percent' is given.
    """
    if top_k is None and top_percent is None:
        raise ValueError("Error: You must specify either 'top_k' or 'top_percent'.")

    print("Starting to process files...")

    # --- Step 1: Read all data into memory ---
    all_data = []
    for input_file in input_files:
        with open(input_file, "r", encoding="utf-8") as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline) instead of
                    # failing inside json.loads.
                    continue
                all_data.append(json.loads(line))

    if not all_data:
        print("Input file(s) are empty. Processing finished.")
        return

    main_df = pd.DataFrame(all_data)

    # --- Step 2: Group and calculate cluster_score ---
    print("Grouping by prompt_idx and calculating cluster_score...")
    # Score each group explicitly and concatenate the per-group Series.
    # groupby().apply() with a Series-returning function yields a wide
    # DataFrame (one row per group) when every group returns the same index,
    # which is always the case when the input holds a single prompt_idx; the
    # column assignment below would then fail. Iterating avoids that shape
    # change, and the assignment aligns on the original row index.
    score_parts = [
        _calculate_cluster_score(group)
        for _, group in main_df.groupby('prompt_idx')
    ]
    if score_parts:
        main_df['cluster_score'] = pd.concat(score_parts).round(4)  # Format the score
    else:
        main_df['cluster_score'] = np.nan

    # --- Step 3 & 4: Iterate through records, calculate other features, and write to file ---
    print("Calculating other features and writing to file...")
    with open(output_file, "w", encoding="utf-8") as fout:
        # Convert DataFrame to a list of dictionaries for easier processing
        for record in tqdm(main_df.to_dict('records')):
            entropy_list = record.get('entropy', [])
            if not isinstance(entropy_list, (list, tuple)):
                # A record without an 'entropy' list surfaces here as NaN/None
                # after the DataFrame round trip; treat it as an empty sequence.
                entropy_list = []
            df_row = pd.DataFrame({'entropy': entropy_list})
            total_len = len(df_row)

            # Calculate n (number of high-entropy tokens)
            n = 0
            if total_len > 0:
                if top_k is not None:
                    n = min(top_k, total_len)
                elif top_percent is not None:
                    n = max(1, int(total_len * top_percent))

            # Calculate high_entropy_pos_norm
            high_entropy_pos_norm_list = []
            df_row['high_entropy'] = 0
            if n > 0:
                high_entropy_indices = df_row.nlargest(n, 'entropy').index.sort_values()
                # 1-based token position divided by sequence length, in (0, 1].
                raw_list = ((high_entropy_indices + 1) / total_len).tolist()
                high_entropy_pos_norm_list = [round(p, 4) for p in raw_list]

                # Flag the selected tokens for the distribution calculation
                df_row.loc[high_entropy_indices, 'high_entropy'] = 1

            # Calculate cluster_entropy_pos
            cluster_pos_distribution = _calculate_cluster_pos_distribution(df_row, num_bins=10)

            # Rows that belonged to no group (null prompt_idx) have no score;
            # emit null rather than the non-standard JSON token NaN.
            cluster_score = record.get("cluster_score")
            if cluster_score is not None and pd.isna(cluster_score):
                cluster_score = None

            # Build the final output
            final_output = {
                "prompt_idx": record.get("prompt_idx"),
                "sample_idx": record.get("sample_idx"),
                "reward_score": record.get("reward_score"),
                "cluster_score": cluster_score,  # Insert the newly calculated score
                "cluster_entropy_pos": cluster_pos_distribution,
                "high_entropy_pos_norm": high_entropy_pos_norm_list,
            }

            fout.write(json.dumps(final_output, ensure_ascii=False) + "\n")

    print(f"Processing complete. Results written to {output_file}")

In [None]:
# Generation settings and GRPO training configuration.
from vllm import SamplingParams
# NOTE(review): `vllm_sampling_params` is built here but is not passed to
# GRPOConfig or GRPOTrainer in the visible cells, so min_p / stop / logprobs
# may have no effect on training-time generation — confirm whether it should
# be handed to the config.
vllm_sampling_params = SamplingParams(
    min_p = 0.1,
    top_p = 1.0,
    top_k = -1,
    seed = 3407,
    stop = [tokenizer.eos_token],
    include_stop_str_in_output = True,
    logprobs=20
)

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm=True,
    temperature = 1.0,
    learning_rate = 5e-6,
    weight_decay = 0.01,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 2, # NOTE(review): not a multiple of num_generations; the recorded output shows Unsloth raising this to 4
    gradient_accumulation_steps = 4, # Higher values smooth training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = 1024,
    max_completion_length = 5096, # prompt + completion = 6120 tokens, below max_seq_length (8192)
    num_train_epochs = 30, # NOTE(review): 30 passes over the dataset; confirm this is intended
    # max_steps = 100,
    save_steps = 100,
    report_to = "swanlab", # Can use Weights & Biases
    output_dir = "outputs/20250727",

    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 2 to the `num_generations` of 4


In [None]:
# For optional training + evaluation
# NOTE(review): `new_dataset` is only referenced by the commented-out arguments
# below; the trainer is given the full `dataset`, so the 1% test split is unused.
new_dataset = dataset.train_test_split(test_size = 0.01)

# Build the GRPO trainer with the format and correctness reward functions
# defined earlier in the notebook.
# NOTE(review): `entropy_distribution_reward` is not defined in any cell visible
# here; on a fresh kernel this cell raises NameError unless it is defined elsewhere.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        strict_format_reward_func,
        correctness_ie_reward_func,
        entropy_distribution_reward
    ],
    args = training_args,
    train_dataset = dataset,

    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)
trainer.train()

In [None]:
# Save the model together with the tokenizer under ./Qwen3_high_entropy.
# NOTE(review): the directory name says "Qwen3", but the checkpoint loaded at
# the top of the notebook is Qwen2.5-7B — confirm the intended name.
model.save_pretrained_merged("Qwen3_high_entropy", tokenizer)