In [None]:
"""
GRPO
"""

In [None]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import re
# Non-greedy regex for the shortest "[...]" span. The reward function counts its
# matches in each completion and subtracts 2 per match (its own comment calls
# these emoticon tags).
pattern = r'\[.*?\]'

# GRPO training prompts, read from a local JSONL file as a single "train" split.
dataset = load_dataset("json", data_files="/opt/tiger/DataX/train/GRPO-train-1112-for-qwen.jsonl", split="train")

model_path = "/mnt/bn/brench-lf-volume/wkq_wsp/checkpoint-1340"
# Load the trained reward model (a sequence-classification head) and switch it to
# eval mode; it is only used for scoring inside reward_with_length_penalty.
reward_model = AutoModelForSequenceClassification.from_pretrained(model_path)
reward_model.eval()
# Tokenizer from the same checkpoint as the reward model, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")

# System turn placed in front of the user content when building the reward-model input.
system_prompt = """你是评论质量评估专家,需评估以下维度并输出0-1分数(越高越好):
1. 相关性:与内容关联紧密;
2. 情绪:中性/正面优于负面;
3. 观点:清晰有逻辑;
4. 提问:具体有意义;
5. 俏皮话:自然有趣。"""
# Target completion length; the length term is based on |length - base_length|.
base_length: int = 12
# NOTE(review): sigma is not referenced anywhere in the visible code — confirm whether it can go.
sigma: float = 3.0
# Unit for measuring completion length: "char" (len of the string) or "token"
# (tokenizer.encode without special tokens). Any other value raises ValueError.
length_type: str = "char"
# Multiplier on the squared deviation from base_length.
penalty_coef: float = 0.1
# Upper bound on the magnitude of the length penalty (the term is clamped to >= -max_penalty).
max_penalty = 100

import torch
from typing import List, Dict

import emoji
def extract_emojis(text):
    """Return, in order, every character of ``text`` that ``emoji.is_emoji`` accepts.

    The text is examined one character (code point) at a time, so the result
    has one entry per matching code point.
    """
    found = []
    for ch in text:
        if emoji.is_emoji(ch):
            found.append(ch)
    return found

def reward_with_length_penalty(prompts, completions, **kwargs):
    """
    Reward function: reward-model score plus a length term and a format term.

    For every completion the function
      1. takes the text after the closing think tag (or the whole text when
         the completion has no think section),
      2. scores (system prompt + user content, completion) with ``reward_model``,
      3. adds a length term: +3 when the length equals ``base_length``,
         otherwise ``-penalty_coef * delta**2`` clamped to ``-max_penalty``,
      4. adds a format term that penalises newlines, the full stop "。",
         bracketed tags matched by ``pattern`` and emoji.

    Args:
        prompts: one conversation per completion; each is a list of
            ``{"role": ..., "content": ...}`` messages.
        completions: one entry per sample; ``completion[0]["content"]`` holds
            the generated text.
        **kwargs: accepted for compatibility with the caller and ignored.

    Returns:
        list[float]: one final reward per completion, in input order.

    Raises:
        ValueError: if a prompt has no ``user`` message, the user content is
            empty, or ``length_type`` is neither ``"char"`` nor ``"token"``.
    """
    rewards = []
    origin_reward = []
    cplx_texts = []

    # Use the CUDA device already selected for this process. The global rank
    # returned by torch.distributed.get_rank() is not a valid local device
    # index on multi-node jobs (e.g. rank 8 on a node with 8 GPUs).
    # Assumes the launcher/trainer has called torch.cuda.set_device for this
    # process — TODO confirm for the launch method in use.
    if torch.cuda.is_available():
        device = torch.device("cuda", torch.cuda.current_device())
    else:
        device = torch.device("cpu")

    # Move the reward model once per process; later calls reuse it in place.
    if not hasattr(reward_with_length_penalty, '_model_moved'):
        reward_model.to(device)
        reward_with_length_penalty._model_moved = True

    think_marker = "</think>\n\n"

    for idx, completion in enumerate(completions):
        # 1. Generated reply text. Completions without a think section are
        #    used as-is instead of raising IndexError.
        raw_content = completion[0]["content"]
        _, marker_found, after_think = raw_content.partition(think_marker)
        completion_text = after_think if marker_found else raw_content
        cplx_texts.append(completion_text)

        # User content of the prompt that produced THIS completion
        # (previously always read from prompts[0]).
        try:
            user_content = next(
                x["content"] for x in prompts[idx] if x["role"] == "user"
            )
            if not user_content:
                raise ValueError("用户输入内容为空")
        except StopIteration:
            raise ValueError("Prompt中未找到user角色的内容")

        # 2. Length of the generated text.
        if length_type == "char":
            length = len(completion_text)
        elif length_type == "token":
            # Token count without special tokens such as <|im_start|>.
            length = len(tokenizer.encode(completion_text, add_special_tokens=False))
        else:
            raise ValueError(f"不支持的长度类型:{length_type},请选'char'或'token'")

        # 3. Length term. delta is an absolute deviation (always >= 0), so
        #    too-long and too-short completions are penalised symmetrically.
        delta = abs(length - base_length)
        penalty_term = 3 if delta == 0 else -penalty_coef * (delta ** 2)

        # Optional clamp so the penalty never drops below -max_penalty.
        if max_penalty is not None:
            penalty_term = max(-max_penalty, penalty_term)

        # 4. Reward-model input in ChatML form; the completion is passed as
        #    the second segment of the pair.
        input_text = (
            f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{user_content}<|im_end|>"
        )
        # One sample per forward pass, so no padding is needed; padding to
        # max_length made every call process 10000 tokens.
        inputs = tokenizer(
            input_text,
            completion_text,
            return_tensors="pt",
            truncation=True,
            max_length=10000
        )

        # Tensors must live on the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 5. Score without tracking gradients.
        with torch.no_grad():
            outputs = reward_model(**inputs)
            # .item() requires a single-element logits tensor, i.e. assumes
            # the reward head emits one scalar — TODO confirm.
            raw_score = outputs.logits.item()

        # Format term: -3 if the text contains a newline or "。", else +1;
        # -2 per bracketed tag; -2 per emoji, or +1 when there is no emoji.
        format_reward = -3 if "\n" in completion_text or "。" in completion_text else 1
        format_reward += len(re.findall(pattern, completion_text)) * -2
        emoji_len = len(extract_emojis(completion_text))
        format_reward += (emoji_len * -2) if emoji_len > 0 else 1

        # 6. Final reward = raw score + length term + format term (may be negative).
        final_reward = raw_score + penalty_term + format_reward
        origin_reward.append(raw_score)
        rewards.append(final_reward)

    print(f"completion_text:{cplx_texts}")
    print(f"原始奖励:{origin_reward}")
    print(f"带长度惩罚的奖励:{rewards}")
    return rewards


# GRPO training configuration. Checkpoints are written under output_dir once per
# epoch and metrics are reported to TensorBoard every 2 logging steps.
training_args = GRPOConfig(
    output_dir="/mnt/bn/brench-lf-volume/wkq_wsp/train_result/Qwen3-4B-SFT-GRPO-1113",
    # Same value as the max_length used by the reward function's tokenizer call.
    max_prompt_length=10000,
    epsilon_high=0.28,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="tensorboard",
    beta=0.001,
    logging_steps=2,
    # Optional optimization settings
    gradient_checkpointing=True,  # saves GPU memory
    bf16=True,  # use BF16 mixed precision
    temperature=1,
    top_p=1
)

# Policy to optimise. This path lies under the output_dir of the SFT cell further
# down in the notebook, so that checkpoint presumably has to exist before this
# cell is run — verify the intended cell order.
model_name_or_path = "/mnt/bn/brench-lf-volume/wkq_wsp/train_result/Qwen3-4B-Inst-SFT-1113/checkpoint-82"
# 2. Load the tokenizer (currently disabled; the trainer is built without an
#    explicit processing_class)
# train_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")

# The model is passed as a path string; rewards come from
# reward_with_length_penalty defined above.
trainer = GRPOTrainer(
    model=model_name_or_path,
    reward_funcs=reward_with_length_penalty,
    args=training_args,
    train_dataset=dataset,
    # processing_class=train_tokenizer,
)

trainer.train()

In [None]:
"""
SFT
"""

In [None]:
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

# Supervised fine-tuning configuration: 8 epochs, a checkpoint saved per epoch,
# metrics reported to TensorBoard every 2 logging steps.
sft_config = SFTConfig(
    output_dir="/mnt/bn/brench-lf-volume/wkq_wsp/train_result/Qwen3-4B-Inst-SFT-1113",
    # Points at the same directory as the base model passed to SFTTrainer below.
    chat_template_path="/mnt/bn/brench-lf-volume/wkq_wsp/models/Qwen3-4B-Instruct-2507",
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="tensorboard",
    num_train_epochs=8,
    logging_steps=2,
    # Optional optimization settings
    gradient_checkpointing=True,  # saves GPU memory
)

# NOTE: this rebinds the notebook-global name `trainer`, which the GRPO cell also uses.
# The base model is given as a path string and the training data is loaded
# inline from a local JSONL file.
trainer = SFTTrainer(
    model="/mnt/bn/brench-lf-volume/wkq_wsp/models/Qwen3-4B-Instruct-2507",
    args=sft_config,
    train_dataset=load_dataset("json", data_files="/opt/tiger/DataX/train/yzc-train-data-2.58k.jsonl", split="train"),
)
trainer.train()

In [None]:
"""
inference
"""

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 1. Load the trained model
# NOTE: rebinds the notebook-global names `model_path` and `tokenizer`, which the
# GRPO cell uses for the reward model and its tokenizer.
model_path = "/mnt/bn/brench-lf-volume/wkq_wsp/train_result/Qwen3-4B-Inst-SFT-1113/checkpoint-82"  # replace with the specific checkpoint
# Or use the final model:
# NOTE(review): the GRPO cell writes to ".../Qwen3-4B-SFT-GRPO-1113", which differs
# from the directory name below — confirm the intended path before enabling it.
# model_path = "/mnt/bn/brench-lf-volume/wkq_wsp/train_result/Qwen3-4B-Inst-GRPO-1113"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # BF16, same as the bf16=True setting in the GRPO config
    device_map="auto"  # let the library place the model on the available devices
)
model.eval()

# 2. Prepare the input (a video description followed by a video category)
user_input = """### 视频描述
视频中，一位身穿蓝色衬衫的男子正在讲述一个街头发生的事件。画面中首先出现一位大爷摔倒的场景，一位小姐姐见状上前去扶大爷，然而大爷一把抓住小姐姐，坚称是小姐姐撞到了他。随后视频中不断插入街头现场的画面，展示了周围其他人的反应以及事件进一步的发展情况，包括现场不同人物的互动等，详细呈现了大爷摔倒后与小姐姐之间产生纠纷的整个过程，从小姐姐上前搀扶大爷，到大爷抓住小姐姐声称被撞，再到现场其他人的相关举动，完整地展现了这一具有争议性的街头事件的来龙去脉。

### 视频分类
时政社会

"""  # replace with the actual input

# Build the chat messages (system instruction + user input)
messages = [
    {"role": "system", "content": """你是一个发评机器人，针对输入的内容生成6条不同角度的评论内容，用换行符分隔。评论字数控制在12字左右。"""},
    {"role": "user", "content": user_input}
]

# Render the messages to a prompt string with the tokenizer's chat template,
# ending with the generation prompt so the model continues as the assistant.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# 3. Tokenize the prompt and move the tensors to the model's device
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# 4. Generate the reply (sampling enabled, no gradient tracking)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        # Adjust to match base_length=12.
        # NOTE(review): the system message asks for 6 comments of about 12
        # characters each; 50 new tokens may cut the output short — confirm.
        max_new_tokens=50,
        temperature=1,
        top_p=1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

# 5. Decode only the newly generated tokens (everything after the prompt tokens)
generated_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
# print(f"用户输入: {user_input}")
print(f"模型回复: {generated_text}")