#### 加载IDRR_data的dataframe

In [1]:
import pandas as pd
from IDRR_data import *

In [2]:
# Path handed to IDRRDataFrames as `data_path` (a CSV, judging by the file extension).
data_path = r'/data/whsun/idrr/data/raw/pdtb2.p1.csv'
# Dataset wrapper; later cells read its splits through .train_df / .dev_df / .test_df
# and its label inventory through .label_list.
df = IDRRDataFrames(
    data_name='pdtb2',
    data_level='top',
    data_relation='Implicit',
    data_path=data_path,
)

In [3]:
# Display the training split; the notebook repr truncates it to the first/last rows.
df.train_df

Unnamed: 0,arg1,arg2,conn1,conn2,conn1sense1,conn1sense2,conn2sense1,conn2sense2,relation,split,...,label12,label12id,label21,label21id,label22,label22id,ans_word1,ans_word1id,ans_word2,ans_word2id
1183,"In an Oct. 19 review of ""The Misanthrope"" at C...",Ms. Haag plays Elianti,however,,Comparison.Contrast.Juxtaposition,,,,Implicit,train,...,,,,,,,however,3,,
1184,BELL INDUSTRIES Inc. increased its quarterly t...,The new rate will be payable Feb. 15,and,,Expansion.Conjunction,,,,Implicit,train,...,,,,,,,and,11,,
1185,The new rate will be payable Feb. 15,A record date hasn't been set,however,,Comparison.Contrast.Juxtaposition,,,,Implicit,train,...,,,,,,,however,3,,
1186,Solo woodwind players have to be creative if t...,The oboist Heinz Holliger has taken a hard lin...,for example,,Expansion.Instantiation,,,,Implicit,train,...,,,,,,,instance,13,,
1187,"Richard Stoltzman has taken a gentler, more au...","Years ago, he collaborated with the new music ...",although,,Expansion.Restatement.Specification,,,,Implicit,train,...,,,,,,,although,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13810,Exports declined for the second month in a row...,An analyst called it one of the worst trade re...,in fact,,Expansion.Conjunction,,,,Implicit,train,...,,,,,,,and,11,,
13811,"Industrial output fell 0.1% in September, the ...",weaker capital spending and exports,because,,Contingency.Cause.Reason,,,,Implicit,train,...,,,,,,,because,4,,
13812,Three big drug makers posted robust third-quar...,"Merck's profit climbed 25%, Warner-Lambert's 2...",specifically,,Expansion.Restatement.Specification,,,,Implicit,train,...,,,,,,,specifically,15,,
13813,The B/T gene rearrangement test is more accura...,the test initially will be used in conjunction...,so,,Contingency.Cause.Result,,,,Implicit,train,...,,,,,,,so,5,,


In [4]:
from collections import Counter
# Label distribution of the training split.
# NOTE(review): this is not the last expression of the cell (function
# definitions follow), so the notebook does not display the counts.
Counter(df.train_df['label11'])
def write_jsonl(file, data):
    """Write an iterable of JSON-serialisable items to ``file``, one per line.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Iterable of objects accepted by ``json.dumps``.
    """
    # Imported locally, like read_jsonl / write_json in this cell, so the
    # helper does not depend on some other cell having imported ``json``.
    import json
    # Explicit encoding keeps the result independent of the platform default.
    with open(file, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
def read_txt(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, mode='r', encoding='utf-8') as src:
        return src.read()
def read_jsonl(file_path):
    """Load a JSON Lines file into a list.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded documents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import json
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (typically a trailing newline at the end of the file)
            # carry no record; json.loads would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data
def write_json(file, data):
    """Serialise ``data`` to ``file`` as indented, UTF-8 encoded JSON.

    Missing parent directories are created. Non-ASCII characters are written
    verbatim (``ensure_ascii=False``).

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any object accepted by ``json.dump``.
    """
    import json
    import os
    parent = os.path.dirname(file)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {file}")

#### parquet格式化数据 (以DeepSeek-R1-Distill-Qwen-1.5B为例)

In [5]:
from IDRR_data import IDRRDataFrames
# Rebuilds `df` with the same arguments as the first cell of the notebook,
# so this section can be run without that earlier cell.
df = IDRRDataFrames(
    data_name='pdtb2',
    data_level='top',
    data_relation='Implicit',
    data_path=r'/data/whsun/idrr/data/raw/pdtb2.p1.csv',
)
# Show the object's repr.
df

pdtb2_top_Implicit

In [None]:
# Prompt with {relation_terms}, {arg1} and {arg2} placeholders; it is filled in
# with str.format, which is why the literal braces of \boxed are doubled.
prompt_template = read_txt("/data/whsun/idrr/prompts/rl_base.txt")
prompt_template

'### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{{}}.\n\n### Relations\n{relation_terms}\n\n### Segments\nText segment 1: {arg1}\nText segment 2: {arg2}\n\nYour answer:\n'

In [17]:
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer whose chat template get_rl_data uses to render each prompt.
tokenizer = AutoTokenizer.from_pretrained("/data/whsun/pretrained_models/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

def get_rl_data(data_source: str, df, label_list):
    """Turn one data split into a ``Dataset`` of RL training records.

    Relies on the notebook-level ``prompt_template`` and ``tokenizer``.

    Args:
        data_source: Value stored in every record's ``data_source`` field.
        df: DataFrame with the columns ``arg1``, ``arg2`` and ``label11``.
        label_list: Ordered labels; the i-th label becomes option letter
            ``chr(65 + i)`` (A, B, C, ...).

    Returns:
        Dataset: One record per row with the keys ``data_source``, ``prompt``
        and ``reward_model``; the ground truth is the option letter of the
        row's ``label11``.
    """
    option_lines = []
    label2alpha = {}
    for position, label in enumerate(label_list):
        letter = chr(65 + position)
        option_lines.append(f"{letter}. {label}")
        label2alpha[label] = letter
    relation_terms = '\n'.join(option_lines)

    def build_record(row):
        user_prompt = prompt_template.format(
            relation_terms=relation_terms,
            arg1=row['arg1'],
            arg2=row['arg2'],
        )
        answer_letter = label2alpha[row["label11"]]
        # NOTE(review): the chat-templated string is stored as the content of a
        # user message again; if the consumer applies a chat template as well,
        # the special tokens end up duplicated. Confirm this is intended.
        rendered = tokenizer.apply_chat_template(
            [{"role": "user", "content": user_prompt}],
            tokenize=False,
            add_generation_prompt=False,
        )
        return {
            "data_source": data_source,
            "prompt": [{"content": rendered, "role": "user"}],
            "reward_model": {"ground_truth": answer_letter},
        }

    records = [build_record(row) for _, row in df.iterrows()]
    return Dataset.from_list(records, split="train")

# Convert the training split; the order of df.label_list fixes the label -> letter mapping.
train_rl_dataset = get_rl_data("pdtb", df.train_df, df.label_list)
# Inspect the first record as a sanity check.
train_rl_dataset[0]

{'data_source': 'pdtb',
 'prompt': [{'content': '<｜begin▁of▁sentence｜><｜User｜>### Task\nYou are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \\boxed{}.\n\n### Relations\nA. Comparison\nB. Contingency\nC. Expansion\nD. Temporal\n\n### Segments\nText segment 1: In an Oct. 19 review of "The Misanthrope" at Chicago\'s Goodman Theatre ("Revitalized Classics Take the Stage in Windy City," Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag\nText segment 2: Ms. Haag plays Elianti\n\nYour answer:\n',
   'role': 'user'}],
 'reward_model': {'ground_truth': 'A'}}

In [18]:
# Same conversion for the dev and test splits, using the same label order as for train.
dev_rl_dataset = get_rl_data("pdtb", df.dev_df, df.label_list)
test_rl_dataset = get_rl_data("pdtb", df.test_df, df.label_list)
dev_rl_dataset, test_rl_dataset

(Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1183
 }),
 Dataset({
     features: ['data_source', 'prompt', 'reward_model'],
     num_rows: 1046
 }))

In [19]:
# Persist the train and test splits as parquet; the dev export is currently disabled.
train_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/train.parquet")
# dev_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/dev.parquet")
test_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/test.parquet")

Creating parquet from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 512.68ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 949.80ba/s]


703089

#### 将label转化为关系定义进行sft
1. 将二级label转为关系定义
2. 模型1：论元对 -> 关系定义
3. 模型2：模型1给出文本 -> label

In [None]:
# Prompt with {arg1}/{arg2} placeholders for the argument-pair -> definition task.
# NOTE(review): the name is misspelled ("tmeplate") but write_alpaca_format reads
# this exact global, so it must not be renamed here alone.
prompt_tmeplate = read_txt("/data/sunwh/idrr/prompts/arg2def.txt")
prompt_tmeplate

'Briefly describe the relationship between two arguments and output the final relation label.\nArg1: {arg1}\nArg2: {arg2}'

In [6]:
# Maps a sense label (first level, or "First.Second" level) to its natural-language
# definition. write_alpaca_format copies these strings verbatim into the SFT
# targets, so spelling matters here.
sense2df = {
    "Temporal": "The situations described in the arguments are related temporally.",
    "Temporal.Asynchronous": "One event is described as preceding the other.",
    "Temporal.Synchrony": "There is some degree of temporal overlap between the events described by the arguments.",
    "Contingency": "One of the situations described in Arg1 and Arg2 causally influences the other.",
    "Contingency.Cause":"The situations described in the arguments are causally influenced and the two are not in a conditional relation.",
    "Contingency.Pragmatic cause":"Arg1 expresses a claim and Arg2 provides justification for this claim.",
    "Contingency.Condition": "The situation in Arg2 is taken to be the condition and the situation described in Arg1 is taken to be the consequence.",
    "Contingency.Pragmatic condition": "Used for instances of conditional constructions whose interpretation deviates from that of the semantics of “Condition”.",
    "Comparison": "A discourse relation is established between Arg1 and Arg2 in order to highlight prominent differences between the two situations.",
    "Comparison.Contrast": "Arg1 and Arg2 share a predicate or property and a difference is highlighted with respect to the values assigned to the shared property.",
    "Comparison.Pragmatic contrast": "A contrast between one of the arguments and an inference that can be drawn from the other, in many cases at the speech act level: The contrast is not between the situations described in Arg1 and Arg2.",
    "Comparison.Concession": "One argument denotes a fact that triggers a set of potential consequences, while the other denies one or more of them.",
    "Comparison.Pragmatic concession": "One argument denotes a fact that triggers a set of potential consequences, while the other denies one or more of them. The denial is not at the level of the situations described in Arg1 and Arg2, but rather at the level of inferences that can be drawn from them.",
    "Expansion": "Expanding the discourse and move its narrative or exposition forward.",
    "Expansion.Conjunction":"The situation described in Arg2 provides additional, discourse new, information that is related to the situation described in Arg1, but is not related to Arg1 in any of the ways described for other types of “EXPANSION”.",
    "Expansion.Instantiation":"Arg1 evokes a set and Arg2 describes it in further detail, It may be a set of events, a set of reasons, or a generic set of events, behaviors, attitudes, etc.",
    "Expansion.Restatement":"The semantics of Arg2 restates the semantics of Arg1. It is inferred that the situations described in Arg1 and Arg2 hold true at the same time.",
    "Expansion.Alternative":"Two arguments denote alternative situations.",
    "Expansion.Exception":"Arg2 specifies an exception to the generalization specified by Arg1. In other words, Arg1 is false because Arg2 is true, but if Arg2 were false, Arg1 would be true.",
    "Expansion.List":"Arguments are members of a list, defined in the prior discourse. “List” does not require the situations specified in Arg1 and Arg2 to be directly related."
}
def write_json(file, data):
    """Serialise ``data`` to ``file`` as indented, UTF-8 encoded JSON.

    NOTE(review): this notebook defines ``write_json`` twice; this later
    definition is the one in effect once the cell has run.

    Missing parent directories are created. Non-ASCII characters are written
    verbatim (``ensure_ascii=False``).

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any object accepted by ``json.dump``.
    """
    import json
    import os
    parent = os.path.dirname(file)
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {file}")

def write_alpaca_format(df, file_path, with_definition=None):
    """Export a split as Alpaca-style SFT records (instruction / input / output).

    Relies on the notebook-level ``prompt_tmeplate``, ``sense2df`` and
    ``write_json``.

    Args:
        df: DataFrame with the columns ``arg1``, ``arg2`` and ``conn1sense1``.
        file_path: Destination JSON file.
        with_definition: If True, the output is the sense definition followed
            by `` Relation: <sense>``; if False, the output is the sense alone.
            If None (default), the original size heuristic is used: splits
            with more than 9999 rows get the definition. In this notebook only
            the training split is that large.
    """
    if with_definition is None:
        # Decided once per call; the original re-evaluated this for every row.
        with_definition = len(df) > 9999

    alpaca_data = []
    for _, row in df.iterrows():
        prompt = prompt_tmeplate.format(
            arg1=row['arg1'],
            arg2=row['arg2'],
        )
        # Keep at most the first two levels of the sense hierarchy.
        sense = '.'.join(row['conn1sense1'].split('.')[:2])
        # Parenthesised on purpose: the conditional selects between the two
        # complete outputs, it does not apply to the suffix only.
        output = (sense2df[sense] + f' Relation: {sense}') if with_definition else sense
        alpaca_data.append(
            {
                "instruction": prompt,
                "input": "",
                "output": output,
            }
        )
    write_json(file_path, alpaca_data)

# Export all three splits.
# NOTE(review): the directory is spelled "aplaca" here while a later cell writes to
# an "alpaca" directory; confirm which spelling downstream consumers expect.
write_alpaca_format(df.train_df, "/data/sunwh/idrr/data/arg2def/pdtb2/aplaca/train.json")
write_alpaca_format(df.test_df, "/data/sunwh/idrr/data/arg2def/pdtb2/aplaca/test.json")
write_alpaca_format(df.dev_df, "/data/sunwh/idrr/data/arg2def/pdtb2/aplaca/dev.json")

Data saved to /data/sunwh/idrr/data/arg2def/pdtb2/aplaca/train.json
Data saved to /data/sunwh/idrr/data/arg2def/pdtb2/aplaca/test.json
Data saved to /data/sunwh/idrr/data/arg2def/pdtb2/aplaca/dev.json


#### 将easyr1的jsonl文件转为parquet格式

In [None]:



def write_rl_dataset(data_path):
    """Build an RL ``Dataset`` from a JSONL file of problem/answer records.

    Despite its name, this function writes nothing: it returns the dataset and
    the caller saves it. It relies on the notebook-level ``read_jsonl``.

    Args:
        data_path: JSONL file whose records have the keys ``problem`` and ``answer``.

    Returns:
        Dataset: One record per input item with the keys ``data_source``
        (always ``"pdtb"``), ``prompt`` and ``reward_model``.
    """
    data = read_jsonl(data_path)
    from transformers import AutoTokenizer
    from datasets import Dataset
    tokenizer = AutoTokenizer.from_pretrained("/data/whsun/pretrained_models/Qwen/Qwen3-0.6B")

    def to_record(item):
        # Render the problem as a single user turn, ending with the generation prompt.
        rendered = tokenizer.apply_chat_template(
            [{"role": "user", "content": item['problem']}],
            tokenize=False,
            add_generation_prompt=True,
        )
        return {
            "data_source": "pdtb",
            "prompt": [{"content": rendered, "role": "user"}],
            "reward_model": {"ground_truth": item['answer'].strip()},
        }

    return Dataset.from_list([to_record(item) for item in data], split="train")

# NOTE: rebinds train_rl_dataset / test_rl_dataset, which an earlier section also assigns.
train_rl_dataset = write_rl_dataset("/data/whsun/idrr/data/rl/easyr1/pdtb2/top/sft_rl_train.jsonl")
test_rl_dataset = write_rl_dataset("/data/whsun/idrr/data/rl/easyr1/pdtb2/top/sft_rl_test.jsonl")
# Inspect the first record as a sanity check.
train_rl_dataset[0]

{'data_source': 'pdtb',
 'prompt': [{'content': '<|im_start|>user\n# Instruction\nYou are an expert in the field of implicit discourse relation recognition. Your task is to analyze the implicit logical relation between two adjacent text segments and choose the most accurate relation from the options.\n\n# Input\nSegment 1: In an Oct. 19 review of "The Misanthrope" at Chicago\'s Goodman Theatre ("Revitalized Classics Take the Stage in Windy City," Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag\nSegment 2: Ms. Haag plays Elianti\nRelation option:\nA. Comparison\nB. Contingency\nC. Expansion\nD. Temporal<|im_end|>\n<|im_start|>assistant\n',
   'role': 'user'}],
 'reward_model': {'ground_truth': 'A'}}

In [13]:
# Persist the converted datasets as parquet.
train_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/sft_rl_train.parquet")
test_rl_dataset.to_parquet("/data/whsun/idrr/data/rl/verl/pdtb2/top/sft_rl_test.parquet")

Creating parquet from Arrow format: 100%|██████████| 13/13 [00:00<00:00, 86.21ba/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 645.72ba/s]


640329

#### 将parquet数据转成json

In [1]:
import pandas as pd
import json

def parquet_to_json_pandas(input_file, output_file, orient='records', indent=2):
    """
    Convert a Parquet file to a JSON file using pandas.

    Args:
        input_file: Path of the Parquet file to read.
        output_file: Path of the JSON file to write (overwritten if it exists).
        orient: JSON layout passed to ``DataFrame.to_json`` ('records' for a
            list of row objects, 'split' for the split layout, etc.).
        indent: Indentation width in spaces; ``None`` produces compact output.
    """
    frame = pd.read_parquet(input_file)

    # Serialise before opening the target, so a failed conversion never
    # truncates an existing output file. force_ascii=False keeps non-ASCII
    # characters unescaped.
    to_json_options = dict(orient=orient, indent=indent, force_ascii=False)
    serialized = frame.to_json(**to_json_options)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write(serialized)

    print(f"转换完成！输出文件: {output_file}")
    print(f"数据形状: {frame.shape}")

# Example usage. Inside a notebook kernel __name__ is "__main__", so this runs with the cell.
if __name__ == "__main__":
    parquet_to_json_pandas('/data/whsun/idrr/data/rl/verl/pdtb2/top/distill_qwen_1.5b_gen_test.parquet', '/data/whsun/idrr/data/rl/verl/pdtb2/top/distill_qwen_1.5b_gen_test.json')

转换完成！输出文件: /data/whsun/idrr/data/rl/verl/pdtb2/top/distill_qwen_1.5b_gen_test.json
数据形状: (1046, 4)


#### 根据lmf输出构造alpaca训练数据

In [None]:
# Generated predictions; the loop below reads the keys 'prompt', 'predict' and 'label' of each record.
data = read_jsonl('/data/whsun/idrr/results/rl_cold_start/pdtb2/top/Qwen3-8B-E1_by_qwen3_max-DAPO-lora/global_step_210/train/generated_predictions.jsonl')
# Collects the Alpaca-style records of the correctly answered samples.
new_data = []
import re
def re_search(text, type):
    '''
    Extract structured content of the given kind from ``text``.

    Supported kinds:
        'json': body of a ```json fenced block; if there is none, the widest
                ``{...}`` span in the text (greedy match).
        'box':  content of ``boxed{...}``.
        'xml':  body of a ```xml fenced block.

    If several spans match, a warning is logged and the last one is returned.

    Args:
        text: Text to search.
        type: One of 'json', 'box', 'xml'.

    Returns:
        str: The last matching span.

    Raises:
        ValueError: If ``type`` is not supported or nothing matches.
    '''
    import logging
    # The notebook never defines ``logger`` itself. Use one if it exists in the
    # namespace (e.g. via a star import), otherwise fall back to stdlib logging
    # so that reporting a problem cannot fail with a NameError.
    log = globals().get('logger') or logging.getLogger(__name__)

    patterns = {
        'json': r'```json\s*(.*?)\s*```',
        'box': r'boxed{(.*?)}',
        'xml': r'```xml\s*(.*?)\s*```',
    }
    if type not in patterns:
        # Previously an unknown kind searched with an empty pattern, which
        # matches everywhere and silently returned ''.
        raise ValueError(f"unsupported type {type!r}; expected one of {sorted(patterns)}")

    flags = re.DOTALL  # let '.' match newlines as well

    matches = re.findall(patterns[type], text, flags)
    if not matches and type == 'json':
        # Greedy match to capture a complete (possibly nested) dict structure.
        matches = re.findall(r'\{.*\}', text, flags)

    if not matches:
        log.error(f"no matches found for {type} in text:\n{text}\n")
        raise ValueError(f"no {type} content found in text")

    if len(matches) > 1:
        log.warning("the number of matches is greater than 1:")
        for i, match in enumerate(matches):
            log.debug(f"match{i}:\n{match}\n")

    structured_text = matches[-1]
    # if type == 'json':
    #     structured_text = repair_json(structured_text)
    return structured_text


# Keep only the samples whose boxed prediction equals the boxed label and turn
# them into Alpaca-style records.
correct_cnt = 0
unparsed_cnt = 0
for item in data:
    try:
        pred = re_search(item['predict'], 'box')
    except ValueError:
        # A generation without a boxed answer is simply a wrong answer; it
        # must not abort the whole filtering run.
        unparsed_cnt += 1
        continue
    # A label without a boxed answer means the file itself is broken, so that
    # error is deliberately left to propagate.
    label = re_search(item['label'], 'box')
    # print(f"Predicted: {pred}, Label: {label}")
    if pred == label:
        correct_cnt += 1
        new_data.append({
                # Strips the role markers from the rendered prompt. Note that
                # every occurrence of 'user\n' is removed, not only the leading one.
                'instruction': item['prompt'].replace('\nassistant\n', '').replace('user\n', ''),
                'input': '',
                'output': item['predict'],
            })

# Avoid a ZeroDivisionError when the prediction file is empty.
accuracy = correct_cnt / len(data) if data else 0.0
print(f"Accuracy: {correct_cnt}/{len(data)} = {accuracy:.4f}")
print(f"Total samples: {len(new_data)}")
print(f"Correct samples: {correct_cnt}")
if unparsed_cnt:
    print(f"Predictions without a boxed answer: {unparsed_cnt}")
write_json('/data/whsun/idrr/data/sft/rl_cold_start/pdtb2/top/alpaca/qwen3-8b-exp-dapo_lora-distill_train.json', new_data)

Accuracy: 9485/12632 = 0.7509
Total samples: 9485
Correct samples: 9485
Data saved to /data/whsun/idrr/data/sft/rl_cold_start/pdtb2/top/alpaca/qwen3-8b-exp-dapo_lora-distill_train.json


#### API平台批量推理数据构造

In [9]:
### Build the request-id -> gold option letter mapping
# Letters follow the order of df.label_list: the first label is 'A', the second 'B', ...
label2alpha = {label: chr(ord('A') + offset) for offset, label in enumerate(df.label_list)}

# Ids are "request-1", "request-2", ... in the row order of df.test_df.
data = {}
cnt = 1
for gold_label in df.test_df['label11']:
    data[f"request-{cnt}"] = label2alpha[gold_label]
    cnt += 1

write_json('/data/whsun/idrr/data/api/volcengine/pdtb2/top/id2label.json', data)

Data saved to /data/whsun/idrr/data/api/volcengine/pdtb2/top/id2label.json


##### volcengine

In [19]:
data = []
prompt_template = read_txt("/data/whsun/idrr/prompts/rl_base.txt")
print(prompt_template)
# Example of one line of the batch file:
# {"custom_id": "request-1", "body": {"messages": [{"role": "user", "content": "Why is the sky so blue?"}],"max_tokens": 1000,"top_p":1,"temperature":0.7}}

# The option list is the same for every request, so build it once instead of per row.
relation_terms = '\n'.join([f"{chr(65 + i)}. {label}" for i, label in enumerate(df.label_list)])

# Ids are numbered from 1 in the row order of df.test_df, the same scheme the
# id2label cell uses, so responses can be joined back to the gold labels.
cnt = 1
for _, row in df.test_df.iterrows():
    data.append(
        {
            'custom_id': f"request-{cnt}",
            'body': {
                'reasoning_effort': 'high',
                'messages': [
                    {
                    'role': 'user',
                    'content': prompt_template.format(
                        relation_terms=relation_terms,
                        arg1=row['arg1'],
                        arg2=row['arg2'],
                    )
                    }
                ],
                'max_tokens': 2048,
            }
        }
    )
    cnt += 1

write_jsonl('/data/whsun/idrr/data/api/volcengine/pdtb2/top/test.jsonl', data)

### Task
You are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \boxed{{}}.

### Relations
{relation_terms}

### Segments
Text segment 1: {arg1}
Text segment 2: {arg2}

Your answer:



##### 阿里云

In [6]:
data = []
prompt_template = read_txt("/data/whsun/idrr/prompts/rl_base.txt")
print(prompt_template)
# Example of the body of one request:
# {"custom_id": "request-1", "body": {"messages": [{"role": "user", "content": "Why is the sky so blue?"}],"max_tokens": 1000,"top_p":1,"temperature":0.7}}

# The option list is the same for every request, so build it once instead of per row.
relation_terms = '\n'.join([f"{chr(65 + i)}. {label}" for i, label in enumerate(df.label_list)])

# Ids are numbered from 1 in the row order of df.test_df, the same scheme the
# id2label cell uses, so responses can be joined back to the gold labels.
cnt = 1
for _, row in df.test_df.iterrows():
    data.append(
        {
            'custom_id': f"request-{cnt}",
            'method': 'POST',
            'url': '/v1/chat/completions',
            'body': {
                'model': 'deepseek-v3.2',
                'messages': [
                    {
                        'role': 'user',
                        'content': prompt_template.format(
                            relation_terms=relation_terms,
                            arg1=row['arg1'],
                            arg2=row['arg2'],
                        )
                    }
                ],
                'extra_body': {'enable_thinking': True},
            }
        }
    )
    cnt += 1

write_jsonl('/data/whsun/idrr/data/api/aliyun/pdtb2/top/ds_v3_2_think_test.jsonl', data)

### Task
You are an expert in the field of implicit discourse relations. Your task is to determine the semantic-logical relationship between two given text segments and select the most appropriate relation label. Output only one of A, B, C, or D, and enclose it in \boxed{{}}.

### Relations
{relation_terms}

### Segments
Text segment 1: {arg1}
Text segment 2: {arg2}

Your answer:

