In [6]:
import pandas as pd
import json

In [7]:
def generate_span_data(text_data, span_data):
    """Build one prompt record per sample by prefixing its text with span tags.

    Rows of ``text_data`` and entries of ``span_data`` are paired by position,
    so both inputs must describe the same samples in the same order.

    Args:
        text_data: DataFrame providing at least the ``filename`` and ``text``
            columns.
        span_data: Iterable of dicts; each one holds an ``item`` dict with the
            ``misogynistic`` and ``general_emotion`` span lists.

    Returns:
        A list of dicts with the keys ``filename``, ``text`` and ``prompt``.
        The prompt is the span tags followed directly by the original text.

    Raises:
        ValueError: If ``text_data`` and ``span_data`` differ in length.
    """
    records = text_data.to_dict(orient="records")
    span_data = list(span_data)

    # zip() would silently truncate to the shorter input and hide a
    # misalignment between texts and spans, so fail loudly instead.
    if len(records) != len(span_data):
        raise ValueError(
            f"text_data has {len(records)} rows but span_data has "
            f"{len(span_data)} entries; they must be aligned one-to-one"
        )

    prompt_data = []
    for span, row in zip(span_data, records):
        item = span['item']
        hateful_span = item['misogynistic']
        general_emo = item['general_emotion']

        tags = []
        if hateful_span:
            tags.append(f"<HATE_SPAN> {' | '.join(hateful_span)} </HATE_SPAN>")
        if general_emo:
            tags.append(f"<EMO_SPAN> {' | '.join(general_emo)} </EMO_SPAN>")
        # <NO_SPAN> is emitted only when neither span type is present; it used
        # to be appended whenever the emotion spans were empty, even right
        # after a <HATE_SPAN> tag.
        if not tags:
            tags.append("<NO_SPAN>")

        # The original text follows the tags without a separator.
        prompt = "".join(tags) + row['text']

        prompt_data.append({
            "filename": row['filename'],
            "text": row['text'],
            "prompt": prompt
        })
    return prompt_data


def concat_data(dataset_name):
    """Assemble the merged prompt/background/caption table for one split.

    Reads ``all/{dataset_name}.csv``,
    ``spans/emotion_spans_{dataset_name}.jsonl``,
    ``caption/{dataset_name}_caption_version.csv`` and
    ``background/{dataset_name}_background.json``; builds the prompts with
    ``generate_span_data``; prints row counts and a report of filenames that
    are missing from one table relative to another; then inner-joins the three
    tables on the normalised ``filename`` column.

    Args:
        dataset_name: Split name used to build the input paths (e.g. "train").

    Returns:
        DataFrame holding the prompt, background and caption tables joined on
        ``filename``. Columns shared by two tables keep pandas' default
        ``_x`` / ``_y`` suffixes.
    """
    original_data = pd.read_csv(f'all/{dataset_name}.csv')
    print(f"original_data: {len(original_data)}")

    # Load the span JSONL file, skipping blank lines.
    with open(f"spans/emotion_spans_{dataset_name}.jsonl", "r", encoding="utf-8") as f:
        emotion_span_data = [json.loads(line) for line in map(str.strip, f) if line]
    print(f"emotion_span_data: {len(emotion_span_data)}")

    prompt_data = pd.DataFrame(generate_span_data(original_data, emotion_span_data))
    print(f"prompt_data: {len(prompt_data)}")

    # Load captions and background knowledge.
    caption_data = pd.read_csv(f'caption/{dataset_name}_caption_version.csv')
    print(f"caption_data: {len(caption_data)}")
    # Explicit UTF-8 (same as the JSONL above): without it the decoding depends
    # on the platform locale and can fail on non-ASCII content.
    with open(f"background/{dataset_name}_background.json", "r", encoding="utf-8") as f:
        background_data = pd.DataFrame(json.load(f))
    print(f"background_data: {len(background_data)}")

    # Normalise the join key (strip whitespace, lower-case) in every table and
    # collect the distinct filenames of each one for the report below.
    tables = {
        "prompt": prompt_data,
        "caption": caption_data,
        "background": background_data,
    }
    filenames = {}
    for label, frame in tables.items():
        frame["filename"] = frame["filename"].str.strip().str.lower()
        filenames[label] = set(frame["filename"].unique())

    # Report, for every pair of tables and in both directions, how many
    # filenames of the first are absent from the second (first 10 shown).
    print("\n=== Filename 差异检查 ===")
    for left, right in (("prompt", "caption"), ("prompt", "background"), ("caption", "background")):
        for first, second in ((left, right), (right, left)):
            missing = filenames[first] - filenames[second]
            print(f"{first} - {second} 缺失: {len(missing)}")
            print(list(missing)[:10])

    # Merge: prompt with background first, then with caption. Later cells rely
    # on the suffixed column names this order produces.
    merge_data = pd.merge(prompt_data, background_data, on="filename", how="inner")
    merge_data = pd.merge(merge_data, caption_data, on="filename", how="inner")

    print(f"\n最终合并后数据量: {len(merge_data)}")

    return merge_data


In [8]:
dataset_dict = ["train", "test"]

# Build the merged table for each split, print its row count and save it
# under all/ as "<split>_data.csv".
for dataset_name in dataset_dict:
    merge_data = concat_data(dataset_name)
    row_count = len(merge_data)
    print(row_count)
    merge_data.to_csv(f"all/{dataset_name}_data.csv", index=False)

original_data: 1190
emotion_span_data: 1190
prompt_data: 1190
caption_data: 1190
background_data: 1190

=== Filename 差异检查 ===
prompt - caption 缺失: 0
[]
caption - prompt 缺失: 0
[]
prompt - background 缺失: 0
[]
background - prompt 缺失: 0
[]
caption - background 缺失: 0
[]
background - caption 缺失: 0
[]

最终合并后数据量: 1190
1190
original_data: 340
emotion_span_data: 340
prompt_data: 340
caption_data: 340
background_data: 340

=== Filename 差异检查 ===
prompt - caption 缺失: 0
[]
caption - prompt 缺失: 0
[]
prompt - background 缺失: 0
[]
background - prompt 缺失: 0
[]
caption - background 缺失: 0
[]
background - caption 缺失: 0
[]

最终合并后数据量: 340
340


In [10]:
# Reload the merged train split that the merge loop saved as all/train_data.csv.
train_data = pd.read_csv("all/train_data.csv")

In [11]:
# Drop the suffixed text columns, the second label column and "Unnamed: 0"
# (presumably a saved index from an upstream CSV — confirm), then expose the
# remaining label column under its plain name.
train_data = (
    train_data
    .drop(columns=["text_x", "text_y", "Unnamed: 0", "label_y"])
    .rename(columns={"label_x": "label"})
)

# Persist the cleaned train table.
train_data.to_csv("all/train_final.csv", index=False)

In [12]:
# Display the row count of the cleaned train table.
len(train_data)

1190

In [13]:
# Same cleanup as for the train split: load the merged test table, drop the
# suffixed text columns, the second label column and "Unnamed: 0", and rename
# the remaining label column.
test_data = (
    pd.read_csv("all/test_data.csv")
    .drop(columns=["text_x", "text_y", "Unnamed: 0", "label_y"])
    .rename(columns={"label_x": "label"})
)

# Persist the cleaned test table and report its row count.
test_data.to_csv("all/test_final.csv", index=False)
print(len(test_data))

340
