# Chinese RL Evaluation Pipeline

This notebook combines data merging and evaluation functionality for Chinese RL (认同逻辑) analysis.

## 1. Import Required Libraries

In [19]:
import pandas as pd
from pathlib import Path
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

## 2. Data Merging and Cleaning

In [20]:
# Load the sample data
sample = pd.read_csv('../0_data_collection/dataset.csv')
print(f"Loaded sample data with {len(sample)} rows")
sample.head()

Loaded sample data with 483 rows


Unnamed: 0.1,Unnamed: 0,comment_id,comment_type,text,Golden
0,7,reply_chengdu_1_61_1,reply,热烈欢迎咱叙北第一带盐人 北门可不差，论三环外，东西三环可不一定有北三环好，再等火北弄好，那...,1
1,12,reply_chengdu_2_2_5,reply,真的很多，我就是周边的，不管现实还是网上看的很多，他们天天说自己土著然后说我们弯脚杆，碰到过...,147
2,16,reply_chengdu_2_2_9,reply,我是都江堰的，其他地方的我不敢说，但是我可是从小被成都口音嘲笑哦，我小时候去亲戚家耍过暑假，...,4
3,17,reply_chengdu_2_2_11,reply,那我说都江堰话被笑的更多，我以前被介绍了个成都人，然后天天给我说他家以前二环以内的，说他家以...,14
4,18,reply_chengdu_2_2_13,reply,那证明成都人的确不咋地，本来成都话口音离普通话语区就够偏了，很多怪音，他们还歧视本省的人，无...,4


In [21]:
# Merge results from all CSV files in the results directory
for file in Path('../2_run_llms/llm_outputs').glob("*.csv"):
    if not file.stem.startswith('sample') and not file.stem.endswith('en'):
        df = pd.read_csv(file)
        model = file.stem
        sample[model] = df['RL_Types']
        print(f"Added model: {model}")

print(f"\nFinal dataframe shape: {sample.shape}")
print(f"Columns: {list(sample.columns)}")

Added model: Qwen3-235B-A22B_few_shot
Added model: Qwen3-32B_no_cot
Added model: deepseek_zero_shot
Added model: gpt41_few_shot
Added model: gemma-3-27b-it_zero_shot
Added model: deepseek_no_cot
Added model: Qwen3-32B_few_shot
Added model: Qwen3-32B_zero_shot
Added model: gpt41_no_cot
Added model: deepseek_few_shot
Added model: gemma-3-27b-it_no_cot
Added model: gpt41_zero_shot
Added model: Qwen3-235B-A22B_zero_shot
Added model: gemma-3-27b-it_few_shot
Added model: Qwen3-235B-A22B_no_cot

Final dataframe shape: (483, 20)
Columns: ['Unnamed: 0', 'comment_id', 'comment_type', 'text', 'Golden', 'Qwen3-235B-A22B_few_shot', 'Qwen3-32B_no_cot', 'deepseek_zero_shot', 'gpt41_few_shot', 'gemma-3-27b-it_zero_shot', 'deepseek_no_cot', 'Qwen3-32B_few_shot', 'Qwen3-32B_zero_shot', 'gpt41_no_cot', 'deepseek_few_shot', 'gemma-3-27b-it_no_cot', 'gpt41_zero_shot', 'Qwen3-235B-A22B_zero_shot', 'gemma-3-27b-it_few_shot', 'Qwen3-235B-A22B_no_cot']


## 3. Define Cleaning Functions

In [22]:
def clean_RL_output(text):
    """Clean English RL outputs (e.g., 'RL1', 'RL23')"""
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        return text

    # Preserve placeholder for "no RLs" if it's simply "-"
    if text.strip() == "-":
        return "-"

    # Find all occurrences of RL followed by digits (e.g., "RL1", "RL23")
    RL_patterns = re.findall(r'RL\d+', text)
    
    # Remove duplicates while preserving order of first appearance
    unique_RLs = []
    seen = set()
    for RL in RL_patterns:
        if RL not in seen:
            unique_RLs.append(RL)
            seen.add(RL)

    return ", ".join(unique_RLs)


def clean_zh_output(text):
    """Clean Chinese RL outputs (e.g., '认同逻辑1', '认同逻辑4')"""
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        return text

    # Preserve placeholder for "no RLs" if it's simply "-"
    if text.strip() == "-":
        return "-"

    # Find all occurrences of RL followed by digits (e.g., "认同逻辑1", "认同逻辑23")
    RL_patterns = re.findall(r'认同逻辑\d+', text)
    
    # Remove duplicates while preserving order of first appearance
    unique_RLs = []
    seen = set()
    for RL in RL_patterns:
        if RL not in seen:
            unique_RLs.append(RL)
            seen.add(RL)

    return ", ".join(unique_RLs)

In [23]:
# Identify model output columns to be cleaned
known_non_model_cols = ['Unnamed: 0', 'comment_id', 'comment_type', 'text', 'Golden']
model_cols_to_clean = [col for col in sample.columns if col not in known_non_model_cols]

print(f"Model columns to clean: {model_cols_to_clean}")

# Apply the cleaning function to each identified model output column
for col_name in model_cols_to_clean:
    if col_name in sample.columns:
        if col_name.endswith('en'):
            sample[col_name] = sample[col_name].apply(clean_RL_output)
        else:
            sample[col_name] = sample[col_name].apply(clean_zh_output)
        print(f"Cleaned column: {col_name}")

# Save the cleaned data
sample.to_csv('sample_all.csv', index=False)
print("\nSaved cleaned data to 'sample_all.csv'")

Model columns to clean: ['Qwen3-235B-A22B_few_shot', 'Qwen3-32B_no_cot', 'deepseek_zero_shot', 'gpt41_few_shot', 'gemma-3-27b-it_zero_shot', 'deepseek_no_cot', 'Qwen3-32B_few_shot', 'Qwen3-32B_zero_shot', 'gpt41_no_cot', 'deepseek_few_shot', 'gemma-3-27b-it_no_cot', 'gpt41_zero_shot', 'Qwen3-235B-A22B_zero_shot', 'gemma-3-27b-it_few_shot', 'Qwen3-235B-A22B_no_cot']
Cleaned column: Qwen3-235B-A22B_few_shot
Cleaned column: Qwen3-32B_no_cot
Cleaned column: deepseek_zero_shot
Cleaned column: gpt41_few_shot
Cleaned column: gemma-3-27b-it_zero_shot
Cleaned column: deepseek_no_cot
Cleaned column: Qwen3-32B_few_shot
Cleaned column: Qwen3-32B_zero_shot
Cleaned column: gpt41_no_cot
Cleaned column: deepseek_few_shot
Cleaned column: gemma-3-27b-it_no_cot
Cleaned column: gpt41_zero_shot
Cleaned column: Qwen3-235B-A22B_zero_shot
Cleaned column: gemma-3-27b-it_few_shot
Cleaned column: Qwen3-235B-A22B_no_cot

Saved cleaned data to 'sample_all.csv'


## 4. Define Evaluation Functions

In [24]:
def parse_annotator(value):
    """Parses the 'Golden' column (e.g., "147" means labels 1, 4, 7)."""
    if pd.isna(value) or str(value).strip() == "-":
        return set()
    # Ensure value is treated as a string of digits
    return set(map(int, list(str(int(value))))) # int(value) handles potential float like 4.0

def parse_RL_category(value):
    """Parses the 'RL Category' column (e.g., "认同逻辑1, 认同逻辑4" or "认同逻辑1")."""
    if pd.isna(value) or not str(value).strip():
        return set()
    
    labels = set()
    # Split by comma, then process each part
    items = str(value).split(',')
    for item in items:
        item = item.strip() # Remove leading/trailing whitespace
        if "认同逻辑" in item:
            try:
                # Extract digits after "认同逻辑"
                label_num = int(item.split("认同逻辑")[1])
                labels.add(label_num)
            except ValueError:
                # Handle cases where number parsing fails
                print(f"Warning: Could not parse number from item: '{item}' in '{value}'")
    return labels

def evaluate_predictions(y_true_parsed_list, y_pred_parsed_list, true_label_source_name, pred_label_source_name):
    """
    Calculates and prints F1 scores and classification report for a given pair of true and predicted labels.
    Filters out samples where true labels are empty.
    """
    print(f"\n\n{'='*10} EVALUATING: {pred_label_source_name} (Predictions) vs. {true_label_source_name} (Golden Labels) {'='*10}")

    # Filter out samples where true labels are empty
    filtered_pairs = [(true, pred) for true, pred in zip(y_true_parsed_list, y_pred_parsed_list) if true]
    
    if not filtered_pairs:
        print("No annotated samples found after filtering. Skipping metrics calculation.")
        return
        
    y_true_filtered, y_pred_filtered = zip(*filtered_pairs)
    
    # Determine all unique labels present in this specific pairing
    all_labels_in_pair = set()
    for labels_set in y_true_filtered:
        all_labels_in_pair.update(labels_set)
    for labels_set in y_pred_filtered:
        all_labels_in_pair.update(labels_set)

    if not all_labels_in_pair:
        print("No labels found for this evaluation pair. Skipping metrics calculation.")
        return

    sorted_unique_labels = sorted(list(all_labels_in_pair))

    # Binarize labels for this specific pair
    mlb = MultiLabelBinarizer(classes=sorted_unique_labels)
    y_true_binarized = mlb.fit_transform(y_true_filtered)
    y_pred_binarized = mlb.transform(y_pred_filtered) # Use transform for predictions

    print(f"Classes considered for this evaluation: {mlb.classes_}")
    print(f"Number of samples after filtering non-annotated: {len(y_true_filtered)}")

    # Calculate F1 Scores
    print("\n--- F1 Scores ---")
    f1_micro = f1_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    print(f"F1 Score (micro): {f1_micro:.4f}")

    f1_macro = f1_score(y_true_binarized, y_pred_binarized, average='macro', zero_division=0)
    print(f"F1 Score (macro): {f1_macro:.4f}")

    f1_weighted = f1_score(y_true_binarized, y_pred_binarized, average='weighted', zero_division=0)
    print(f"F1 Score (weighted): {f1_weighted:.4f}")

    f1_samples = f1_score(y_true_binarized, y_pred_binarized, average='samples', zero_division=0)
    print(f"F1 Score (samples): {f1_samples:.4f}")
    print("-" * 20)

    # Full Classification Report
    report_target_names = [f"RL{label}" for label in mlb.classes_]

    print("\n--- Classification Report ---")
    try:
        report = classification_report(
            y_true_binarized,
            y_pred_binarized,
            target_names=report_target_names,
            zero_division=0
        )
        print(report)
    except ValueError as e:
        print(f"Could not generate classification report: {e}")
        print("This can happen if some classes in `target_names` are not present in `y_true_binarized` or `y_pred_binarized` after binarization.")

    precision_micro_overall = precision_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    recall_micro_overall = recall_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    print(f"\nOverall Micro Precision: {precision_micro_overall:.4f}")
    print(f"Overall Micro Recall:    {recall_micro_overall:.4f}")

## 5. Run Evaluation

In [25]:
# Load the cleaned data for evaluation
df = pd.read_csv('sample_all.csv')
print(f"Loaded evaluation data with {len(df)} rows")
df.head()

Loaded evaluation data with 483 rows


Unnamed: 0.1,Unnamed: 0,comment_id,comment_type,text,Golden,Qwen3-235B-A22B_few_shot,Qwen3-32B_no_cot,deepseek_zero_shot,gpt41_few_shot,gemma-3-27b-it_zero_shot,deepseek_no_cot,Qwen3-32B_few_shot,Qwen3-32B_zero_shot,gpt41_no_cot,deepseek_few_shot,gemma-3-27b-it_no_cot,gpt41_zero_shot,Qwen3-235B-A22B_zero_shot,gemma-3-27b-it_few_shot,Qwen3-235B-A22B_no_cot
0,7,reply_chengdu_1_61_1,reply,热烈欢迎咱叙北第一带盐人 北门可不差，论三环外，东西三环可不一定有北三环好，再等火北弄好，那...,1,"认同逻辑1, 认同逻辑5",认同逻辑1,"认同逻辑1, 认同逻辑5",认同逻辑1,,"认同逻辑1, 认同逻辑5",认同逻辑1,"认同逻辑1, 认同逻辑5",,"认同逻辑1, 认同逻辑5, 认同逻辑6",认同逻辑1,认同逻辑1,"认同逻辑1, 认同逻辑5",认同逻辑1,认同逻辑1
1,12,reply_chengdu_2_2_5,reply,真的很多，我就是周边的，不管现实还是网上看的很多，他们天天说自己土著然后说我们弯脚杆，碰到过...,147,"认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4, 认同逻辑6","认同逻辑1, 认同逻辑4, 认同逻辑6","认同逻辑1, 认同逻辑4, 认同逻辑6","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑3, 认同逻辑4, 认同逻辑6","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4, 认同逻辑6, 认同逻辑7","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑3, 认同逻辑4","认同逻辑1, 认同逻辑2, 认同逻辑3, 认同逻辑4","认同逻辑1, 认同逻辑4, 认同逻辑6, 认同逻辑3","认同逻辑1, 认同逻辑4"
2,16,reply_chengdu_2_2_9,reply,我是都江堰的，其他地方的我不敢说，但是我可是从小被成都口音嘲笑哦，我小时候去亲戚家耍过暑假，...,4,认同逻辑4,认同逻辑4,"认同逻辑1, 认同逻辑4",认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,"认同逻辑3, 认同逻辑4, 认同逻辑1",认同逻辑4,"认同逻辑3, 认同逻辑4"
3,17,reply_chengdu_2_2_11,reply,那我说都江堰话被笑的更多，我以前被介绍了个成都人，然后天天给我说他家以前二环以内的，说他家以...,14,"认同逻辑1, 认同逻辑4","认同逻辑4, 认同逻辑1","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4",,"认同逻辑1, 认同逻辑4","认同逻辑3, 认同逻辑4","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4, 认同逻辑6",认同逻辑4,"认同逻辑1, 认同逻辑4","认同逻辑4, 认同逻辑1, 认同逻辑3","认同逻辑1, 认同逻辑4","认同逻辑1, 认同逻辑4"
4,18,reply_chengdu_2_2_13,reply,那证明成都人的确不咋地，本来成都话口音离普通话语区就够偏了，很多怪音，他们还歧视本省的人，无...,4,认同逻辑4,"认同逻辑1, 认同逻辑4",认同逻辑4,认同逻辑4,"认同逻辑4, 认同逻辑6",认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,认同逻辑4,,认同逻辑4,认同逻辑4,认同逻辑4


In [26]:
# Parse the golden standard annotations
annotator = df['Golden'].apply(parse_annotator)
print(f"Parsed {len(annotator)} golden standard annotations")

# Evaluate each model
for column in df.columns[5:]:
    if column.endswith('en'):
        continue
    print(f"\n\n -----------------\nEvaluating {column}")
    predictions = df[column].apply(parse_RL_category)

    # Evaluate predictions against golden standard
    evaluate_predictions(
        annotator.tolist(),
        predictions.tolist(),
        true_label_source_name="Golden Standard",
        pred_label_source_name=column
    )

print("\n\nEvaluation completed!")

Parsed 483 golden standard annotations


 -----------------
Evaluating Qwen3-235B-A22B_few_shot


Classes considered for this evaluation: [1 2 3 4 5 6 7]
Number of samples after filtering non-annotated: 343

--- F1 Scores ---
F1 Score (micro): 0.7901
F1 Score (macro): 0.7164
F1 Score (weighted): 0.7924
F1 Score (samples): 0.8027
--------------------

--- Classification Report ---
              precision    recall  f1-score   support

         RL1       0.86      0.83      0.84       157
         RL2       0.61      0.92      0.73        37
         RL3       0.74      0.84      0.79        75
         RL4       0.84      0.88      0.86        92
         RL5       0.61      0.77      0.68        35
         RL6       0.72      0.84      0.78        50
         RL7       0.33      0.33      0.33        15

   micro avg       0.75      0.83      0.79       461
   macro avg       0.67      0.77      0.72       461
weighted avg       0.77      0.83      0.79       461
 samples avg       0.

F1 Score (samples): 0.6440
--------------------

--- Classification Report ---
              precision    recall  f1-score   support

         RL1       0.75      0.70      0.72       157
         RL2       0.52      0.65      0.58        37
         RL3       0.61      0.72      0.66        75
         RL4       0.84      0.75      0.79        92
         RL5       0.56      0.51      0.54        35
         RL6       0.52      0.64      0.57        50
         RL7       0.33      0.07      0.11        15

   micro avg       0.67      0.67      0.67       461
   macro avg       0.59      0.58      0.57       461
weighted avg       0.67      0.67      0.67       461
 samples avg       0.63      0.71      0.64       461


Overall Micro Precision: 0.6696
Overall Micro Recall:    0.6681


 -----------------
Evaluating deepseek_no_cot


Classes considered for this evaluation: [1 2 3 4 5 6 7]
Number of samples after filtering non-annotated: 343

--- F1 Scores ---
F1 Score (micro): 0.7325
F1

## 6. Summary and Analysis

In [27]:
# Display summary statistics
print("Data Summary:")
print(f"Total samples: {len(df)}")
print(f"Number of models evaluated: {len([col for col in df.columns[5:] if not col.endswith('en')])}")

Data Summary:
Total samples: 483
Number of models evaluated: 15
