# English RL Evaluation Pipeline

This notebook combines data merging and evaluation functionality for English RL (Reasoning Logic) analysis.

## 1. Import Required Libraries

In [1]:
import pandas as pd
from pathlib import Path
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

## 2. Data Merging and Cleaning

In [2]:
# Load the sample data
sample = pd.read_csv('../0_data_collection/dataset.csv')
print(f"Loaded sample data with {len(sample)} rows")
sample.head()

Loaded sample data with 483 rows


Unnamed: 0.1,Unnamed: 0,comment_id,comment_type,text,Golden
0,7,reply_chengdu_1_61_1,reply,热烈欢迎咱叙北第一带盐人 北门可不差，论三环外，东西三环可不一定有北三环好，再等火北弄好，那...,1
1,12,reply_chengdu_2_2_5,reply,真的很多，我就是周边的，不管现实还是网上看的很多，他们天天说自己土著然后说我们弯脚杆，碰到过...,147
2,16,reply_chengdu_2_2_9,reply,我是都江堰的，其他地方的我不敢说，但是我可是从小被成都口音嘲笑哦，我小时候去亲戚家耍过暑假，...,4
3,17,reply_chengdu_2_2_11,reply,那我说都江堰话被笑的更多，我以前被介绍了个成都人，然后天天给我说他家以前二环以内的，说他家以...,14
4,18,reply_chengdu_2_2_13,reply,那证明成都人的确不咋地，本来成都话口音离普通话语区就够偏了，很多怪音，他们还歧视本省的人，无...,4


In [3]:
# Merge results from all CSV files in the results directory
for file in Path('../2_run_llms/llm_outputs').glob("*_en.csv"):
    if not file.stem.startswith('sample'):
        df = pd.read_csv(file)
        model = file.stem
        sample[model] = df['RL_Types']
        print(f"Added model: {model}")

print(f"\nFinal dataframe shape: {sample.shape}")
print(f"Columns: {list(sample.columns)}")

Added model: gpt41_zero_shot_en
Added model: deepseek_few_shot_en
Added model: Qwen3-235B-A22B_no_cot_en
Added model: gemma-3-27b-it_no_cot_en
Added model: Qwen3-235B-A22B_zero_shot_en
Added model: Qwen3-32B_few_shot_en
Added model: gpt41_few_shot_en
Added model: Qwen3-32B_zero_shot_en
Added model: gemma-3-27b-it_zero_shot_en
Added model: deepseek_zero_shot_en
Added model: gemma-3-27b-it_few_shot_en
Added model: gpt41_no_cot_en
Added model: Qwen3-235B-A22B_few_shot_en
Added model: deepseek_no_cot_en
Added model: Qwen3-32B_no_cot_en

Final dataframe shape: (483, 20)
Columns: ['Unnamed: 0', 'comment_id', 'comment_type', 'text', 'Golden', 'gpt41_zero_shot_en', 'deepseek_few_shot_en', 'Qwen3-235B-A22B_no_cot_en', 'gemma-3-27b-it_no_cot_en', 'Qwen3-235B-A22B_zero_shot_en', 'Qwen3-32B_few_shot_en', 'gpt41_few_shot_en', 'Qwen3-32B_zero_shot_en', 'gemma-3-27b-it_zero_shot_en', 'deepseek_zero_shot_en', 'gemma-3-27b-it_few_shot_en', 'gpt41_no_cot_en', 'Qwen3-235B-A22B_few_shot_en', 'deepseek_no_

## 3. Define Cleaning Functions

In [4]:
def clean_RL_output(text):
    """Clean English RL outputs (e.g., 'RL1', 'RL23', 'Output: Recognition Logic 1')"""
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        return text

    # Preserve placeholder for "no RLs" if it's simply "-"
    if text.strip() == "-":
        return "-"

    # Handle "Output: Recognition Logic 1, Recognition Logic 4" format
    if "Output:" in text and "Recognition Logic" in text:
        # Extract the part after "Output:"
        output_part = text.split("Output:")[1].strip()
        # Find all "Recognition Logic N" patterns
        logic_patterns = re.findall(r'Recognition Logic (\d+)', output_part)
        # Convert to RL format
        RL_patterns = [f"RL{num}" for num in logic_patterns]
    else:
        # Find all occurrences of RL followed by digits (e.g., "RL1", "RL23")
        RL_patterns = re.findall(r'RL\d+', text)
    
    # Remove duplicates while preserving order of first appearance
    unique_RLs = []
    seen = set()
    for RL in RL_patterns:
        if RL not in seen:
            unique_RLs.append(RL)
            seen.add(RL)

    return ", ".join(unique_RLs)


def clean_zh_output(text):
    """Clean Chinese RL outputs (e.g., '认同逻辑1', '认同逻辑4')"""
    if pd.isna(text):
        return text
    if not isinstance(text, str):
        return text

    # Preserve placeholder for "no RLs" if it's simply "-"
    if text.strip() == "-":
        return "-"

    # Find all occurrences of RL followed by digits (e.g., "认同逻辑1", "认同逻辑23")
    RL_patterns = re.findall(r'认同逻辑\d+', text)
    
    # Remove duplicates while preserving order of first appearance
    unique_RLs = []
    seen = set()
    for RL in RL_patterns:
        if RL not in seen:
            unique_RLs.append(RL)
            seen.add(RL)

    return ", ".join(unique_RLs)

In [5]:
# Identify model output columns to be cleaned
known_non_model_cols = ['Unnamed: 0', 'comment_id', 'comment_type', 'text', 'Golden']
model_cols_to_clean = [col for col in sample.columns if col not in known_non_model_cols]

print(f"Model columns to clean: {model_cols_to_clean}")

# Apply the cleaning function to each identified model output column
for col_name in model_cols_to_clean:
    if col_name in sample.columns:
        if col_name.endswith('en'):
            sample[col_name] = sample[col_name].apply(clean_RL_output)
        else:
            sample[col_name] = sample[col_name].apply(clean_zh_output)
        print(f"Cleaned column: {col_name}")

Model columns to clean: ['gpt41_zero_shot_en', 'deepseek_few_shot_en', 'Qwen3-235B-A22B_no_cot_en', 'gemma-3-27b-it_no_cot_en', 'Qwen3-235B-A22B_zero_shot_en', 'Qwen3-32B_few_shot_en', 'gpt41_few_shot_en', 'Qwen3-32B_zero_shot_en', 'gemma-3-27b-it_zero_shot_en', 'deepseek_zero_shot_en', 'gemma-3-27b-it_few_shot_en', 'gpt41_no_cot_en', 'Qwen3-235B-A22B_few_shot_en', 'deepseek_no_cot_en', 'Qwen3-32B_no_cot_en']
Cleaned column: gpt41_zero_shot_en
Cleaned column: deepseek_few_shot_en
Cleaned column: Qwen3-235B-A22B_no_cot_en
Cleaned column: gemma-3-27b-it_no_cot_en
Cleaned column: Qwen3-235B-A22B_zero_shot_en
Cleaned column: Qwen3-32B_few_shot_en
Cleaned column: gpt41_few_shot_en
Cleaned column: Qwen3-32B_zero_shot_en
Cleaned column: gemma-3-27b-it_zero_shot_en
Cleaned column: deepseek_zero_shot_en
Cleaned column: gemma-3-27b-it_few_shot_en
Cleaned column: gpt41_no_cot_en
Cleaned column: Qwen3-235B-A22B_few_shot_en
Cleaned column: deepseek_no_cot_en
Cleaned column: Qwen3-32B_no_cot_en


## 4. Define Evaluation Functions

In [6]:
def parse_annotator(value):
    """Parses the 'Golden' column (e.g., "147" means labels 1, 4, 7)."""
    if pd.isna(value) or str(value).strip() == "-":
        return set()
    # Ensure value is treated as a string of digits
    return set(map(int, list(str(int(value))))) # int(value) handles potential float like 4.0

def parse_RL_category_en(value):
    """Parses the 'RL Category' column for English outputs (e.g., "RL1, RL4" or "RL1")."""
    if pd.isna(value) or not str(value).strip():
        return set()
    
    labels = set()
    value_str = str(value).strip()
    
    # Handle "Output: Recognition Logic 1, Recognition Logic 4" format
    if "Output:" in value_str and "Recognition Logic" in value_str:
        # Extract the part after "Output:"
        output_part = value_str.split("Output:")[1].strip()
        # Find all "Recognition Logic N" patterns
        logic_patterns = re.findall(r'Recognition Logic (\d+)', output_part)
        for logic_num in logic_patterns:
            try:
                labels.add(int(logic_num))
            except ValueError:
                print(f"Warning: Could not parse number from 'Recognition Logic {logic_num}' in '{value}'")
        return labels
    
    # Handle "RL1, RL4" format (existing logic)
    items = value_str.split(',')
    for item in items:
        item = item.strip() # Remove leading/trailing whitespace
        if "RL" in item:
            try:
                # Extract digits after "RL"
                label_num = int(item.split("RL")[1])
                labels.add(label_num)
            except ValueError:
                # Handle cases where number parsing fails
                print(f"Warning: Could not parse number from item: '{item}' in '{value}'")
    
    return labels

def evaluate_predictions(y_true_parsed_list, y_pred_parsed_list, true_label_source_name, pred_label_source_name):
    """
    Calculates and prints F1 scores and classification report for a given pair of true and predicted labels.
    Filters out samples where true labels are empty.
    """
    print(f"\n\n{'='*10} EVALUATING: {pred_label_source_name} (Predictions) vs. {true_label_source_name} (Golden Labels) {'='*10}")

    # Filter out samples where true labels are empty
    filtered_pairs = [(true, pred) for true, pred in zip(y_true_parsed_list, y_pred_parsed_list) if true]
    
    if not filtered_pairs:
        print("No annotated samples found after filtering. Skipping metrics calculation.")
        return
        
    y_true_filtered, y_pred_filtered = zip(*filtered_pairs)
    
    # Determine all unique labels present in this specific pairing
    all_labels_in_pair = set()
    for labels_set in y_true_filtered:
        all_labels_in_pair.update(labels_set)
    for labels_set in y_pred_filtered:
        all_labels_in_pair.update(labels_set)

    if not all_labels_in_pair:
        print("No labels found for this evaluation pair. Skipping metrics calculation.")
        return

    sorted_unique_labels = sorted(list(all_labels_in_pair))

    # Binarize labels for this specific pair
    mlb = MultiLabelBinarizer(classes=sorted_unique_labels)
    y_true_binarized = mlb.fit_transform(y_true_filtered)
    y_pred_binarized = mlb.transform(y_pred_filtered) # Use transform for predictions

    print(f"Classes considered for this evaluation: {mlb.classes_}")
    print(f"Number of samples after filtering non-annotated: {len(y_true_filtered)}")

    # Calculate F1 Scores
    print("\n--- F1 Scores ---")
    f1_micro = f1_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    print(f"F1 Score (micro): {f1_micro:.4f}")

    f1_macro = f1_score(y_true_binarized, y_pred_binarized, average='macro', zero_division=0)
    print(f"F1 Score (macro): {f1_macro:.4f}")

    f1_weighted = f1_score(y_true_binarized, y_pred_binarized, average='weighted', zero_division=0)
    print(f"F1 Score (weighted): {f1_weighted:.4f}")

    f1_samples = f1_score(y_true_binarized, y_pred_binarized, average='samples', zero_division=0)
    print(f"F1 Score (samples): {f1_samples:.4f}")
    print("-" * 20)

    # Full Classification Report
    report_target_names = [f"RL{label}" for label in mlb.classes_]

    print("\n--- Classification Report ---")
    try:
        report = classification_report(
            y_true_binarized,
            y_pred_binarized,
            target_names=report_target_names,
            zero_division=0
        )
        print(report)
    except ValueError as e:
        print(f"Could not generate classification report: {e}")
        print("This can happen if some classes in `target_names` are not present in `y_true_binarized` or `y_pred_binarized` after binarization.")

    precision_micro_overall = precision_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    recall_micro_overall = recall_score(y_true_binarized, y_pred_binarized, average='micro', zero_division=0)
    print(f"\nOverall Micro Precision: {precision_micro_overall:.4f}")
    print(f"Overall Micro Recall:    {recall_micro_overall:.4f}")

## 5. Run Evaluation

In [7]:
# Load the cleaned data for evaluation
sample.head()

Unnamed: 0.1,Unnamed: 0,comment_id,comment_type,text,Golden,gpt41_zero_shot_en,deepseek_few_shot_en,Qwen3-235B-A22B_no_cot_en,gemma-3-27b-it_no_cot_en,Qwen3-235B-A22B_zero_shot_en,Qwen3-32B_few_shot_en,gpt41_few_shot_en,Qwen3-32B_zero_shot_en,gemma-3-27b-it_zero_shot_en,deepseek_zero_shot_en,gemma-3-27b-it_few_shot_en,gpt41_no_cot_en,Qwen3-235B-A22B_few_shot_en,deepseek_no_cot_en,Qwen3-32B_no_cot_en
0,7,reply_chengdu_1_61_1,reply,热烈欢迎咱叙北第一带盐人 北门可不差，论三环外，东西三环可不一定有北三环好，再等火北弄好，那...,1,"RL1, RL5","RL1, RL5","RL1, RL5","RL1, RL5",RL1,RL1,RL1,"RL1, RL5","RL1, RL2, RL5","RL1, RL5",RL1,"RL1, RL5",RL1,"RL1, RL5",RL1
1,12,reply_chengdu_2_2_5,reply,真的很多，我就是周边的，不管现实还是网上看的很多，他们天天说自己土著然后说我们弯脚杆，碰到过...,147,"RL1, RL4","RL1, RL4, RL7",,"RL1, RL2, RL3, RL4, RL6","RL1, RL3, RL4","RL1, RL4, RL3","RL1, RL4, RL6","RL1, RL4","RL1, RL3, RL4, RL6","RL1, RL4","RL1, RL4, RL5, RL6","RL1, RL4","RL1, RL4, RL7","RL1, RL4",
2,16,reply_chengdu_2_2_9,reply,我是都江堰的，其他地方的我不敢说，但是我可是从小被成都口音嘲笑哦，我小时候去亲戚家耍过暑假，...,4,RL4,RL4,,RL4,RL4,RL4,RL4,"RL1, RL4",RL4,RL4,RL4,RL4,RL4,RL4,"RL1, RL4"
3,17,reply_chengdu_2_2_11,reply,那我说都江堰话被笑的更多，我以前被介绍了个成都人，然后天天给我说他家以前二环以内的，说他家以...,14,"RL1, RL4","RL1, RL3, RL4","RL4, RL3","RL4, RL6","RL1, RL3","RL1, RL4",RL4,"RL1, RL4",,"RL3, RL4","RL3, RL4","RL1, RL4",RL4,"RL3, RL4","RL1, RL4"
4,18,reply_chengdu_2_2_13,reply,那证明成都人的确不咋地，本来成都话口音离普通话语区就够偏了，很多怪音，他们还歧视本省的人，无...,4,RL4,RL4,,"RL4, RL1","RL1, RL4",RL4,RL4,RL4,"RL1, RL4",RL4,RL4,RL4,RL4,RL4,"RL1, RL4"


In [8]:
# Parse the golden standard annotations
annotator = sample['Golden'].apply(parse_annotator)
print(f"Parsed {len(annotator)} golden standard annotations")

# Evaluate each English model
print(f"Evaluating {len(sample.columns[5:])} English models")
for column in sample.columns[5:]:
    print(f"\n\n -----------------\nEvaluating {column}")
    predictions = sample[column].apply(parse_RL_category_en)

    # Evaluate predictions against golden standard
    evaluate_predictions(
        annotator.tolist(),
        predictions.tolist(),
        true_label_source_name="Golden Standard",
        pred_label_source_name=column
    )

print("\n\nEvaluation completed!")

Parsed 483 golden standard annotations
Evaluating 15 English models


 -----------------
Evaluating gpt41_zero_shot_en


Classes considered for this evaluation: [1 2 3 4 5 6 7]
Number of samples after filtering non-annotated: 343

--- F1 Scores ---
F1 Score (micro): 0.7130
F1 Score (macro): 0.6365
F1 Score (weighted): 0.7048
F1 Score (samples): 0.6570
--------------------

--- Classification Report ---
              precision    recall  f1-score   support

         RL1       0.79      0.61      0.69       157
         RL2       0.58      0.70      0.63        37
         RL3       0.68      0.81      0.74        75
         RL4       0.91      0.86      0.88        92
         RL5       0.67      0.40      0.50        35
         RL6       0.83      0.60      0.70        50
         RL7       0.75      0.20      0.32        15

   micro avg       0.76      0.67      0.71       461
   macro avg       0.74      0.60      0.64       461
weighted avg       0.77      0.67      0.70       46

## 6. Summary and Analysis

In [9]:
# Display summary statistics
print("Data Summary:")
print(f"Total samples: {len(df)}")
print(f"Number of models evaluated: {len([col for col in df.columns[5:] if not col.endswith('en')])}")

Data Summary:
Total samples: 483
Number of models evaluated: 0
