In [None]:
# import required libraries
import glob
import json
import pandas as pd
import numpy as np
from typing import Dict, List, Any
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from collections import defaultdict
from scipy.stats import chi2
from statsmodels.stats.contingency_tables import mcnemar

from utils_baseline import *
from utils_experiments import *
# from utils_results import *

# MAMI

In [None]:
# MAMI dataset configuration.
# The dataset files are not shipped with this repository (privacy restrictions);
# request them from the original dataset creators and fill in the paths below.

# Local file locations -- adjust to your own setup
mami_training_data = ""  # training split
mami_dev_data = ""  # development split
mami_test_data = ""  # test split

# Read the three splits (index-oriented JSON) in train / dev / test order
mami_training_df, mami_dev_df, mami_test_df = (
    pd.read_json(split_path, orient='index')
    for split_path in (mami_training_data, mami_dev_data, mami_test_data)
)

# MAMI: the development split is folded into the training data
mami_training_df = pd.concat([mami_training_df, mami_dev_df]).sort_index()

# Show which columns each frame provides
print("Training data columns:", mami_training_df.columns)
print("Test data columns:", mami_test_df.columns)

# Gold label files for the MAMI test set
gold_test_txt = ""  # text gold labels (txt)
gold_test_bin = ""  # binary classification gold labels (json)
gold_test_ml = ""  # multi-label classification gold labels (json)

## Binary Classification: Misogynous vs. non-misogynous

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """Custom JSON encoder for numpy arrays and numpy scalar types.

    The standard ``json`` encoder cannot serialize numpy objects, so
    ``default`` converts them to built-in Python equivalents:

    * ``np.ndarray``  -> (nested) ``list``
    * ``np.bool_``    -> ``bool``
    * ``np.integer``  -> ``int``
    * ``np.floating`` -> ``float``

    Every other object is delegated to ``json.JSONEncoder.default``, which
    raises ``TypeError`` for unsupported types.
    """
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # np.bool_ is not a subclass of np.integer, so it needs its own branch;
        # without it json.dump raises TypeError on numpy booleans.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)
        
def run_baseline_with_save(dataset_name, training_df, test_df, binary_label, label_names):
    """
    Run baseline model and save results in the same format as ablation experiments.

    The texts in the 'svm representation' column are the model input and the
    `binary_label` column is the target. After training and prediction the
    function computes accuracy, macro precision/recall/F1 and a per-label
    classification report, and dumps everything (including the predictions
    and the true labels) to
    evaluation/results/binary/SVM/svm_baseline_bow_<dataset_name>_bin_baseline_results.json.

    Parameters:
    -----------
    dataset_name : str
        Dataset name ('MAMI' or 'EXIST2024'). Any value other than 'MAMI'
        selects the EXIST2024 gold label paths.
    training_df : DataFrame
        Training data; must contain 'svm representation' and `binary_label`
    test_df : DataFrame
        Test data; must contain 'svm representation' and `binary_label`
    binary_label : str
        Binary label column name
    label_names : list
        List of label names, used as `target_names` of the classification report

    Returns:
    --------
    dict : Baseline results dictionary with the keys 'binary_f1', 'macro_f1',
        'accuracy', 'precision_macro', 'recall_macro', 'per_label_metrics',
        'predictions', 'true_labels' and 'prediction_files'

    Notes:
    ------
    The gold label paths set inside this function are empty placeholders and
    have to be filled in before the evaluation helpers can read them.
    NOTE(review): accuracy_score / precision_score / recall_score / f1_score
    and the helpers (build_bin_classifier, classify_data, save_evaluation,
    evaluate_f1_scores, evaluate_binary_classification) are not imported
    explicitly in this notebook; presumably they come from the utils_* star
    imports -- confirm.
    """
    
    print(f"\n🎯 RUNNING BASELINE FOR {dataset_name}")
    print("=" * 50)
    
    # Data preparation
    X_train = training_df['svm representation'].tolist()
    y_train = training_df[binary_label].tolist()
    X_test = test_df['svm representation'].tolist()
    y_test = test_df[binary_label].tolist()
    
    # Set model name (also used in the name of the results file)
    model_name = "svm_baseline_bow"
    
    # Create classifier
    svm_model, vec = build_bin_classifier(X_train, y_train)
    
    # Make predictions on test data
    y_pred_svm = classify_data(X_test, svm_model, vec)
    
    # Set evaluation parameters
    evaluation_type = 'binary'
    
    # Set gold standard file paths
    if dataset_name == "MAMI":
        gold_test_bin = "" # Path to MAMI binary classification gold labels (json)
        gold_test_txt = "" # Path to MAMI text gold labels (txt)
    else:  # EXIST2024
        gold_test_bin = "" # Path to EXIST2024 binary classification gold labels (json)
        gold_test_txt = "" # Path to EXIST2024 text gold labels (txt)
    
    # Save prediction results; the two returned values are stored below as the
    # 'json' and 'txt' prediction files
    test_pred_json, test_pred_txt = save_evaluation(
        test_df, "evaluation/predictions", dataset_name, "test", 
        evaluation_type, model_name, y_pred_svm, binary_label, []
    )
    
    # Calculate all metrics. zero_division=0 matches the classification report
    # below: a class without predicted samples scores 0 instead of warning.
    accuracy = accuracy_score(y_test, y_pred_svm)
    precision_macro = precision_score(y_test, y_pred_svm, average='macro', zero_division=0)
    recall_macro = recall_score(y_test, y_pred_svm, average='macro', zero_division=0)
    f1_macro = f1_score(y_test, y_pred_svm, average='macro', zero_division=0)
    
    # Get detailed classification report
    class_report_dict = classification_report(
        y_test, y_pred_svm, 
        target_names=label_names,
        zero_division=0,
        digits=3,
        output_dict=True
    )
    
    # Calculate binary F1 score from the gold and prediction txt files
    binary_f1 = evaluate_f1_scores(gold_test_txt, test_pred_txt, 2)
    
    # Save baseline results to JSON file WITH PREDICTIONS
    baseline_results = {
        'binary_f1': binary_f1,
        'macro_f1': f1_macro,
        'accuracy': accuracy,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'per_label_metrics': class_report_dict,
        # np.asarray makes this work for numpy arrays as well as plain lists
        'predictions': np.asarray(y_pred_svm).tolist(), 
        'true_labels': y_test,  
        'prediction_files': {
            'json': test_pred_json,
            'txt': test_pred_txt
        }
    }
    
    # Create results directory
    os.makedirs("evaluation/results/binary/SVM", exist_ok=True)
    
    # Save baseline results with updated filename format
    baseline_results_file = f"evaluation/results/binary/SVM/{model_name}_{dataset_name}_bin_baseline_results.json"
    
    with open(baseline_results_file, 'w') as f:
        json.dump(baseline_results, f, indent=2, cls=NumpyEncoder)
    
    print(f"✅ Enhanced baseline results saved to: {baseline_results_file}")
    
    # Display evaluation results
    evaluate_binary_classification(
        gold_test_bin, test_pred_json,
        y_test, y_pred_svm,
        gold_test_txt, test_pred_txt,
        label_names,
        model_name="SVM baseline"
    )
    
    return baseline_results

In [None]:
# MAMI: binary baseline (misogynous vs. non-misogynous)
print("🚀 Running MAMI baseline with save...")
mami_baseline_results = run_baseline_with_save(
    dataset_name="MAMI",
    training_df=mami_training_df,
    test_df=mami_test_df,
    binary_label="misogynous",
    label_names=["non-misogynous", "misogynous"],
)

## Multi-label Classification: Misogyny Categorization

### Hierarchical classification: only fine-grained categories

In [None]:
def run_multilabel_baseline_with_predictions(dataset_name, training_df, test_df, binary_label, fine_grained_labels):
    """
    Run the hierarchical multi-label SVM baseline, evaluate it and store the results.

    Two training sets are prepared from the 'svm representation' column: all
    training texts with their `binary_label`, and only the rows whose
    `binary_label` equals 1 together with their `fine_grained_labels` columns.
    Both are handed to `build_hierarchical_multilabel_classifier`, which also
    returns the test predictions.

    Parameters:
    -----------
    dataset_name : str
        Dataset name; 'MAMI' selects the MAMI gold label paths, anything else
        the EXIST2024 ones
    training_df : DataFrame
        Training data with 'svm representation', `binary_label` and the
        fine-grained label columns
    test_df : DataFrame
        Test data with the same columns
    binary_label : str
        Binary label column name
    fine_grained_labels : list
        Fine-grained category column names

    Returns:
    --------
    The metrics object returned by `evaluate_multilabel_classification`,
    extended with the keys 'predictions', 'true_labels', 'prediction_files'
    and 'all_labels'. The same object is written to
    evaluation/results/multi-label/SVM/fine-grained/svm_baseline_bow_hierarchy_<dataset_name>_results.json.
    """

    print(f"\n🎯 RUNNING MULTILABEL BASELINE FOR {dataset_name} (WITH PREDICTIONS)")
    print("=" * 60)

    # Binary level: every training text with its binary target
    train_texts = training_df["svm representation"].tolist()
    train_binary_targets = training_df[binary_label].tolist()

    # Category level: only the training rows that are positive on the binary label
    positive_rows = training_df.loc[training_df[binary_label] == 1]
    positive_texts = positive_rows["svm representation"].tolist()
    positive_category_targets = positive_rows[fine_grained_labels]

    test_texts = test_df["svm representation"].tolist()

    # Evaluation uses the binary label followed by the fine-grained ones
    all_labels = [binary_label] + fine_grained_labels
    gold_label_frame = test_df[all_labels]

    # Fit both levels and predict the test set; the fitted models and
    # vectorizers are returned as well but not used any further here
    predicted_label_frame, _bin_clf, _bin_vec, _ml_model, _ml_vec = build_hierarchical_multilabel_classifier(
        train_texts, train_binary_targets, positive_texts, positive_category_targets,
        test_texts, binary_label, fine_grained_labels
    )

    model_name = "svm_baseline_bow_hierarchy"
    evaluation_type = "hierarchical"

    # Gold label files (placeholders -- fill in before running)
    if dataset_name != "MAMI":  # EXIST2024
        gold_test_ml = "" # Path to gold label of EXIST2024 for multi-label classification (json file) 
        gold_test_txt = "" # Path to gold label of EXIST2024 (txt file)
    else:
        gold_test_ml = "" # Path to gold label of MAMI for multi-label classification (json file) 
        gold_test_txt = "" # Path to gold label of MAMI (txt file)

    # Store the predictions; the two return values are kept as the json / txt prediction files
    test_pred_json_ml, test_pred_txt_ml = save_evaluation(
        test_df, "evaluation/predictions", dataset_name, "test", evaluation_type,
        model_name, predicted_label_frame, binary_label, all_labels
    )

    # Evaluation metrics for the hierarchical setup
    baseline_metrics = evaluate_multilabel_classification(
        gold_test_ml, test_pred_json_ml,
        gold_label_frame, predicted_label_frame.to_numpy(),
        gold_test_txt, test_pred_txt_ml,
        all_labels, hierarchy=True
    )

    # Attach predictions, gold labels and bookkeeping information to the metrics
    baseline_metrics.update({
        'predictions': predicted_label_frame.to_numpy().tolist(),
        'true_labels': gold_label_frame.values.tolist(),
        'prediction_files': {
            'json': test_pred_json_ml,
            'txt': test_pred_txt_ml
        },
        'all_labels': all_labels,
    })

    # Write everything to the fine-grained multi-label results folder
    os.makedirs("evaluation/results/multi-label/SVM/fine-grained", exist_ok=True)
    baseline_results_file = f"evaluation/results/multi-label/SVM/fine-grained/{model_name}_{dataset_name}_results.json"

    with open(baseline_results_file, 'w') as results_handle:
        json.dump(baseline_metrics, results_handle, indent=2, cls=NumpyEncoder)

    print(f"✅ Multilabel baseline results with predictions saved to: {baseline_results_file}")

    return baseline_metrics
    

# MAMI: hierarchical multi-label baseline over the four fine-grained categories
print("🚀 Running MAMI multilabel baseline with predictions...")
mami_multilabel_baseline = run_multilabel_baseline_with_predictions(
    dataset_name="MAMI",
    training_df=mami_training_df,
    test_df=mami_test_df,
    binary_label="misogynous",
    fine_grained_labels=["shaming", "stereotype", "objectification", "violence"],
)

# EXIST 

In [None]:
# file paths
# Note: the paths below are empty placeholders; update them according to
# your local setup before running this cell.

exist_training_data = "" # Path to training data
# The development path had no value at all, which made the whole cell a
# SyntaxError; it now uses the same empty placeholder as the other paths.
exist_dev_data = "" # Path to development data
exist_test_data = ""  # Path to test data

exist_training_df = pd.read_json(exist_training_data,orient='index')
exist_dev_df = pd.read_json(exist_dev_data,orient='index')
exist_test_df = pd.read_json(exist_test_data,orient='index')

# EXIST2024: combine training and dev data as training data
exist_training_df = pd.concat([exist_training_df, exist_dev_df]).sort_index()

# Check available columns
print("Training data columns:", exist_training_df.columns)
print("Test data columns:", exist_test_df.columns)


#gold labels 
# These names are the same ones the MAMI configuration cell assigns, so
# running this cell replaces the MAMI gold label paths at module level.
gold_test_txt = "" # Path to EXIST2024 text gold label (txt)
gold_test_bin = "" # Path to EXIST2024 binary classification gold labels (json) 
gold_test_ml =  "" # Path to EXIST2024 multi-label classification gold labels (json)  

## Binary Classification: Sexist vs. non-sexist

In [None]:
# EXIST2024: binary baseline (sexist vs. non-sexist)
print("🚀 Running EXIST baseline with save...")
exist_baseline_results = run_baseline_with_save(
    dataset_name="EXIST2024",
    training_df=exist_training_df,
    test_df=exist_test_df,
    binary_label="sexist",
    label_names=["non-sexist", "sexist"],
)

## Multi-label Classification: Sexism Categorization

### Hierarchical classification: only fine-grained categories

In [None]:

# EXIST2024: hierarchical multi-label baseline over the five fine-grained categories
print("🚀 Running EXIST2024 multilabel baseline with predictions...")
exist_multilabel_baseline = run_multilabel_baseline_with_predictions(
    dataset_name="EXIST2024",
    training_df=exist_training_df,
    test_df=exist_test_df,
    binary_label="sexist",
    fine_grained_labels=[
        "ideological-inequality",
        "stereotyping-dominance",
        "objectification",
        "sexual-violence",
        "misogyny-non-sexual-violence",
    ],
)

# Ablation Experiments

### Binary

In [None]:
def save_random_including_results(y_test, y_pred_random_including, dataset_name, feature_type, ablation_method):
    """
    Score the "random including" ablation predictions and write the scores to JSON.

    Accuracy, macro precision / recall / F1 and a per-label classification
    report are computed and stored in
    evaluation/results/binary/SVM/random_including_<feature_type>_<ablation_method>_<dataset_name lowercased>_bin_results.json.
    Nothing is returned.

    Parameters:
    -----------
    y_test : list
        True test labels
    y_pred_random_including : array
        Predictions from the random including ablation model
    dataset_name : str
        Dataset name; 'MAMI' selects the misogyny label names, any other
        value the sexism label names (EXIST2024)
    feature_type : str
        Feature type being ablated (e.g., 'sentiment_pos', 'sentiment_neg', 'hate', 'function');
        only used in the file name
    ablation_method : str
        Ablation method ('mask' or 'remove'); only used in the file name
    """
    gold, predicted = y_test, y_pred_random_including

    # Overall accuracy plus macro-averaged precision / recall / F1
    acc = accuracy_score(gold, predicted)
    macro_precision = precision_score(gold, predicted, average='macro')
    macro_recall = recall_score(gold, predicted, average='macro')
    macro_f1 = f1_score(gold, predicted, average='macro')

    # Display names of the two classes depend on the dataset
    label_names = (
        ["non-misogynous", "misogynous"]
        if dataset_name == "MAMI"
        else ["non-sexist", "sexist"]  # EXIST2024
    )

    # Per-label breakdown as a nested dict
    per_label_report = classification_report(
        gold,
        predicted,
        target_names=label_names,
        zero_division=0,
        digits=3,
        output_dict=True,
    )

    # Same layout as the feature ablation results; the macro F1 is stored
    # under 'binary_f1' as well
    random_results = dict(
        binary_f1=macro_f1,
        macro_f1=macro_f1,
        accuracy=acc,
        precision_macro=macro_precision,
        recall_macro=macro_recall,
        per_label_metrics=per_label_report,
    )

    # File name follows the pattern used for the feature ablation results
    model_name = f"random_including_{feature_type}_{ablation_method}_{dataset_name.lower()}"
    results_file = f"evaluation/results/binary/SVM/{model_name}_bin_results.json"
    os.makedirs("evaluation/results/binary/SVM", exist_ok=True)

    # NumpyEncoder converts numpy values the json module cannot serialize itself
    with open(results_file, 'w') as out_handle:
        json.dump(random_results, out_handle, indent=2, cls=NumpyEncoder)

    print(f"✅ Random including results saved to: {results_file}")


def run_coarse_binary_ablation_experiment(dataset_df_train, dataset_df_test, feature_type, 
                                        ablation_method, dataset_name, binary_label='misogynous'):
    """
    Run a coarse-grained ablation experiment with three comparison conditions:
    1. Feature ablation (remove/mask specific feature category)
    2. Random ablation excluding feature words (remove/mask same number of random words, but not the feature words)
    3. Random ablation including feature words (remove/mask same number of random words, potentially including feature words)
    
    For function words, comparison is to random words.
    For sentiment/hate speech terms, comparison is to random content words (non-function words).
    
    Parameters:
    -----------
    dataset_df_train : DataFrame
        Training dataset
    dataset_df_test : DataFrame  
        Test dataset
    feature_type : str
        Type of feature:
        - 'sentiment_pos': only positive sentiment words
        - 'sentiment_neg': only negative sentiment words
        - 'hate': all hate speech terms as a group
        - 'function': all function words as a group
    ablation_method : str
        'mask' or 'remove'
    dataset_name : str
        Name of dataset (e.g., 'MAMI')
    binary_label : str
        Column for binary classification
    """
    # Load necessary lexicons
    # Note: Lexicon files are not included in this repository. 
    # Please download them from their respective sources before running this code.
    nrc_lexicon = load_nrc_lexicon("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    function_words_dict = load_function_words()
    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv") 
    
    # Get original texts
    X_train_orig = dataset_df_train["svm representation"].tolist()
    X_test_orig = dataset_df_test["svm representation"].tolist()
    y_train = dataset_df_train[binary_label].tolist()
    y_test = dataset_df_test[binary_label].tolist()
    
    # Prepare containers for different ablation conditions
    X_train_feature_ablated = []            # Feature words ablated
    X_train_random_excluding_ablated = []   # Random words ablated (excluding feature words)
    X_train_random_including_ablated = []   # Random words ablated (including feature words)
    
    X_test_feature_ablated = []
    X_test_random_excluding_ablated = []
    X_test_random_including_ablated = []
    
    print(f"Running coarse-grained ablation experiment for {feature_type}...")
    print("1. Feature ablation")
    if feature_type in ['sentiment_pos', 'sentiment_neg', 'hate']:
        print("2. Random content word ablation excluding feature words")
        print("3. Random content word ablation including feature words")
        random_comparison_type = "content words"
    else:
        print("2. Random word ablation excluding feature words")
        print("3. Random word ablation including feature words")
        random_comparison_type = "words"
    
    # Training set statistics
    train_total_feature_words_processed = 0
    train_documents_with_features = 0
    train_feature_words_per_doc = []
    
    # Test set statistics  
    test_total_feature_words_processed = 0
    test_documents_with_features = 0
    test_feature_words_per_doc = []
    
    # Get all function words (needed for sentiment/hate comparisons)
    all_function_words = set()
    for category, words in function_words_dict.items():
        all_function_words.update(words)
    
    # Process training data
    for text in X_train_orig:
        # Extract feature words based on feature type
        feature_words = []
        if feature_type == 'sentiment_pos':
            # Extract only positive sentiment words
            feature_words = extract_sentiment_words(text, nrc_lexicon, 'positive')
            
        elif feature_type == 'sentiment_neg':
            # Extract only negative sentiment words
            feature_words = extract_sentiment_words(text, nrc_lexicon, 'negative')
            
        elif feature_type == 'hate':
            # Extract all hate speech terms (any category)
            hate_words = []
            for word in text.lower().split():
                if word in hurtlex_dict:
                    hate_words.append(word)
            feature_words = list(set(hate_words))
            
        elif feature_type == 'function':
            # Extract all function words from all categories
            function_words = []
            for category in function_words_dict.keys():
                category_words = extract_function_words(text, function_words_dict, category)
                function_words.extend(category_words)
            feature_words = list(set(function_words))
        
        # Convert to set for faster lookup
        feature_words_set = set(feature_words)
        
        # Count feature words in this text
        num_feature_words = len([word for word in text.lower().split() if word in feature_words_set])
        train_feature_words_per_doc.append(num_feature_words)
        
        if num_feature_words > 0:
            train_documents_with_features += 1
        
        # Determine random word sampling strategy based on feature type
        if feature_type in ['sentiment_pos', 'sentiment_neg', 'hate']:
            # For sentiment and hate speech, compare to random content words (excluding function words)
            # Get all non-function words in the text
            all_words = text.lower().split()
            content_words_indices = [i for i, word in enumerate(all_words) if word not in all_function_words]
            
            # Process with feature ablation
            if ablation_method == 'mask':
                feature_ablated_text, count = replace_mask_features(text, feature_words, mask_token='UNK')
                
                # 1. Random ablation EXCLUDING feature words
                random_excluding_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Get indices of content words that are NOT feature words
                    words_list = text.lower().split()
                    feature_word_indices = [i for i, word in enumerate(words_list) if word in feature_words_set]
                    non_feature_content_indices = [i for i in content_words_indices if i not in feature_word_indices]
                    
                    if non_feature_content_indices:
                        # Adjust number to mask if fewer eligible words available
                        num_to_mask = min(num_feature_words, len(non_feature_content_indices))
                        rand_indices = random.sample(non_feature_content_indices, num_to_mask)
                        
                        # Convert to list for modification
                        random_words = text.split()
                        for idx in rand_indices:
                            random_words[idx] = 'UNK'
                        random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING feature words
                random_including_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Adjust number to mask if fewer content words available
                    num_to_mask = min(num_feature_words, len(content_words_indices))
                    rand_indices = random.sample(content_words_indices, num_to_mask)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        random_words[idx] = 'UNK'
                    random_including_ablated_text = ' '.join(random_words)
            else:  # remove
                feature_ablated_text, count = remove_features(text, feature_words)
                
                # 1. Random ablation EXCLUDING feature words
                random_excluding_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Get indices of content words that are NOT feature words
                    words_list = text.lower().split()
                    feature_word_indices = [i for i, word in enumerate(words_list) if word in feature_words_set]
                    non_feature_content_indices = [i for i in content_words_indices if i not in feature_word_indices]
                    
                    if non_feature_content_indices:
                        # Adjust number to remove if fewer eligible words available
                        num_to_remove = min(num_feature_words, len(non_feature_content_indices))
                        rand_indices = random.sample(non_feature_content_indices, num_to_remove)
                        
                        # Sort indices in descending order to avoid index shifting when removing
                        rand_indices.sort(reverse=True)
                        
                        # Convert to list for modification
                        random_words = text.split()
                        for idx in rand_indices:
                            del random_words[idx]
                        random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING feature words
                random_including_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Adjust number to remove if fewer content words available
                    num_to_remove = min(num_feature_words, len(content_words_indices))
                    rand_indices = random.sample(content_words_indices, num_to_remove)
                    
                    # Sort indices in descending order to avoid index shifting when removing
                    rand_indices.sort(reverse=True)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        del random_words[idx]
                    random_including_ablated_text = ' '.join(random_words)
            
        else:  # function words
            # For function words, compare to random words from entire vocabulary
            if ablation_method == 'mask':
                feature_ablated_text, count = replace_mask_features(text, feature_words, mask_token='UNK')
                
                # 1. Random ablation EXCLUDING function words
                words_list = text.lower().split()
                non_function_indices = [i for i, word in enumerate(words_list) if word not in feature_words_set]
                
                random_excluding_ablated_text = text
                if non_function_indices and num_feature_words > 0:
                    # Adjust number to mask if fewer eligible words available
                    num_to_mask = min(num_feature_words, len(non_function_indices))
                    rand_indices = random.sample(non_function_indices, num_to_mask)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        random_words[idx] = 'UNK'
                    random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING all words
                random_including_ablated_text, _ = replace_mask_random_words(text, num_feature_words, mask_token='UNK')
            else:  # remove
                feature_ablated_text, count = remove_features(text, feature_words)
                
                # 1. Random ablation EXCLUDING function words
                words_list = text.lower().split()
                non_function_indices = [i for i, word in enumerate(words_list) if word not in feature_words_set]
                
                random_excluding_ablated_text = text
                if non_function_indices and num_feature_words > 0:
                    # Adjust number to remove if fewer eligible words available
                    num_to_remove = min(num_feature_words, len(non_function_indices))
                    rand_indices = random.sample(non_function_indices, num_to_remove)
                    
                    # Sort indices in descending order to avoid index shifting when removing
                    rand_indices.sort(reverse=True)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        del random_words[idx]
                    random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING all words
                random_including_ablated_text, _ = remove_random_words(text, num_feature_words)
        
        X_train_feature_ablated.append(feature_ablated_text)
        X_train_random_excluding_ablated.append(random_excluding_ablated_text)
        X_train_random_including_ablated.append(random_including_ablated_text)
        train_total_feature_words_processed += count
    
    # Process test data similarly
    for text in X_test_orig:
        # Extract feature words based on feature type
        feature_words = []
        if feature_type == 'sentiment_pos':
            # Extract only positive sentiment words
            feature_words = extract_sentiment_words(text, nrc_lexicon, 'positive')
            
        elif feature_type == 'sentiment_neg':
            # Extract only negative sentiment words
            feature_words = extract_sentiment_words(text, nrc_lexicon, 'negative')
            
        elif feature_type == 'hate':
            # Extract all hate speech terms (any category)
            hate_words = []
            for word in text.lower().split():
                if word in hurtlex_dict:
                    hate_words.append(word)
            feature_words = list(set(hate_words))
            
        elif feature_type == 'function':
            # Extract all function words from all categories
            function_words = []
            for category in function_words_dict.keys():
                category_words = extract_function_words(text, function_words_dict, category)
                function_words.extend(category_words)
            feature_words = list(set(function_words))
        
        # Convert to set for faster lookup
        feature_words_set = set(feature_words)
        
        # Count feature words in this text
        num_feature_words = len([word for word in text.lower().split() if word in feature_words_set])
        test_feature_words_per_doc.append(num_feature_words)
        
        if num_feature_words > 0:
            test_documents_with_features += 1
        
        # Determine random word sampling strategy based on feature type
        if feature_type in ['sentiment_pos', 'sentiment_neg', 'hate']:
            # For sentiment and hate speech, compare to random content words (excluding function words)
            # Get all non-function words in the text
            all_words = text.lower().split()
            content_words_indices = [i for i, word in enumerate(all_words) if word not in all_function_words]
            
            # Process with feature ablation
            if ablation_method == 'mask':
                feature_ablated_text, count = replace_mask_features(text, feature_words, mask_token='UNK')
                
                # 1. Random ablation EXCLUDING feature words
                random_excluding_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Get indices of content words that are NOT feature words
                    words_list = text.lower().split()
                    feature_word_indices = [i for i, word in enumerate(words_list) if word in feature_words_set]
                    non_feature_content_indices = [i for i in content_words_indices if i not in feature_word_indices]
                    
                    if non_feature_content_indices:
                        # Adjust number to mask if fewer eligible words available
                        num_to_mask = min(num_feature_words, len(non_feature_content_indices))
                        rand_indices = random.sample(non_feature_content_indices, num_to_mask)
                        
                        # Convert to list for modification
                        random_words = text.split()
                        for idx in rand_indices:
                            random_words[idx] = 'UNK'
                        random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING feature words
                random_including_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Adjust number to mask if fewer content words available
                    num_to_mask = min(num_feature_words, len(content_words_indices))
                    rand_indices = random.sample(content_words_indices, num_to_mask)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        random_words[idx] = 'UNK'
                    random_including_ablated_text = ' '.join(random_words)
            else:  # remove
                feature_ablated_text, count = remove_features(text, feature_words)
                
                # 1. Random ablation EXCLUDING feature words
                random_excluding_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Get indices of content words that are NOT feature words
                    words_list = text.lower().split()
                    feature_word_indices = [i for i, word in enumerate(words_list) if word in feature_words_set]
                    non_feature_content_indices = [i for i in content_words_indices if i not in feature_word_indices]
                    
                    if non_feature_content_indices:
                        # Adjust number to remove if fewer eligible words available
                        num_to_remove = min(num_feature_words, len(non_feature_content_indices))
                        rand_indices = random.sample(non_feature_content_indices, num_to_remove)
                        
                        # Sort indices in descending order to avoid index shifting when removing
                        rand_indices.sort(reverse=True)
                        
                        # Convert to list for modification
                        random_words = text.split()
                        for idx in rand_indices:
                            del random_words[idx]
                        random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING feature words
                random_including_ablated_text = text
                if content_words_indices and num_feature_words > 0:
                    # Adjust number to remove if fewer content words available
                    num_to_remove = min(num_feature_words, len(content_words_indices))
                    rand_indices = random.sample(content_words_indices, num_to_remove)
                    
                    # Sort indices in descending order to avoid index shifting when removing
                    rand_indices.sort(reverse=True)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        del random_words[idx]
                    random_including_ablated_text = ' '.join(random_words)
            
        else:  # function words
            # For function words, compare to random words from entire vocabulary
            if ablation_method == 'mask':
                feature_ablated_text, count = replace_mask_features(text, feature_words, mask_token='UNK')
                
                # 1. Random ablation EXCLUDING function words
                words_list = text.lower().split()
                non_function_indices = [i for i, word in enumerate(words_list) if word not in feature_words_set]
                
                random_excluding_ablated_text = text
                if non_function_indices and num_feature_words > 0:
                    # Adjust number to mask if fewer eligible words available
                    num_to_mask = min(num_feature_words, len(non_function_indices))
                    rand_indices = random.sample(non_function_indices, num_to_mask)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        random_words[idx] = 'UNK'
                    random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING all words
                random_including_ablated_text, _ = replace_mask_random_words(text, num_feature_words, mask_token='UNK')
            else:  # remove
                feature_ablated_text, count = remove_features(text, feature_words)
                
                # 1. Random ablation EXCLUDING function words
                words_list = text.lower().split()
                non_function_indices = [i for i, word in enumerate(words_list) if word not in feature_words_set]
                
                random_excluding_ablated_text = text
                if non_function_indices and num_feature_words > 0:
                    # Adjust number to remove if fewer eligible words available
                    num_to_remove = min(num_feature_words, len(non_function_indices))
                    rand_indices = random.sample(non_function_indices, num_to_remove)
                    
                    # Sort indices in descending order to avoid index shifting when removing
                    rand_indices.sort(reverse=True)
                    
                    # Convert to list for modification
                    random_words = text.split()
                    for idx in rand_indices:
                        del random_words[idx]
                    random_excluding_ablated_text = ' '.join(random_words)
                
                # 2. Random ablation INCLUDING all words
                random_including_ablated_text, _ = remove_random_words(text, num_feature_words)
        
        X_test_feature_ablated.append(feature_ablated_text)
        X_test_random_including_ablated.append(random_including_ablated_text)
        test_total_feature_words_processed += count
    
    # Summary statistics
    print(f"\n{'=' * 50}")
    print(f"Processing Summary:")
    print(f"Training set:")
    print(f"  Total documents: {len(X_train_feature_ablated)}")
    print(f"  Documents with features: {train_documents_with_features}")
    print(f"  Total feature words processed: {train_total_feature_words_processed}")
    print(f"  Average feature words per document: {np.mean(train_feature_words_per_doc):.2f}")
    
    print(f"Test set:")
    print(f"  Total documents: {len(X_test_feature_ablated)}")
    print(f"  Documents with features: {test_documents_with_features}")
    print(f"  Total feature words processed: {test_total_feature_words_processed}")
    print(f"  Average feature words per document: {np.mean(test_feature_words_per_doc):.2f}")

    print(f"{'=' * 50}")
    
    # Set up model names
    feature_model_name = f"{feature_type}_{ablation_method}_{dataset_name.lower()}"
    random_including_model_name = f"random_including_{feature_type}_{ablation_method}_{dataset_name.lower()}"

    # Set up gold standard file paths based on dataset
    if dataset_name == "MAMI":
        gold_test_bin = "" # Path to MAMI binary classification gold labels (json)
        gold_test_txt = "" # Path to MAMI text gold labels (txt)
        label_names = ["non-misogynous", "misogynous"]
    else:  # EXIST2024
        gold_test_bin = "" # Path to EXIST2024 binary classification gold labels (json)
        gold_test_txt = "" # Path to EXIST2024 text gold labels (txt)
        label_names = ["non-sexist", "sexist"]
    

    
    # Train model on feature-ablated data
    print("Training feature ablation model...")
    feature_model, feature_vec = build_bin_classifier(X_train_feature_ablated, y_train)
    y_pred_feature = classify_data(X_test_feature_ablated, feature_model, feature_vec)
    

    
    # Train model on random-ablated data (including feature words)
    print("Training random ablation model (including feature words)...")
    random_including_model, random_including_vec = build_bin_classifier(X_train_random_including_ablated, y_train)
    y_pred_random_including = classify_data(X_test_random_including_ablated, random_including_model, random_including_vec)
    
    # Create files with predictions for all three models
    # 1. Feature ablation
    print(f"\n{'='*20} Feature Ablation Results {'='*20}")
    test_pred_json_feature, test_pred_txt_feature = save_evaluation(
        dataset_df_test, "evaluation/predictions", dataset_name, "test", "binary", 
        feature_model_name, y_pred_feature, binary_label, []
    )
    feature_metrics = evaluate_binary_classification(
        gold_test_bin, test_pred_json_feature, y_test, y_pred_feature, 
        gold_test_txt, test_pred_txt_feature, label_names, model_name=feature_model_name
    )
    
    # 2. Random ablation - Excluding feature words
    print(f"\n{'='*20} Random Ablation - EXCLUDING feature words ({random_comparison_type}) Results {'='*20}")
    test_pred_json_random_excluding, test_pred_txt_random_excluding = save_evaluation(
        dataset_df_test, "evaluation/predictions", dataset_name, "test", "binary", 
        random_excluding_model_name, y_pred_random_excluding, binary_label, []
    )
    random_excluding_metrics = evaluate_binary_classification(
        gold_test_bin, test_pred_json_random_excluding, y_test, y_pred_random_excluding, 
        gold_test_txt, test_pred_txt_random_excluding, label_names, model_name=random_excluding_model_name
    )
    
    # 3. Random ablation - Including feature words
    print(f"\n{'='*20} Random Ablation - INCLUDING feature words ({random_comparison_type}) Results {'='*20}")
    test_pred_json_random_including, test_pred_txt_random_including = save_evaluation(
        dataset_df_test, "evaluation/predictions", dataset_name, "test", "binary", 
        random_including_model_name, y_pred_random_including, binary_label, []
    )
    random_including_metrics = evaluate_binary_classification(
        gold_test_bin, test_pred_json_random_including, y_test, y_pred_random_including, 
        gold_test_txt, test_pred_txt_random_including, label_names, model_name=random_including_model_name
    )
    
    # Calculate metrics for feature ablation model
    accuracy = accuracy_score(y_test, y_pred_feature)
    precision_macro = precision_score(y_test, y_pred_feature, average='macro')
    recall_macro = recall_score(y_test, y_pred_feature, average='macro')
    f1_macro = f1_score(y_test, y_pred_feature, average='macro')
    
    # Get classification report as dictionary
    class_report_dict = classification_report(y_test, y_pred_feature, 
                                            target_names=label_names, 
                                            zero_division=0, digits=3, 
                                            output_dict=True)
    
    # Calculate binary F1 score (MAMI evaluation metric)
    binary_f1 = evaluate_f1_scores(gold_test_txt, test_pred_txt_feature, 2)
    
    # Create structured results dictionary - ONLY for feature ablation
    results = {
        'feature_type': feature_type,
        'ablation_method': ablation_method,
        'dataset_name': dataset_name,
        'binary_label': binary_label,
        'feature_ablation': {
            'model_name': feature_model_name,
            'binary_f1': binary_f1,
            'macro_f1': f1_macro,
            'accuracy': accuracy,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'per_label_metrics': class_report_dict,
            'prediction_files': {
                'json': test_pred_json_feature,
                'txt': test_pred_txt_feature
            }
        },
        'train_statistics': {
            'total_documents': len(X_train_feature_ablated),
            'documents_with_features': train_documents_with_features,
            'total_feature_words_processed': train_total_feature_words_processed,
            'avg_feature_words_per_doc': np.mean(train_feature_words_per_doc),
            'max_feature_words_per_doc': max(train_feature_words_per_doc) if train_feature_words_per_doc else 0,
            'feature_coverage_percentage': train_documents_with_features/len(X_train_feature_ablated)*100
        },
        'test_statistics': {
            'total_documents': len(X_test_feature_ablated),
            'documents_with_features': test_documents_with_features,
            'total_feature_words_processed': test_total_feature_words_processed,
            'avg_feature_words_per_doc': np.mean(test_feature_words_per_doc),
            'max_feature_words_per_doc': max(test_feature_words_per_doc) if test_feature_words_per_doc else 0,
            'feature_coverage_percentage': test_documents_with_features/len(X_test_feature_ablated)*100
        }
    }

    
    
    # Save feature ablation results to JSON file for later analysis
    os.makedirs("evaluation/results/binary/SVM", exist_ok=True)
    results_file = f"evaluation/results/binary/SVM/{feature_model_name}_bin_results.json"
    
    # Create the format expected by analyze_coarse_binary_ablation_drops
    save_results = {
        'binary_f1': binary_f1,
        'macro_f1': f1_macro,
        'accuracy': accuracy,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'per_label_metrics': class_report_dict,
        'predictions': y_pred_feature.tolist(),  # Save actual predictions
        'true_labels': y_test  # Save true labels for reference
    }
    
    with open(results_file, 'w') as f:
        json.dump(save_results, f, indent=2, cls=NumpyEncoder)
    
    print(f"✅ Feature ablation results saved to: {results_file}")


    # Save random including results for comparison
    print(f"💾 Saving random including results...")
    save_random_including_results(y_test, y_pred_random_including, dataset_name, feature_type, ablation_method)
    
    
    return results

def run_binary_coarse_grained_experiments_without_baseline():
    """
    Run all coarse-grained binary ablation experiments for both datasets (MAMI and EXIST2024).

    Every combination of feature type ('sentiment_pos', 'sentiment_neg', 'hate',
    'function') and ablation method ('mask', 'remove') is delegated to
    ``run_coarse_binary_ablation_experiment``; no baseline comparison is run here.

    The function reads the module-level DataFrames ``mami_training_df``,
    ``mami_test_df``, ``exist_training_df`` and ``exist_test_df``, which must have
    been defined by earlier notebook cells.

    Returns:
    --------
    dict : ``{'MAMI': {...}, 'EXIST2024': {...}}`` where each dataset maps
        feature type -> ablation method -> the result returned by
        ``run_coarse_binary_ablation_experiment``.
    """
    print("=" * 80)
    print("RUNNING COARSE-GRAINED BINARY ABLATION EXPERIMENTS (WITHOUT BASELINE)")
    print("=" * 80)
    
    # Define feature types and ablation methods
    feature_types = ['sentiment_pos', 'sentiment_neg', 'hate', 'function']
    ablation_methods = ['mask', 'remove']
    
    # Storage for results
    all_results = {
        'MAMI': {},
        'EXIST2024': {}
    }
    
    # MAMI Dataset Experiments
    print("\n" + "=" * 50)
    print("MAMI DATASET EXPERIMENTS")
    print("=" * 50)
    
    # MAMI Binary Classification Experiments (binary_label left at the callee's default)
    for feature_type in feature_types:
        all_results['MAMI'][feature_type] = {}
        for ablation_method in ablation_methods:
            print(f"\n{'='*20} MAMI Binary: {feature_type.capitalize()} with {ablation_method} {'='*20}")
            results = run_coarse_binary_ablation_experiment(
                mami_training_df, mami_test_df, feature_type, ablation_method, "MAMI"
            )
            all_results['MAMI'][feature_type][ablation_method] = results
    
    # EXIST2024 Dataset Experiments
    print("\n" + "=" * 50)
    print("EXIST DATASET EXPERIMENTS")
    print("=" * 50)
    
    # EXIST2024 Binary Classification Experiments (label column is 'sexist')
    for feature_type in feature_types:
        all_results['EXIST2024'][feature_type] = {}
        for ablation_method in ablation_methods:
            print(f"\n{'='*20} EXIST2024 Binary: {feature_type.capitalize()} with {ablation_method} {'='*20}")
            results = run_coarse_binary_ablation_experiment(
                exist_training_df, exist_test_df, feature_type, ablation_method, 
                "EXIST2024", binary_label='sexist'
            )
            all_results['EXIST2024'][feature_type][ablation_method] = results
    
    # Hand the collected results back; without this the caller would only ever see None.
    return all_results
    


# Run the full grid of coarse-grained binary ablation experiments for MAMI and EXIST2024.
# The function reads mami_training_df / mami_test_df and exist_training_df / exist_test_df
# from the notebook namespace, so the cells defining them must have been executed first.
# Whatever the function returns is bound to `all_results`.
all_results = run_binary_coarse_grained_experiments_without_baseline()

#### Binary ablation statistics

In [None]:
def extract_binary_category_features(text, category, nrc_lexicon, hurtlex_dict, function_words_dict):
    """Return the feature words of one binary ablation category found in ``text``.

    * 'sentiment_pos' / 'sentiment_neg': delegate to ``extract_sentiment_words``
      with the 'positive' / 'negative' label and return its result unchanged.
    * 'hate': lower-cased whitespace tokens of ``text`` that are keys of
      ``hurtlex_dict`` (any HurtLex category), de-duplicated.
    * 'function': the pooled output of ``extract_function_words`` over every key
      of ``function_words_dict``, de-duplicated.
    * anything else: an empty list.

    The de-duplicated lists come from a ``set``, so their order is unspecified.
    """
    # The two sentiment categories differ only in the polarity label passed on.
    if category == "sentiment_pos":
        polarity = 'positive'
    elif category == "sentiment_neg":
        polarity = 'negative'
    else:
        polarity = None

    if polarity is not None:
        return extract_sentiment_words(text, nrc_lexicon, polarity)

    if category == "hate":
        lexicon_hits = [token for token in text.lower().split() if token in hurtlex_dict]
        return list(set(lexicon_hits))

    if category == "function":
        pooled = []
        for group_name in function_words_dict.keys():
            pooled += extract_function_words(text, function_words_dict, group_name)
        return list(set(pooled))

    return []


def calculate_binary_ablation_statistics(dataset_df, feature_categories, dataset_name):
    """
    Calculate and print feature statistics for binary ablation categories.

    For each category every row of ``dataset_df`` is scanned and features are
    extracted separately from the lower-cased 'meme text', 'meme caption' and
    'svm representation' columns with ``extract_binary_category_features``.
    The per-row counts are the lengths of the lists that helper returns and are
    summed as-is: no extra de-duplication is applied across components or rows.
    NOTE: for 'hate' and 'function' the helper already returns each distinct
    word once per string, so those counts are distinct words per component,
    not token occurrences.

    Document-level flags ('documents_with_features', 'features_in_*') are
    derived from the text and caption components only; the combined column
    contributes solely to 'combined_feature_count' and the unique word set.

    Parameters:
    -----------
    dataset_df : DataFrame
        Dataset containing meme data; rows are read via ``iterrows()`` and
        ``row.get`` with the column names listed above (missing columns
        fall back to an empty string).
    feature_categories : list
        List of binary feature categories ['sentiment_pos', 'sentiment_neg', 'hate', 'function']
    dataset_name : str
        Name of dataset ('MAMI' or 'EXIST2024'); used only in printed output.
    
    Returns:
    --------
    dict : category -> statistics dict (document counts, feature counts,
        overlap counts, 'unique_feature_words' as a list, 'unique_feature_count'
        and 'coverage_percentage'). For an empty dataset the coverage is 0.0.
    """
    
    print(f"🔍 CALCULATING BINARY ABLATION STATISTICS FOR {dataset_name}")
    print("=" * 70)
    
    # Load linguistic resources
    nrc_lexicon = load_nrc_lexicon("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv")
    function_words_dict = load_function_words()
    
    results = {}
    total_documents = len(dataset_df)

    for category in feature_categories:
        print(f"\n📊 ANALYZING: {category.upper()}")
        print("-" * 50)
        
        # Initialize counters
        stats = {
            'total_documents': total_documents,
            'documents_with_features': 0,
            'features_in_text_only': 0,
            'features_in_caption_only': 0,
            'features_in_both': 0,
            'features_in_neither': 0,
            'text_feature_count': 0,
            'caption_feature_count': 0,
            'combined_feature_count': 0,
            'overlapping_instances_count': 0,  # Documents where same word appears in both text and caption
            'unique_feature_words': set(),
            'coverage_percentage': 0.0,
            'unique_feature_count': 0,
            'total_overlapping_features': 0  # Total count of overlapping feature instances
        }
        
        for idx, row in dataset_df.iterrows():
            # Get text components
            meme_text = str(row.get('meme text', '')).lower()
            meme_caption = str(row.get('meme caption', '')).lower()
            combined_text = str(row.get('svm representation', '')).lower()

            # Extract features from each component
            text_features = extract_binary_category_features(meme_text, category, nrc_lexicon, hurtlex_dict, function_words_dict)
            caption_features = extract_binary_category_features(meme_caption, category, nrc_lexicon, hurtlex_dict, function_words_dict)
            combined_features = extract_binary_category_features(combined_text, category, nrc_lexicon, hurtlex_dict, function_words_dict)
            
            # Update unique words (from all sources)
            stats['unique_feature_words'].update(text_features)
            stats['unique_feature_words'].update(caption_features)
            stats['unique_feature_words'].update(combined_features)
            
            # Per-row counts are the lengths of the extractor's output lists
            text_count = len(text_features)
            caption_count = len(caption_features)
            combined_count = len(combined_features)  # Count taken from the 'svm representation' column
            
            stats['text_feature_count'] += text_count
            stats['caption_feature_count'] += caption_count
            stats['combined_feature_count'] += combined_count
            
            # Check for overlapping features (for analysis purposes only)
            overlapping_words = set(text_features).intersection(caption_features)
            
            if overlapping_words:
                stats['overlapping_instances_count'] += 1
                # Count how many times overlapping words appear in either list
                stats['total_overlapping_features'] += sum(
                    text_features.count(word) + caption_features.count(word)
                    for word in overlapping_words
                )

            # Categorize documents based on where features appear
            has_text_features = text_count > 0
            has_caption_features = caption_count > 0
            
            if has_text_features or has_caption_features:
                stats['documents_with_features'] += 1
                
            if has_text_features and has_caption_features:
                stats['features_in_both'] += 1
            elif has_text_features:
                stats['features_in_text_only'] += 1
            elif has_caption_features:
                stats['features_in_caption_only'] += 1
            else:
                stats['features_in_neither'] += 1
        
        # Calculate final statistics; an empty dataset has 0% coverage instead of
        # raising ZeroDivisionError.
        if total_documents:
            stats['coverage_percentage'] = (stats['documents_with_features'] / total_documents) * 100
        else:
            stats['coverage_percentage'] = 0.0
        stats['unique_feature_count'] = len(stats['unique_feature_words'])
        stats['unique_feature_words'] = list(stats['unique_feature_words'])  # Convert to list for JSON
        
        # Display results with proper explanation
        print(f"📋 DOCUMENT COVERAGE:")
        print(f"   • Total documents: {stats['total_documents']}")
        print(f"   • Documents with features: {stats['documents_with_features']} ({stats['coverage_percentage']:.1f}%)")
        print(f"   • Features in text only: {stats['features_in_text_only']}")
        print(f"   • Features in caption only: {stats['features_in_caption_only']}")
        print(f"   • Features in both: {stats['features_in_both']}")
        print(f"   • Features in neither: {stats['features_in_neither']}")
        
        print(f"\n📁 FEATURE WORD COUNTS (RAW COUNTS):")
        print(f"   • Text feature occurrences: {stats['text_feature_count']}")
        print(f"   • Caption feature occurrences: {stats['caption_feature_count']}")
        print(f"   • Combined feature occurrences: {stats['combined_feature_count']}")
        print(f"   • Unique feature words: {stats['unique_feature_count']}")
        
        # Analyze the relationship between text+caption and combined
        expected_sum = stats['text_feature_count'] + stats['caption_feature_count']
        actual_difference = expected_sum - stats['combined_feature_count']
        
        print(f"\n📊 COUNTING ANALYSIS:")
        print(f"   • Text + Caption sum: {stats['text_feature_count']} + {stats['caption_feature_count']} = {expected_sum}")
        print(f"   • Actual combined count: {stats['combined_feature_count']}")
        
        if actual_difference > 0:
            print(f"   • Difference: -{actual_difference} (combined has fewer)")
            print(f"   • Reason: Likely overlapping words between text and caption")
            print(f"   • Documents with overlaps: {stats['overlapping_instances_count']}")
        elif actual_difference < 0:
            print(f"   • Difference: +{abs(actual_difference)} (combined has more)")
            print(f"   • Reason: 'svm representation' may have additional processing/words")
        else:
            print(f"   • Perfect match: Text + Caption = Combined")
            print(f"   • This indicates no overlapping words between text and caption")
        
        # Show sample words
        if len(stats['unique_feature_words']) > 0:
            sample_words = stats['unique_feature_words'][:10]
            print(f"   • Sample words: {', '.join(sample_words)}")
        
        results[category] = stats
    
    return results


def analyze_binary_text_caption_overlap(dataset_df, feature_categories, dataset_name, sample_size=3):
    """
    Analyze and show examples of text-caption overlap for binary ablation features.

    For each category, features are extracted from the lower-cased 'meme text'
    and 'meme caption' columns of every row with
    ``extract_binary_category_features``. A row counts as overlapping when at
    least one feature word occurs in both. The function prints the number and
    percentage of overlapping rows plus up to ``sample_size`` example rows
    (text and caption truncated to 100 characters).

    Parameters:
    -----------
    dataset_df : DataFrame
        Dataset containing meme data; rows are read via ``iterrows()``.
    feature_categories : list
        Category names understood by ``extract_binary_category_features``.
    dataset_name : str
        Dataset label used only in the printed header.
    sample_size : int
        Maximum number of overlap examples printed per category.

    Returns:
    --------
    None. All results are printed. An empty dataset reports 0.0% instead of
    raising ZeroDivisionError.
    """
    print(f"\n🔍 ANALYZING TEXT-CAPTION OVERLAP FOR {dataset_name} - BINARY CATEGORIES")
    print("=" * 80)
    
    # Load linguistic resources
    nrc_lexicon = load_nrc_lexicon("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv")
    function_words_dict = load_function_words()
    
    total_documents = len(dataset_df)
    
    for category in feature_categories:
        print(f"\n📊 {category.upper()} OVERLAP ANALYSIS:")
        print("-" * 50)
        
        overlap_examples = []
        total_overlaps = 0
        
        for idx, row in dataset_df.iterrows():
            meme_text = str(row.get('meme text', '')).lower()
            meme_caption = str(row.get('meme caption', '')).lower()
            
            text_features = extract_binary_category_features(meme_text, category, nrc_lexicon, hurtlex_dict, function_words_dict)
            caption_features = extract_binary_category_features(meme_caption, category, nrc_lexicon, hurtlex_dict, function_words_dict)
            
            overlapping_words = set(text_features).intersection(caption_features)
            
            if overlapping_words:
                total_overlaps += 1
                
                # Keep only the first `sample_size` overlapping rows as examples
                if len(overlap_examples) < sample_size:
                    overlap_examples.append({
                        'meme_id': row.get('meme id', idx),  # fall back to the row index
                        'text': meme_text[:100] + "..." if len(meme_text) > 100 else meme_text,
                        'caption': meme_caption[:100] + "..." if len(meme_caption) > 100 else meme_caption,
                        'overlapping_words': list(overlapping_words),
                        'text_features': text_features,
                        'caption_features': caption_features
                    })
        
        # Guard the share computation so an empty dataset does not divide by zero
        overlap_percentage = (total_overlaps/total_documents)*100 if total_documents else 0.0
        
        print(f"   • Total documents with overlapping features: {total_overlaps}")
        print(f"   • Percentage of dataset: {overlap_percentage:.1f}%")
        
        if overlap_examples:
            print(f"\n   📝 EXAMPLE OVERLAPS:")
            for i, example in enumerate(overlap_examples, 1):
                print(f"\n   Example {i} (ID: {example['meme_id']}):") 
                print(f"     Text: \"{example['text']}\"")
                print(f"     Caption: \"{example['caption']}\"")
                print(f"     Overlapping words: {example['overlapping_words']}")
                print(f"     Text features: {example['text_features']}")
                print(f"     Caption features: {example['caption_features']}")
        else:
            print(f"   • No overlapping features found for {category}")


def combine_binary_train_test_stats(train_stats, test_stats, dataset_name):
    """Combine training and test statistics for binary ablation.

    Parameters:
    -----------
    train_stats, test_stats : dict
        category -> statistics dict as produced by
        ``calculate_binary_ablation_statistics`` ('unique_feature_words' must
        be a list in both).
    dataset_name : str
        Dataset label used only in the printed header.

    Returns:
    --------
    dict : category -> combined statistics. Count fields are summed, the unique
        word lists are merged into a de-duplicated list (order unspecified), and
        'coverage_percentage' / 'unique_feature_count' are recomputed from the
        merged values. Categories missing from ``test_stats`` are left out.
        When both splits are empty the coverage is 0.0.
    """
    
    print(f"\n🔗 COMBINING TRAIN AND TEST STATS FOR {dataset_name} - BINARY ABLATION")
    print("-" * 50)
    
    # Fields that are plain counts and can simply be added across the two splits.
    additive_keys = (
        'total_documents',
        'documents_with_features',
        'features_in_text_only',
        'features_in_caption_only',
        'features_in_both',
        'features_in_neither',
        'text_feature_count',
        'caption_feature_count',
        'combined_feature_count',
        'overlapping_instances_count',
        'total_overlapping_features',
    )
    
    combined = {}
    
    for category, train in train_stats.items():
        if category not in test_stats:
            continue
        test = test_stats[category]
        
        # Combine counts
        merged = {key: train[key] + test[key] for key in additive_keys}
        
        # Combine unique words
        merged['unique_feature_words'] = list(set(train['unique_feature_words'] + test['unique_feature_words']))
        
        # Recalculate percentages; guard against two empty splits (0 documents)
        total_docs = merged['total_documents']
        docs_with_features = merged['documents_with_features']
        merged['coverage_percentage'] = (docs_with_features / total_docs) * 100 if total_docs else 0.0
        merged['unique_feature_count'] = len(merged['unique_feature_words'])
        
        combined[category] = merged
        
        print(f"   {category}: {merged['coverage_percentage']:.1f}% coverage ({docs_with_features}/{total_docs})")
    
    return combined


def generate_binary_comparison_report(mami_stats, exist_stats):
    """Print a side-by-side report of binary ablation statistics for MAMI and EXIST2024.

    Both arguments map category name -> statistics dict. For each dataset the
    report lists coverage, text/caption/combined counts, their difference,
    overlapping documents and unique word count for every known category that is
    present; a cross-dataset table follows, using 0 for missing categories.
    Nothing is returned.
    """
    categories = ['sentiment_pos', 'sentiment_neg', 'hate', 'function']

    def _value_or_zero(dataset_stats, category, key):
        # Missing categories show up as 0 in the cross-dataset table.
        return dataset_stats[category][key] if category in dataset_stats else 0

    print(f"\n📊 BINARY ABLATION COMPARATIVE ANALYSIS")
    print("=" * 80)

    # Same per-category layout for both datasets; only heading and data differ.
    sections = (
        ("\n🎯 MAMI BINARY FEATURES (Combined Train+Test):", mami_stats),
        ("\n🎯 EXIST2024 BINARY FEATURES (Combined Train+Test):", exist_stats),
    )
    for heading, dataset_stats in sections:
        print(heading)
        for category in categories:
            if category not in dataset_stats:
                continue

            stats = dataset_stats[category]
            text_plus_caption = stats['text_feature_count'] + stats['caption_feature_count']
            actual_combined = stats['combined_feature_count']
            difference = text_plus_caption - actual_combined

            if difference > 0:
                verdict = 'overlap'
            elif difference < 0:
                verdict = 'additional features'
            else:
                verdict = 'perfect match'

            print(f"   • {category}:")
            print(f"     - Coverage: {stats['coverage_percentage']:.1f}% ({stats['documents_with_features']}/{stats['total_documents']})")
            print(f"     - Text: {stats['text_feature_count']}, Caption: {stats['caption_feature_count']}, Combined: {actual_combined}")
            print(f"     - Difference: {difference} ({verdict})")
            print(f"     - Overlapping instances: {stats.get('overlapping_instances_count', 0)} documents")
            print(f"     - Unique words: {stats['unique_feature_count']}")

    # Cross-dataset comparison
    print(f"\n🔍 CROSS-DATASET COMPARISON:")
    print(f"{'Category':<15} {'MAMI Coverage':<15} {'EXIST Coverage':<15} {'MAMI Words':<12} {'EXIST Words':<12}")
    print("-" * 75)

    for category in categories:
        mami_coverage = _value_or_zero(mami_stats, category, 'coverage_percentage')
        exist_coverage = _value_or_zero(exist_stats, category, 'coverage_percentage')
        mami_words = _value_or_zero(mami_stats, category, 'unique_feature_count')
        exist_words = _value_or_zero(exist_stats, category, 'unique_feature_count')

        print(f"{category:<15} {mami_coverage:<15.1f} {exist_coverage:<15.1f} {mami_words:<12} {exist_words:<12}")

    closing_notes = (
        "\n🔍 KEY INSIGHTS:",
        "   • Combined counts reflect actual occurrences in 'svm representation' text",
        "   • Differences indicate natural overlap between text and caption content",
        "   • This analysis helps understand feature distribution patterns for binary ablation",
        "   • No artificial deduplication applied - raw occurrence counts maintained",
        "   • Coverage percentages show how widespread each feature type is in the datasets",
    )
    for note in closing_notes:
        print(note)


def run_binary_ablation_statistics_analysis():
    """
    Run the binary ablation statistics analysis for MAMI and EXIST2024.

    For each dataset the four binary ablation categories are analysed on the
    training and the test split, the two splits are merged with
    ``combine_binary_train_test_stats``, a text/caption overlap analysis is
    printed for the test split, and all statistics are written to
    ``evaluation/binary_ablation_analysis/binary_ablation_stats.json``.

    Reads the notebook-level DataFrames ``mami_training_df``, ``mami_test_df``,
    ``exist_training_df`` and ``exist_test_df``.

    Returns
    -------
    tuple
        ``(mami_combined_stats, exist_combined_stats)``.
    """

    def _category_stats(dataset_stats, category):
        """Return the stats of one category, or {} when the category is absent."""
        return dataset_stats[category] if category in dataset_stats else {}

    def _overlap_summary(category_stats):
        """Format 'overlaps/total (pct%)' without failing on missing keys or zero documents."""
        overlaps = category_stats.get('overlapping_instances_count', 0)
        total = category_stats.get('total_documents', 0)
        share = overlaps / total * 100 if total else 0.0
        return f"{overlaps}/{total} ({share:.1f}%)"

    print("🚀 BINARY ABLATION STATISTICS ANALYSIS")
    print("=" * 90)
    print("Analyzing four main binary ablation categories for both datasets")
    print("Categories: sentiment_pos, sentiment_neg, hate, function")
    print()

    # Define binary ablation categories
    binary_categories = ['sentiment_pos', 'sentiment_neg', 'hate', 'function']

    # Run analysis for MAMI
    print("🔍 MAMI DATASET ANALYSIS")
    print("Training set:")
    mami_training_stats = calculate_binary_ablation_statistics(mami_training_df, binary_categories, "MAMI")
    print("\nTest set:")
    mami_test_stats = calculate_binary_ablation_statistics(mami_test_df, binary_categories, "MAMI")

    # Run analysis for EXIST2024
    print("\n\n🔍 EXIST2024 DATASET ANALYSIS")
    print("Training set:")
    exist_training_stats = calculate_binary_ablation_statistics(exist_training_df, binary_categories, "EXIST2024")
    print("\nTest set:")
    exist_test_stats = calculate_binary_ablation_statistics(exist_test_df, binary_categories, "EXIST2024")

    # Combine stats for overall analysis
    mami_combined_stats = combine_binary_train_test_stats(mami_training_stats, mami_test_stats, "MAMI")
    exist_combined_stats = combine_binary_train_test_stats(exist_training_stats, exist_test_stats, "EXIST2024")

    # Analyze text-caption overlap for both datasets (test splits only)
    print("\n🔍 TEXT-CAPTION OVERLAP ANALYSIS:")
    analyze_binary_text_caption_overlap(mami_test_df, binary_categories, "MAMI", sample_size=3)
    analyze_binary_text_caption_overlap(exist_test_df, binary_categories, "EXIST2024", sample_size=3)

    # Save detailed results
    os.makedirs("evaluation/binary_ablation_analysis", exist_ok=True)

    detailed_results = {
        'MAMI': {
            'training': mami_training_stats,
            'test': mami_test_stats,
            'combined': mami_combined_stats
        },
        'EXIST2024': {
            'training': exist_training_stats,
            'test': exist_test_stats,
            'combined': exist_combined_stats
        }
    }

    with open("evaluation/binary_ablation_analysis/binary_ablation_stats.json", 'w') as f:
        json.dump(detailed_results, f, indent=2, cls=NumpyEncoder)

    print("\n✅ Results saved to evaluation/binary_ablation_analysis/binary_ablation_stats.json")

    # Generate comparison report
    generate_binary_comparison_report(mami_combined_stats, exist_combined_stats)

    # Generate category-specific insights
    print("\n📊 CATEGORY-SPECIFIC INSIGHTS:")
    print("-" * 50)

    for category in binary_categories:
        # A category missing from the combined stats is reported with zeros,
        # mirroring how the comparison report treats absent categories.
        mami_cat = _category_stats(mami_combined_stats, category)
        exist_cat = _category_stats(exist_combined_stats, category)

        print(f"\n🎯 {category.upper()}:")

        # Compare coverage across datasets
        mami_coverage = mami_cat.get('coverage_percentage', 0)
        exist_coverage = exist_cat.get('coverage_percentage', 0)
        print(f"   • Coverage: MAMI {mami_coverage:.1f}% vs EXIST2024 {exist_coverage:.1f}%")

        # Compare unique word counts
        mami_words = mami_cat.get('unique_feature_count', 0)
        exist_words = exist_cat.get('unique_feature_count', 0)
        print(f"   • Unique words: MAMI {mami_words} vs EXIST2024 {exist_words}")

        # Compare overlap patterns (guarded against empty datasets)
        print(f"   • Text-caption overlaps: MAMI {_overlap_summary(mami_cat)} vs EXIST2024 {_overlap_summary(exist_cat)}")

        # Sample words comparison
        mami_sample = list(mami_cat.get('unique_feature_words', []))[:5]
        exist_sample = list(exist_cat.get('unique_feature_words', []))[:5]
        print(f"   • Sample words MAMI: {', '.join(mami_sample)}")
        print(f"   • Sample words EXIST: {', '.join(exist_sample)}")

    print("\n🎉 BINARY ABLATION STATISTICS ANALYSIS COMPLETED!")
    print("📁 Results saved with proper feature counting (no artificial deduplication)")
    print("🔍 This analysis provides insights into feature distribution for binary ablation experiments")

    return mami_combined_stats, exist_combined_stats


# Entry point: run the binary ablation statistics analysis and report any failure
if __name__ == "__main__":
    import traceback

    try:
        print("🚀 Running binary ablation statistics analysis...")
        mami_results, exist_results = run_binary_ablation_statistics_analysis()

        print("\n🎉 ANALYSIS COMPLETE!")
        print("📁 Results saved with comprehensive statistics for binary ablation categories")

    except Exception as err:
        # Show the message and full traceback instead of aborting the notebook run.
        print(f"❌ Error: {err}")
        traceback.print_exc()
        print("Make sure all required functions and datasets are loaded")

## Multilabel

In [None]:
def run_coarse_grained_multilabel_ablation_experiments(dataset_df_train, dataset_df_test, feature_type,
                                                       ablation_method, dataset_name, binary_label, fine_grained_labels):
    """
    Run multilabel ablation experiment for coarse-grained feature categories.

    The selected feature words are masked or removed from the
    "svm representation" text of every training and test document, the ablated
    texts are passed to ``build_hierarchical_multilabel_classifier``, and the
    predictions and metrics are written to disk.

    Parameters:
    -----------
    dataset_df_train : DataFrame
        Training dataset
    dataset_df_test : DataFrame
        Test dataset
    feature_type : str
        Type of feature to ablate:
        - 'sentiment_pos': Positive sentiment words
        - 'sentiment_neg': Negative sentiment words
        - 'hate': All hate speech terms
        - 'function': All function words
        Any other value prints a warning once and ablates nothing.
    ablation_method : str
        Method for ablation ('mask' or 'remove')
    dataset_name : str
        Name of the dataset ('MAMI' or 'EXIST2024')
    binary_label : str
        Name of the binary label column
    fine_grained_labels : list
        List of fine-grained category label names

    Returns:
    --------
    dict or None
        ``{'model_name', 'metrics', 'ablation_stats'}`` on success; ``None``
        when training or evaluation raises ``ValueError``.

    Raises:
    -------
    ValueError
        If ``ablation_method`` is neither 'mask' nor 'remove'.
    """
    # Fail fast on an unsupported method, before any lexicon is loaded.
    if ablation_method not in ('mask', 'remove'):
        raise ValueError(f"Unknown ablation_method '{ablation_method}'; expected 'mask' or 'remove'.")

    # Warn once (not once per document) when nothing will be ablated.
    if feature_type not in ('sentiment_pos', 'sentiment_neg', 'hate', 'function'):
        print(f"Warning: Unknown coarse-grained feature type '{feature_type}'. No features will be ablated.")

    # Load necessary lexicons
    nrc_lexicon = load_nrc_lexicon("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    function_words_dict = load_function_words()
    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv")

    # Get original texts; the fine-grained stage only sees binary-positive rows
    X_train = dataset_df_train["svm representation"].tolist()
    y_train_binary = dataset_df_train[binary_label].tolist()
    train_df_bin_positive = dataset_df_train.loc[dataset_df_train[binary_label] == 1]
    X_train_bin_positive = train_df_bin_positive["svm representation"].tolist()
    y_train_categories = train_df_bin_positive[fine_grained_labels]

    X_test = dataset_df_test["svm representation"].tolist()

    # Prepare complete label list (binary + fine-grained)
    all_labels = [binary_label] + fine_grained_labels
    y_test_all = dataset_df_test[all_labels]

    # Create display name and model name for feature type
    display_name = f"Coarse-Grained: {feature_type.replace('_', ' ').title()}"
    feature_name_for_model = feature_type

    print(f"Running multilabel coarse-grained ablation experiment for {dataset_name} - {display_name} with {ablation_method} method...")

    def extract_coarse_grained_features(text):
        """Return the distinct feature words of `text` for the selected coarse-grained category."""
        if feature_type == 'sentiment_pos':
            return extract_sentiment_words(text, nrc_lexicon, 'positive')
        if feature_type == 'sentiment_neg':
            return extract_sentiment_words(text, nrc_lexicon, 'negative')
        if feature_type == 'hate':
            # Any token present in HurtLex, regardless of its category
            return list({word for word in text.lower().split() if word in hurtlex_dict})
        if feature_type == 'function':
            # Union of the function words of every category
            function_words = set()
            for category in function_words_dict.keys():
                function_words.update(extract_function_words(text, function_words_dict, category))
            return list(function_words)
        return []

    def ablate(text, feature_words):
        """Apply the selected ablation method; returns (ablated_text, count)."""
        if ablation_method == 'mask':
            return replace_mask_features(text, feature_words, mask_token='UNK')
        return remove_features(text, feature_words)

    # Statistics tracking (collected on the full training set only)
    total_feature_words_processed = 0
    documents_with_features = 0
    feature_words_per_doc = []

    # Process all training data
    X_train_ablated = []
    for text in X_train:
        feature_words = extract_coarse_grained_features(text)
        feature_words_set = set(feature_words)

        # Count feature word occurrences (repeats included) in this text
        num_feature_words = sum(1 for word in text.lower().split() if word in feature_words_set)
        feature_words_per_doc.append(num_feature_words)
        if num_feature_words > 0:
            documents_with_features += 1

        ablated_text, count = ablate(text, feature_words)
        X_train_ablated.append(ablated_text)
        total_feature_words_processed += count

    # Process positive-only training data and test data (no statistics needed)
    X_train_bin_pos_ablated = [
        ablate(text, extract_coarse_grained_features(text))[0] for text in X_train_bin_positive
    ]
    X_test_ablated = [
        ablate(text, extract_coarse_grained_features(text))[0] for text in X_test
    ]

    # Summary statistics, guarded against an empty training set
    total_documents = len(X_train_ablated)
    percentage_with_features = documents_with_features / total_documents * 100 if total_documents else 0.0
    avg_feature_words = float(np.mean(feature_words_per_doc)) if feature_words_per_doc else 0.0
    max_feature_words = max(feature_words_per_doc) if feature_words_per_doc else 0

    print(f"\n{'=' * 50}")
    print("Processing Summary:")
    print(f"Total documents: {total_documents}")
    print(f"Documents with features: {documents_with_features} ({percentage_with_features:.2f}%)")
    print(f"Total feature words processed: {total_feature_words_processed}")
    print(f"Average feature words per document: {avg_feature_words:.2f}")
    print(f"Max words per document: {max_feature_words}")
    print(f"{'=' * 50}")

    # Create model name
    model_name = f"{dataset_name}_svm_ablation_{feature_name_for_model}_{ablation_method}_hierarchy"

    try:
        # Train and evaluate model
        print(f"\nTraining model with ablated {display_name}...")
        test_pred_df, _bin_clf, _bin_vec, _ml_model, _ml_vec = build_hierarchical_multilabel_classifier(
            X_train_ablated,
            y_train_binary,
            X_train_bin_pos_ablated,
            y_train_categories,
            X_test_ablated,
            binary_label,
            fine_grained_labels
        )

        # Get gold file paths based on dataset ("EXIST" is stored under "EXIST2024")
        dataset_path_name = "EXIST2024" if dataset_name == "EXIST" else dataset_name
        gold_test_ml = f"models/evaluation/golds/{dataset_path_name}/{dataset_path_name}_test_hierarchical.json"
        gold_test_txt = f'models/evaluation/golds/{dataset_path_name}/{dataset_path_name}_test_truth.txt'
        evaluation_type = "hierarchical"

        # Create file with predictions
        print(f"\nSaving evaluation results to: {model_name}")
        test_pred_json_ml, test_pred_txt_ml = save_evaluation(
            dataset_df_test, "evaluation/predictions", dataset_name, "test",
            evaluation_type, model_name, test_pred_df, binary_label, all_labels
        )

        # Get evaluation metrics
        print(f"\n{'='*20} Evaluation Results for {display_name} {'='*20}")
        metrics = evaluate_multilabel_classification(
            gold_test_ml, test_pred_json_ml,
            y_test_all, test_pred_df.to_numpy(),
            gold_test_txt, test_pred_txt_ml,
            all_labels, hierarchy=True
        )

        # Keep raw predictions next to the metrics so later analyses can reuse them
        metrics['predictions'] = test_pred_df.to_numpy().tolist()
        metrics['true_labels'] = y_test_all.values.tolist()
        metrics['prediction_files'] = {
            'json': test_pred_json_ml,
            'txt': test_pred_txt_ml
        }
        metrics['all_labels'] = all_labels

        os.makedirs("evaluation/results/multi-label/SVM/coarse-grained", exist_ok=True)
        results_file = f"evaluation/results/multi-label/SVM/coarse-grained/{model_name}_results.json"
        with open(results_file, 'w') as f:
            json.dump(metrics, f, indent=2, cls=NumpyEncoder)
        print(f"✅ Results saved to: {results_file}")

        return {
            'model_name': model_name,
            'metrics': metrics,
            'ablation_stats': {
                'total_documents': total_documents,
                'documents_with_features': documents_with_features,
                'percentage_with_features': percentage_with_features,
                'total_feature_words': total_feature_words_processed,
                'avg_feature_words': avg_feature_words,
                'max_feature_words': max_feature_words
            }
        }

    except ValueError as e:
        print(f"Error in experiment: {e}")
        print(f"Skipping multilabel coarse-grained ablation for {feature_type} with {ablation_method}")
        return None



def run_fine_grained_multilabel_ablation_experiments(dataset_df_train, dataset_df_test, feature_type, feature_category,
                                                     ablation_method, dataset_name, binary_label, fine_grained_labels):
    """
    Run a complete multi-label ablation experiment for specific feature types.

    The selected feature words are masked or removed from the
    "svm representation" text of every training and test document, the ablated
    texts are passed to ``build_hierarchical_multilabel_classifier``, and the
    predictions and metrics are written to disk.

    Parameters:
    -----------
    dataset_df_train : DataFrame
        Training dataset
    dataset_df_test : DataFrame
        Test dataset
    feature_type : str
        Type of feature to ablate:
        - 'neg_emotion': Negative emotions (fear, anger, sadness, disgust)
        - 'function': Function words
        - 'hate': Hate speech terms
        Any other value prints a warning once and ablates nothing.
    feature_category : str or list
        Specific category within the feature type or list of categories.
        For 'neg_emotion', can be individual emotion or None for all negative emotions.
        For 'function', should be a specific function word category.
        For 'hate', should be a specific hate speech category.
    ablation_method : str
        Method for ablation ('mask' or 'remove')
    dataset_name : str
        Name of the dataset ('MAMI' or 'EXIST2024')
    binary_label : str
        Name of the binary label column
    fine_grained_labels : list
        List of fine-grained category label names

    Returns:
    --------
    dict or None
        ``{'model_name', 'metrics', 'ablation_stats'}`` on success; ``None``
        when training or evaluation raises ``ValueError``.

    Raises:
    -------
    ValueError
        If ``ablation_method`` is neither 'mask' nor 'remove'.
    """
    # Fail fast on an unsupported method, before any lexicon is loaded.
    if ablation_method not in ('mask', 'remove'):
        raise ValueError(f"Unknown ablation_method '{ablation_method}'; expected 'mask' or 'remove'.")

    # Warn once (not once per document) when nothing will be ablated.
    if feature_type not in ('neg_emotion', 'function', 'hate'):
        print(f"Warning: Unknown feature type '{feature_type}'. No features will be ablated.")

    # Load necessary lexicons
    nrc_lexicon = load_nrc_lexicon("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
    function_words_dict = load_function_words()
    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv")

    # Define negative emotions
    negative_emotions = ['fear', 'anger', 'sadness', 'disgust']

    # Get original texts; the fine-grained stage only sees binary-positive rows
    X_train = dataset_df_train["svm representation"].tolist()
    y_train_binary = dataset_df_train[binary_label].tolist()
    train_df_bin_positive = dataset_df_train.loc[dataset_df_train[binary_label] == 1]
    X_train_bin_positive = train_df_bin_positive["svm representation"].tolist()
    y_train_categories = train_df_bin_positive[fine_grained_labels]

    X_test = dataset_df_test["svm representation"].tolist()

    # Prepare complete label list (binary + fine-grained)
    all_labels = [binary_label] + fine_grained_labels
    y_test_all = dataset_df_test[all_labels]

    # Create display name for feature category
    if feature_type == 'neg_emotion' and feature_category is None:
        display_name = "All Negative Emotions (fear, anger, sadness, disgust)"
        feature_name_for_model = "all_neg_emotions"
    elif feature_type == 'neg_emotion':
        display_name = f"Negative Emotion: {feature_category}"
        feature_name_for_model = f"neg_{feature_category}"
    elif feature_type == 'function':
        display_name = f"Function Words: {feature_category}"
        feature_name_for_model = f"func_{feature_category}"
    elif feature_type == 'hate':
        display_name = f"Hate Speech: {feature_category}"
        feature_name_for_model = f"hate_{feature_category}"
    else:
        display_name = f"{feature_type}: {feature_category}"
        feature_name_for_model = f"{feature_type}_{feature_category}"

    print(f"Running multi-label ablation experiment for {dataset_name} - {display_name} with {ablation_method} method...")

    def extract_targeted_features(text):
        """Return the feature words of `text` for the selected feature type and category."""
        if feature_type == 'neg_emotion':
            if feature_category is not None:
                # Specific negative emotion
                return extract_emotion_words(text, nrc_lexicon, feature_category)
            # Union over all negative emotions, duplicates removed
            all_neg_words = set()
            for emotion in negative_emotions:
                all_neg_words.update(extract_emotion_words(text, nrc_lexicon, emotion))
            return list(all_neg_words)
        if feature_type == 'function':
            return extract_function_words(text, function_words_dict, feature_category)
        if feature_type == 'hate':
            return extract_hate_speech_terms(text, hurtlex_dict, feature_category)
        return []

    def ablate(text, feature_words):
        """Apply the selected ablation method; returns (ablated_text, count)."""
        if ablation_method == 'mask':
            return replace_mask_features(text, feature_words, mask_token='UNK')
        return remove_features(text, feature_words)

    # Statistics tracking (collected on the full training set only)
    total_feature_words_processed = 0
    documents_with_features = 0
    feature_words_per_doc = []

    # Process all training data
    X_train_ablated = []
    for text in X_train:
        feature_words = extract_targeted_features(text)
        feature_words_set = set(feature_words)

        # Count feature word occurrences (repeats included) in this text
        num_feature_words = sum(1 for word in text.lower().split() if word in feature_words_set)
        feature_words_per_doc.append(num_feature_words)
        if num_feature_words > 0:
            documents_with_features += 1

        ablated_text, count = ablate(text, feature_words)
        X_train_ablated.append(ablated_text)
        total_feature_words_processed += count

    # Process positive-only training data and test data (no statistics needed)
    X_train_bin_pos_ablated = [
        ablate(text, extract_targeted_features(text))[0] for text in X_train_bin_positive
    ]
    X_test_ablated = [
        ablate(text, extract_targeted_features(text))[0] for text in X_test
    ]

    # Summary statistics, guarded against an empty training set
    total_documents = len(X_train_ablated)
    percentage_with_features = documents_with_features / total_documents * 100 if total_documents else 0.0
    avg_feature_words = float(np.mean(feature_words_per_doc)) if feature_words_per_doc else 0.0
    max_feature_words = max(feature_words_per_doc) if feature_words_per_doc else 0

    print(f"\n{'=' * 50}")
    print("Processing Summary:")
    print(f"Total documents: {total_documents}")
    print(f"Documents with features: {documents_with_features} ({percentage_with_features:.2f}%)")
    print(f"Total feature words processed: {total_feature_words_processed}")
    print(f"Average feature words per document: {avg_feature_words:.2f}")
    print(f"Max words per document: {max_feature_words}")
    print(f"{'=' * 50}")

    # Create model name
    model_name = f"{dataset_name}_svm_ablation_{feature_name_for_model}_{ablation_method}_hierarchy"

    try:
        # Train and evaluate model
        print(f"\nTraining model with ablated {display_name}...")
        test_pred_df, _bin_clf, _bin_vec, _ml_model, _ml_vec = build_hierarchical_multilabel_classifier(
            X_train_ablated,
            y_train_binary,
            X_train_bin_pos_ablated,
            y_train_categories,
            X_test_ablated,
            binary_label,
            fine_grained_labels
        )

        # Get gold file paths based on dataset ("EXIST" is stored under "EXIST2024").
        # Same mapping as the coarse-grained experiment; without it the gold paths
        # below reference an undefined name.
        dataset_path_name = "EXIST2024" if dataset_name == "EXIST" else dataset_name
        gold_test_ml = f"models/evaluation/golds/{dataset_path_name}/{dataset_path_name}_test_hierarchical.json"
        gold_test_txt = f'models/evaluation/golds/{dataset_path_name}/{dataset_path_name}_test_truth.txt'
        evaluation_type = "hierarchical"

        # Create file with predictions
        print(f"\nSaving evaluation results to: {model_name}")
        test_pred_json_ml, test_pred_txt_ml = save_evaluation(
            dataset_df_test, "evaluation/predictions", dataset_name, "test",
            evaluation_type, model_name, test_pred_df, binary_label, all_labels
        )

        # Get evaluation metrics
        print(f"\n{'='*20} Evaluation Results for {display_name} {'='*20}")
        metrics = evaluate_multilabel_classification(
            gold_test_ml, test_pred_json_ml,
            y_test_all, test_pred_df.to_numpy(),
            gold_test_txt, test_pred_txt_ml,
            all_labels, hierarchy=True
        )

        os.makedirs("evaluation/results/multi-label/SVM/fine-grained/", exist_ok=True)
        results_file = f"evaluation/results/multi-label/SVM/fine-grained/{model_name}_results.json"
        with open(results_file, 'w') as f:
            json.dump(metrics, f, indent=2, cls=NumpyEncoder)
        print(f"✅ Results saved to: {results_file}")

        return {
            'model_name': model_name,
            'metrics': metrics,
            'ablation_stats': {
                'total_documents': total_documents,
                'documents_with_features': documents_with_features,
                'percentage_with_features': percentage_with_features,
                'total_feature_words': total_feature_words_processed,
                'avg_feature_words': avg_feature_words,
                'max_feature_words': max_feature_words
            }
        }

    except ValueError as e:
        print(f"Error in experiment: {e}")
        print(f"Skipping multi-label ablation for {feature_type} - {feature_category} with {ablation_method}")
        return None

    
def run_multilabel_experiments(dataset_name="MAMI", ablation_method="mask"):
    """
    Run multi-label ablation experiments including both:
    1. Coarse-grained categories (sentiment_pos, sentiment_neg, hate, function)
    2. Fine-grained categories (individual emotions, function word categories, hate speech categories)

    Reads the notebook-level DataFrames ``mami_training_df``/``mami_test_df``
    (for "MAMI") or ``exist_training_df``/``exist_test_df`` (any other name),
    and writes the collected results to
    ``evaluation/multilabel_results/{dataset_name}_{ablation_method}_multilabel_results.json``.

    Parameters:
    -----------
    dataset_name : str
        Name of the dataset ('MAMI' or 'EXIST2024')
    ablation_method : str
        Method for ablation ('mask' or 'remove')

    Returns:
    --------
    dict
        Keys 'coarse_grained', 'neg_emotion', 'function' and 'hate', each
        mapping a category name to that experiment's result (``None`` when
        the experiment was skipped).

    Raises:
    -------
    ValueError
        If ``ablation_method`` is neither 'mask' nor 'remove'.
    """
    # Reject an unsupported method before starting the long series of runs.
    if ablation_method not in ('mask', 'remove'):
        raise ValueError(f"Unknown ablation_method '{ablation_method}'; expected 'mask' or 'remove'.")

    def print_section(title):
        """Print a section banner framed by separator lines."""
        print("\n" + "=" * 50)
        print(title)
        print("=" * 50)

    print("=" * 80)
    print(f"RUNNING MULTI-LABEL ABLATION EXPERIMENTS FOR {dataset_name}")
    print("Including both coarse-grained and fine-grained categories")
    print("Using ablation method:", ablation_method)
    print("=" * 80)

    # Set up dataset and labels based on dataset name
    if dataset_name == "MAMI":
        training_df = mami_training_df
        test_df = mami_test_df
        binary_label = "misogynous"
        fine_grained_labels = ["shaming", "stereotype", "objectification", "violence"]
    else:  # EXIST2024
        training_df = exist_training_df
        test_df = exist_test_df
        binary_label = "sexist"
        fine_grained_labels = ["ideological-inequality", "stereotyping-dominance", "objectification",
                               "sexual-violence", "misogyny-non-sexual-violence"]

    def run_fine_grained(feature_type, feature_category):
        """Run one fine-grained experiment with this dataset's fixed arguments."""
        return run_fine_grained_multilabel_ablation_experiments(
            training_df, test_df, feature_type, feature_category,
            ablation_method, dataset_name, binary_label, fine_grained_labels
        )

    # Store results
    results = {
        'coarse_grained': {},
        'neg_emotion': {},
        'function': {},
        'hate': {}
    }

    # 1. Coarse-grained experiments
    print_section("COARSE-GRAINED ABLATION EXPERIMENTS")

    coarse_grained_categories = ['sentiment_pos', 'sentiment_neg', 'hate', 'function']

    for category in coarse_grained_categories:
        print(f"\n🎯 Running coarse-grained experiment: {category}")
        results['coarse_grained'][category] = run_coarse_grained_multilabel_ablation_experiments(
            training_df, test_df, category,
            ablation_method, dataset_name, binary_label, fine_grained_labels
        )

    # 2. Fine-grained negative emotions: all combined first, then each one alone
    print_section("FINE-GRAINED NEGATIVE EMOTIONS ABLATION")

    results['neg_emotion']['combined'] = run_fine_grained('neg_emotion', None)

    negative_emotions = ['fear', 'anger', 'sadness', 'disgust']
    for emotion in negative_emotions:
        results['neg_emotion'][emotion] = run_fine_grained('neg_emotion', emotion)

    # 3. Function words: one experiment per function word category
    print_section("FINE-GRAINED FUNCTION WORDS ABLATION")

    function_words_dict = load_function_words()
    for category in list(function_words_dict.keys()):
        results['function'][category] = run_fine_grained('function', category)

    # 4. Hate speech: one experiment per HurtLex category
    print_section("FINE-GRAINED HATE SPEECH LEXICON ABLATION")

    hurtlex_dict = load_hurtlex("hurtlex-master/lexica/EN/1.2/hurtlex_EN.tsv")
    category_counts = count_words_by_category(hurtlex_dict)
    for category in list(category_counts.keys()):
        results['hate'][category] = run_fine_grained('hate', category)

    # Save results
    print("\n📊 SAVING RESULTS...")
    os.makedirs("evaluation/multilabel_results", exist_ok=True)

    results_file = f"evaluation/multilabel_results/{dataset_name}_{ablation_method}_multilabel_results.json"
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2, cls=NumpyEncoder)

    print(f"✅  multilabel results saved to: {results_file}")

    # Summary report
    print(f"\n📊 MULTILABEL EXPERIMENT SUMMARY FOR {dataset_name}:")
    print("-" * 60)
    print(f"✅ Coarse-grained categories: {len(results['coarse_grained'])} experiments")
    print(f"✅ Fine-grained emotions: {len(results['neg_emotion'])} experiments")
    print(f"✅ Fine-grained function words: {len(results['function'])} experiments")
    print(f"✅ Fine-grained hate speech: {len(results['hate'])} experiments")

    # An experiment that was skipped stores None; only the others count as completed.
    attempted = sum(len(group) for group in results.values())
    completed = sum(
        1 for group in results.values() for outcome in group.values() if outcome is not None
    )
    print(f"🎯 Total experiments completed: {completed}")
    if completed != attempted:
        print(f"⚠️ Experiments skipped due to errors: {attempted - completed}")

    return results


def run_all_multilabel_experiments():
    """
    Execute the multilabel ablation experiments for every dataset/method pair.

    Runs MAMI and EXIST2024, each with the 'mask' and the 'remove' method,
    stores the four result sets under '<dataset>_<method>' keys, and writes
    them to evaluation/multilabel_results/all_multilabel_experiments.json.

    Returns
    -------
    dict
        Mapping '<dataset>_<method>' -> result of run_multilabel_experiments.
    """
    for banner_line in (
        "🚀 RUNNING ALL MULTILABEL EXPERIMENTS",
        "=" * 80,
        "This includes coarse-grained categories: sentiment_pos, sentiment_neg, hate, function",
        "Plus all existing fine-grained categories",
        "",
    ):
        print(banner_line)

    # (section header, dataset name) in execution order
    dataset_plan = (
        ("🎯 MAMI DATASET EXPERIMENTS", "MAMI"),
        ("\n🎯 EXIST2024 DATASET EXPERIMENTS", "EXIST2024"),
    )
    result_groups = ('coarse_grained', 'neg_emotion', 'function', 'hate')

    all_results = {}
    for header, dataset_label in dataset_plan:
        print(header)
        print("=" * 50)

        for method_label in ("mask", "remove"):
            print(f"\n📦 {dataset_label} - {method_label.upper()} method...")
            all_results[f"{dataset_label}_{method_label}"] = run_multilabel_experiments(
                dataset_name=dataset_label, ablation_method=method_label
            )

    # Persist the four result sets in one file
    print("\n💾 SAVING COMBINED RESULTS...")
    os.makedirs("evaluation/multilabel_results", exist_ok=True)

    combined_results_file = "evaluation/multilabel_results/all_multilabel_experiments.json"
    with open(combined_results_file, 'w') as out_handle:
        json.dump(all_results, out_handle, indent=2, cls=NumpyEncoder)

    print(f"✅ All multilabel results saved to: {combined_results_file}")

    # Per-run totals
    print("\n🎉 ALL MULTILABEL EXPERIMENTS COMPLETED!")
    print("=" * 80)

    for run_key, run_results in all_results.items():
        dataset, method = run_key.split('_')
        total_exp = sum(len(run_results.get(group, {})) for group in result_groups)
        print(f"📊 {dataset} ({method}): {total_exp} experiments completed")

    return all_results


# Entry point: launch every multilabel ablation run and print a closing summary
if __name__ == "__main__":
    # Covers both datasets and both ablation methods, coarse- and fine-grained
    print("🚀 Starting multilabel experiments with coarse-grained categories...")
    all_results = run_all_multilabel_experiments()

    # Closing banner, one message per output line
    print(
        "\n✅ Multilabel experiments completed!",
        "📁 Results include both coarse-grained and fine-grained ablation experiments",
        "🎯 Coarse-grained categories: sentiment_pos, sentiment_neg, hate, function",
        "🔬 Fine-grained categories: individual emotions, function word types, hate speech categories",
        sep="\n",
    )

## POS Ablation

In [None]:
# Import spaCy and load the English model, downloading the model on first use.
import subprocess
import sys

import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    print("Installing spaCy English model...")
    # Use the kernel's own interpreter (a bare "python" on PATH may be a different
    # environment) and fail loudly if the download itself fails.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

In [None]:
# Load spaCy model for POS tagging
# (rebinds the module-level `nlp`, which the setup cell above also assigns,
# to a pipeline loaded from "en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# Open-class Universal POS tags, based on Van Nooten et al. (2021).
# Tag inventory: https://universaldependencies.org/u/pos/
#   ADJ   = adjective       ADV   = adverb         INTJ = interjection
#   NOUN  = noun            PROPN = proper noun    VERB = verb
UNIVERSAL_POS_TAGS = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']

In [None]:

def run_pos_ablation_experiment(dataset_df_train, dataset_df_test, pos_category,
                                dataset_name, binary_label='misogynous',
                                train_pos_tags=None, test_pos_tags=None):
    """
    Train and evaluate a binary classifier with one POS category ablated.

    The "svm representation" texts of both splits are passed through
    create_pos_ablated_dataset for `pos_category`; a classifier is built on the
    ablated training texts, applied to the ablated test texts, and the
    predictions and metrics are saved to disk.

    Parameters:
    - dataset_df_train: Training DataFrame (must contain "svm representation"
      and the `binary_label` column)
    - dataset_df_test: Test DataFrame (same columns)
    - pos_category: POS category to ablate
    - dataset_name: "MAMI" selects the MAMI label names; any other value is
      treated as EXIST2024
    - binary_label: Binary label column name
    - train_pos_tags: Precomputed POS tags for training data (forwarded to
      create_pos_ablated_dataset)
    - test_pos_tags: Precomputed POS tags for test data (forwarded likewise)

    Returns:
    - dict with the metrics, predictions, true labels, ablation statistics and
      prediction file references, or None if training/evaluation raised.

    Side effects:
    - Calls save_evaluation with "evaluation/predictions" as its output location.
    - Writes evaluation/results/POS/SVM/<model_name>_results.json.
    """
    # Imported here so the function does not depend on these names happening to
    # leak into the notebook namespace through `from utils_* import *`.
    import traceback
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    print(f"\n{'='*60}")
    print(f"POS Ablation Experiment: {pos_category}")
    print(f"Dataset: {dataset_name}")
    print(f"{'='*60}")

    # Get original texts and labels
    X_train_orig = dataset_df_train["svm representation"].tolist()
    X_test_orig = dataset_df_test["svm representation"].tolist()
    y_train = dataset_df_train[binary_label].tolist()
    y_test = dataset_df_test[binary_label].tolist()

    # Create ablated datasets using precomputed POS tags
    print("\nCreating ablated training set...")
    X_train_ablated, train_stats = create_pos_ablated_dataset(
        X_train_orig, pos_category, train_pos_tags)

    print("\nCreating ablated test set...")
    X_test_ablated, test_stats = create_pos_ablated_dataset(
        X_test_orig, pos_category, test_pos_tags)

    # Print statistics (same report for both splits)
    print(f"\n{'-'*50}")
    print("ABLATION STATISTICS:")
    print(f"{'-'*50}")
    for header, stats in (("Training set:", train_stats), ("\nTest set:", test_stats)):
        print(header)
        print(f"  - Documents affected: {stats['documents_affected']}/{stats['total_documents']} ({stats['percentage_docs_affected']:.2f}%)")
        print(f"  - Total words removed: {stats['total_words_removed']}")
        print(f"  - Avg words removed per doc: {stats['avg_words_removed_per_doc']:.2f}")
        print(f"  - Max words removed: {stats['max_words_removed']}")

    # Train model on ablated data
    print(f"\n{'-'*50}")
    print("TRAINING MODEL ON ABLATED DATA:")
    print(f"{'-'*50}")

    try:
        model, vectorizer = build_bin_classifier(X_train_ablated, y_train)
        y_pred = classify_data(X_test_ablated, model, vectorizer)

        # Set up evaluation parameters
        model_name = f"svm_pos_ablation_{pos_category.lower()}_{dataset_name.lower()}"
        evaluation_type = "binary"

        # Set gold standard file paths.
        # NOTE(review): these are empty placeholders and, being local, they
        # shadow the module-level gold_test_* variables -- fill them in (or
        # pass the paths in) before running.
        if dataset_name == "MAMI":
            gold_test_bin = "" # Path to MAMI binary classification gold labels (json)
            gold_test_txt = "" # Path to MAMI text gold labels (txt)
            label_names = ["non-misogynous", "misogynous"]
        else:  # EXIST2024
            gold_test_bin = "" # Path to EXIST2024 binary classification gold labels (json)
            gold_test_txt = "" # Path to EXIST2024 text gold labels (txt)
            label_names = ["non-sexist", "sexist"]

        # Save predictions and evaluate
        test_pred_json, test_pred_txt = save_evaluation(
            dataset_df_test, "evaluation/predictions", dataset_name, "test",
            evaluation_type, model_name, y_pred, binary_label, []
        )

        print(f"\n{'-'*50}")
        print("EVALUATION RESULTS:")
        print(f"{'-'*50}")

        # Use existing evaluation function
        evaluate_binary_classification(
            gold_test_bin, test_pred_json, y_test, y_pred,
            gold_test_txt, test_pred_txt, label_names,
            model_name=f"SVM POS Ablation ({pos_category})"
        )

        # Macro metrics. zero_division=0 matches the classification_report call
        # below: the scores are the same as with the default, only the
        # undefined-metric warning is suppressed.
        accuracy = accuracy_score(y_test, y_pred)
        precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)

        # Get classification report as dictionary
        class_report_dict = classification_report(y_test, y_pred,
                                                  target_names=label_names,
                                                  zero_division=0, digits=3,
                                                  output_dict=True)

        # Calculate binary F1 score (MAMI evaluation metric)
        binary_f1 = evaluate_f1_scores(gold_test_txt, test_pred_txt, 2)

        prediction_files = {
            'json': test_pred_json,
            'txt': test_pred_txt
        }

        # Results persisted to disk (includes predictions for later significance tests)
        pos_results = {
            'binary_f1': binary_f1,
            'macro_f1': f1_macro,
            'accuracy': accuracy,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'per_label_metrics': class_report_dict,
            'predictions': y_pred.tolist(),
            'true_labels': y_test,
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
            'label_names': label_names,
            'prediction_files': prediction_files
        }

        # Save results to JSON file
        os.makedirs("evaluation/results/POS/SVM", exist_ok=True)
        results_file = f"evaluation/results/POS/SVM/{model_name}_results.json"

        with open(results_file, 'w') as f:
            json.dump(pos_results, f, indent=2, cls=NumpyEncoder)

        print(f"✅ POS results saved to: {results_file}")

        # Structured results returned to the caller
        return {
            'pos_category': pos_category,
            'model_name': model_name,
            'accuracy': accuracy,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'f1_macro': f1_macro,
            'binary_f1': binary_f1,
            'predictions': y_pred.tolist(),
            'true_labels': y_test,
            'train_stats': train_stats,
            'test_stats': test_stats,
            'prediction_files': dict(prediction_files)
        }

    except Exception as e:
        # Best-effort by design: one failing category must not abort the whole
        # study. The traceback is printed so the failure can still be diagnosed.
        print(f"Error in POS ablation for {pos_category}: {e}")
        traceback.print_exc()
        return None



def run_complete_pos_ablation_study(dataset_df_train, dataset_df_test, dataset_name,
                                   binary_label='misogynous', pos_tags=None):
    """
    Run the baseline plus one POS ablation experiment per category.

    POS tags for both splits are computed once with extract_pos_tags_batch and
    reused by every per-category experiment.

    Parameters:
    - dataset_df_train: Training DataFrame (must contain "svm representation"
      and the `binary_label` column)
    - dataset_df_test: Test DataFrame (same columns)
    - dataset_name: Dataset name, forwarded to run_pos_ablation_experiment and
      used (lower-cased) in the baseline results filename
    - binary_label: Binary label column name
    - pos_tags: POS categories to ablate; defaults to a copy of UNIVERSAL_POS_TAGS

    Returns:
    - dict with 'baseline' (metrics, predictions, true labels) and
      'ablation_results' (pos_category -> result dict). Categories whose
      experiment returned None are omitted and listed in a printed warning.

    Side effects:
    - Writes evaluation/results/POS/SVM/pos_baseline_<dataset>_results.json.
    """
    # Imported here so the function does not depend on these names happening to
    # leak into the notebook namespace through `from utils_* import *`.
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    if pos_tags is None:
        pos_tags = UNIVERSAL_POS_TAGS.copy()

    print(f"\n{'='*80}")
    print(f"OPTIMIZED POS ABLATION STUDY FOR {dataset_name}")
    print(f"Testing {len(pos_tags)} POS categories: {', '.join(pos_tags)}")
    print(f"{'='*80}")

    # Get original texts and labels
    X_train = dataset_df_train["svm representation"].tolist()
    X_test = dataset_df_test["svm representation"].tolist()
    y_train = dataset_df_train[binary_label].tolist()
    y_test = dataset_df_test[binary_label].tolist()

    # Precompute POS tags for all texts once
    print(f"\n{'='*60}")
    print("PRECOMPUTING POS TAGS (This will save time for multiple experiments)")
    print(f"{'='*60}")

    print("Computing POS tags for training data...")
    train_pos_tags = extract_pos_tags_batch(X_train, batch_size=1000)

    print("Computing POS tags for test data...")
    test_pos_tags = extract_pos_tags_batch(X_test, batch_size=1000)

    print("POS tag computation complete! Now running experiments...")

    # First run baseline (no ablation)
    print(f"\n{'='*60}")
    print("BASELINE EXPERIMENT (NO ABLATION)")
    print(f"{'='*60}")

    baseline_model, baseline_vec = build_bin_classifier(X_train, y_train)
    baseline_pred = classify_data(X_test, baseline_model, baseline_vec)

    # zero_division=0 yields the same scores as the default and only
    # suppresses the undefined-metric warning.
    baseline_accuracy = accuracy_score(y_test, baseline_pred)
    baseline_f1 = f1_score(y_test, baseline_pred, average='macro', zero_division=0)
    baseline_precision = precision_score(y_test, baseline_pred, average='macro', zero_division=0)
    baseline_recall = recall_score(y_test, baseline_pred, average='macro', zero_division=0)

    print(f"Baseline Results:")
    print(f"  - Accuracy: {baseline_accuracy:.3f}")
    print(f"  - F1-macro: {baseline_f1:.3f}")
    print(f"  - Precision-macro: {baseline_precision:.3f}")
    print(f"  - Recall-macro: {baseline_recall:.3f}")

    # Baseline metrics and predictions, kept for comparison with each ablation
    baseline_results = {
        'accuracy': baseline_accuracy,
        'f1_macro': baseline_f1,
        'precision_macro': baseline_precision,
        'recall_macro': baseline_recall,
        'predictions': baseline_pred.tolist(),
        'true_labels': y_test
    }

    # Save baseline results
    os.makedirs("evaluation/results/POS/SVM", exist_ok=True)
    baseline_file = f"evaluation/results/POS/SVM/pos_baseline_{dataset_name.lower()}_results.json"
    with open(baseline_file, 'w') as f:
        json.dump(baseline_results, f, indent=2, cls=NumpyEncoder)

    print(f"✅ Baseline predictions saved to: {baseline_file}")

    # Store all results
    all_results = {
        'baseline': baseline_results,
        'ablation_results': {}
    }

    # Run ablation for each POS category using precomputed tags
    failed_categories = []
    for pos_category in pos_tags:
        results = run_pos_ablation_experiment(
            dataset_df_train, dataset_df_test, pos_category, dataset_name, binary_label,
            train_pos_tags, test_pos_tags
        )

        if results is not None:
            all_results['ablation_results'][pos_category] = results
        else:
            failed_categories.append(pos_category)

    # A failed experiment returns None; make the gap in the results visible.
    if failed_categories:
        print(f"⚠️ POS ablation failed for {len(failed_categories)} categories "
              f"(omitted from results): {', '.join(failed_categories)}")

    return all_results


In [None]:
def run_mami_pos_ablation():
    """Launch the full POS ablation study on the MAMI train/test frames.

    Returns whatever run_complete_pos_ablation_study returns for MAMI, using
    the 'misogynous' column as the binary label.
    """
    print("Starting MAMI POS Ablation Study...")
    return run_complete_pos_ablation_study(
        mami_training_df,
        mami_test_df,
        "MAMI",
        binary_label='misogynous',
    )

def run_exist2024_pos_ablation():
    """Launch the full POS ablation study on the EXIST2024 train/test frames.

    Returns whatever run_complete_pos_ablation_study returns for EXIST2024,
    using the 'sexist' column as the binary label.
    """
    print("Starting EXIST2024 POS Ablation Study...")
    return run_complete_pos_ablation_study(
        exist_training_df,
        exist_test_df,
        "EXIST2024",
        binary_label='sexist',
    )

In [None]:
# Run the POS ablation study on both datasets. Each call builds a baseline
# classifier plus one classifier per POS category and writes result files.
mami_results = run_mami_pos_ablation()
exist_results = run_exist2024_pos_ablation()

# Export Results

## Coarse-grained

In [None]:
def load_json_results(file_path):
    """
    Load a results dictionary from a JSON file.

    Parameters:
    - file_path: Path of the JSON file to read

    Returns:
    - The decoded JSON content, or None if the file could not be opened or
      parsed (the error is printed, not raised).
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        # Best-effort loader: callers treat None as "skip this file".
        print(f"Error loading {file_path}: {e}")
        return None

def extract_feature_info_from_filename(filename):
    """
    Extract feature type and ablation method from a binary results filename.

    Parameters:
    - filename: File name or path of a results file

    Returns:
    - (feature_type, method): feature_type is one of 'Pos_Sentiment',
      'Neg_Sentiment', 'Function', 'Hate' or None; method is 'Placeholder'
      (filename contains 'mask'), 'Remove' (contains 'remove') or None.
      When several markers occur, the first one in the tables below wins.
    """
    # Strip the longer suffix first: removing '_results.json' first would leave
    # '_bin' behind and the '_bin_results.json' replacement could never match.
    basename = (
        os.path.basename(filename)
        .replace('_bin_results.json', '')
        .replace('_results.json', '')
    )

    # Ordered (substring, label) tables; order defines precedence.
    feature_markers = (
        ('sentiment_pos', 'Pos_Sentiment'),
        ('sentiment_neg', 'Neg_Sentiment'),
        ('function', 'Function'),
        ('hate', 'Hate'),
    )
    method_markers = (
        ('mask', 'Placeholder'),
        ('remove', 'Remove'),
    )

    feature_type = next(
        (label for marker, label in feature_markers if marker in basename), None)
    method = next(
        (label for marker, label in method_markers if marker in basename), None)

    return feature_type, method

def collect_binary_results(dataset_name):
    """
    Collect binary feature-ablation results for one dataset.

    Scans evaluation/results/binary/SVM for result files whose name contains
    the lower-cased dataset name, skips baseline and random files, and builds
    one row per file whose name yields both a feature type and a method.

    Parameters:
    - dataset_name: 'MAMI' or 'EXIST2024' (selects the positive-class label)

    Returns:
    - list of row dicts with macro precision/recall/F1 and positive-class F1,
      in sorted file-path order.
    """
    results_dir = "evaluation/results/binary/SVM"
    dataset_lower = dataset_name.lower()

    # Find all binary result files
    patterns = [
        f"{results_dir}/*{dataset_lower}*bin_results.json",
        f"{results_dir}/*{dataset_lower}*results.json"
    ]

    # The patterns overlap, so de-duplicate; sort because neither glob nor set
    # iteration gives a stable order, and row order should be reproducible.
    found_files = sorted({path for pattern in patterns for path in glob.glob(pattern)})

    # Name of the positive class in per_label_metrics for each dataset
    positive_label = {'MAMI': 'misogynous', 'EXIST2024': 'sexist'}.get(dataset_name)

    binary_results = []

    for file_path in found_files:
        filename = os.path.basename(file_path)
        lowered = filename.lower()

        # Baseline and random-word files are collected by dedicated functions
        if 'baseline' in lowered or 'random' in lowered:
            continue

        feature_type, method = extract_feature_info_from_filename(filename)
        if not (feature_type and method):
            continue

        data = load_json_results(file_path)
        if not data:
            continue

        # Get positive class F1 from per_label_metrics (0 when unavailable)
        f1_positive = 0
        per_label = data.get('per_label_metrics', {})
        if positive_label is not None and positive_label in per_label:
            f1_positive = per_label[positive_label]['f1-score']

        binary_results.append({
            'Dataset': dataset_name,
            'Experiment Category': 'Feature Ablation',
            'Feature Type': feature_type,
            'Ablation Method': method,
            'Precision Macro': data.get('precision_macro', 0),
            'Recall Macro': data.get('recall_macro', 0),
            'F1 Macro': data.get('macro_f1', 0),
            'F1 Positive Class': f1_positive
        })

    return binary_results


def add_baseline_results(results_list, dataset_name, experiment_type):
    """
    Insert the baseline row for a dataset at the front of `results_list`.

    Parameters:
    - results_list: list of row dicts, modified in place
    - dataset_name: 'MAMI' or 'EXIST2024'; inserted verbatim into the baseline
      file name
    - experiment_type: 'binary' for the binary baseline; any other value is
      treated as multilabel

    Returns:
    - None. If the baseline file is missing a warning is printed; if it is
      missing, unreadable or empty, `results_list` is left unchanged.
    """
    # NOTE(review): dataset_name is used with its original case here, while the
    # other collectors lower-case it in their glob patterns -- confirm the
    # baseline files are really named this way.
    if experiment_type == 'binary':
        baseline_file = f"evaluation/results/binary/SVM/svm_baseline_bow_{dataset_name}_bin_baseline_results.json"
    else:  # multilabel
        baseline_file = f"evaluation/results/multi-label/SVM/fine-grained/svm_baseline_bow_hierarchy_{dataset_name}_results.json"

    if not os.path.exists(baseline_file):
        # Make the missing baseline visible instead of silently exporting a
        # table without a reference row.
        print(f"⚠️ Baseline results file not found, no baseline row added: {baseline_file}")
        return

    baseline_data = load_json_results(baseline_file)
    if not baseline_data:
        return

    baseline_row = {
        'Dataset': dataset_name,
        'Experiment Category': 'Baseline',
        'Feature Type': '',
        'Ablation Method': '',
    }
    per_label_metrics = baseline_data.get('per_label_metrics', {})

    if experiment_type == 'binary':
        # Positive-class F1 (0 when the label is not present in the report)
        positive_label = {'MAMI': 'misogynous', 'EXIST2024': 'sexist'}.get(dataset_name)
        f1_positive = 0
        if positive_label is not None and positive_label in per_label_metrics:
            f1_positive = per_label_metrics[positive_label]['f1-score']

        baseline_row['Precision Macro'] = baseline_data.get('precision_macro', 0)
        baseline_row['Recall Macro'] = baseline_data.get('recall_macro', 0)
        baseline_row['F1 Macro'] = baseline_data.get('macro_f1', 0)
        baseline_row['F1 Positive Class'] = f1_positive

    else:  # multilabel
        # Report label -> column prefix, in output column order
        if dataset_name == 'MAMI':
            label_mapping = {
                'non-misogynous': 'Non-misogyny',
                'shaming': 'Shaming',
                'stereotype': 'Stereotype',
                'objectification': 'Objectification',
                'violence': 'Violence'
            }
        else:  # EXIST2024
            label_mapping = {
                'non-sexist': 'Non-sexist',
                'ideological-inequality': 'Ideological-inequality',
                'stereotyping-dominance': 'Stereotyping-dominance',
                'objectification': 'Objectification',
                'sexual-violence': 'Sexual-violence',
                'misogyny-non-sexual-violence': 'Misogyny-non-sexual-violence'
            }

        # Add metrics for each label present in the report
        for label, mapped_label in label_mapping.items():
            if label in per_label_metrics:
                metrics = per_label_metrics[label]
                baseline_row[f'{mapped_label} P'] = metrics.get('precision', 0)
                baseline_row[f'{mapped_label} R'] = metrics.get('recall', 0)
                baseline_row[f'{mapped_label} F1'] = metrics.get('f1-score', 0)

        # Add macro average
        if 'macro avg' in per_label_metrics:
            macro_metrics = per_label_metrics['macro avg']
            baseline_row['Macro average P'] = macro_metrics.get('precision', 0)
            baseline_row['Macro average R'] = macro_metrics.get('recall', 0)
            baseline_row['Macro average F1'] = macro_metrics.get('f1-score', 0)

    # Baseline always goes first in the exported table
    results_list.insert(0, baseline_row)


def collect_random_including_results(dataset_name):
    """
    Collect random-word ablation results, excluding function word random ablations.

    Scans evaluation/results/binary/SVM for files named
    random_including_*<dataset>*results.json and builds one row per file whose
    name mentions sentiment_pos, sentiment_neg or hate. Files mentioning
    'function' are skipped with a message.

    Parameters:
    - dataset_name: 'MAMI' or 'EXIST2024' (selects the positive-class label)

    Returns:
    - list of row dicts ('Experiment Category' = 'Random Words'), in sorted
      file-path order. The method is 'Remove' if the filename contains
      'remove', otherwise 'Placeholder'.
    """
    results_dir = "evaluation/results/binary/SVM"
    pattern = f"{results_dir}/random_including_*{dataset_name.lower()}*results.json"

    # Name of the positive class in per_label_metrics for each dataset
    positive_label = {'MAMI': 'misogynous', 'EXIST2024': 'sexist'}.get(dataset_name)

    random_results = []

    # glob gives no ordering guarantee; sort so row order is reproducible.
    for file_path in sorted(glob.glob(pattern)):
        filename = os.path.basename(file_path)

        # Extract base feature type
        if 'sentiment_pos' in filename:
            base_feature = 'Pos_Sentiment'
        elif 'sentiment_neg' in filename:
            base_feature = 'Neg_Sentiment'
        elif 'function' in filename:
            # Skip function word random ablations
            print(f"   ⏭️ Skipping function word random ablation: {filename}")
            continue
        elif 'hate' in filename:
            base_feature = 'Hate'
        else:
            continue

        data = load_json_results(file_path)
        if not data:
            continue

        # Get positive class F1 (0 when unavailable)
        f1_positive = 0
        per_label = data.get('per_label_metrics', {})
        if positive_label is not None and positive_label in per_label:
            f1_positive = per_label[positive_label]['f1-score']

        random_results.append({
            'Dataset': dataset_name,
            'Experiment Category': 'Random Words',
            'Feature Type': base_feature,
            'Ablation Method': 'Remove' if 'remove' in filename else 'Placeholder',
            'Precision Macro': data.get('precision_macro', 0),
            'Recall Macro': data.get('recall_macro', 0),
            'F1 Macro': data.get('macro_f1', 0),
            'F1 Positive Class': f1_positive
        })

    return random_results







def collect_coarse_grained_multilabel_results(dataset_name):
    """
    Collect coarse-grained multilabel classification results.

    Scans evaluation/results/multi-label/SVM/coarse-grained for
    *_hierarchy_results.json files of the given dataset, skips baseline files,
    and builds one row per file whose name yields both a feature type and a
    method (see extract_coarse_grained_feature_info).

    Parameters:
    - dataset_name: 'EXIST2024' / 'EXIST' select the EXIST patterns; any other
      value selects the MAMI patterns. The MAMI label set is used only when
      dataset_name == 'MAMI'.

    Returns:
    - list of row dicts with per-label and macro-average P/R/F1, in sorted
      file-path order.
    """
    results_dir = "evaluation/results/multi-label/SVM/coarse-grained"

    # Search patterns for coarse-grained multilabel results
    if dataset_name in ["EXIST2024", "EXIST"]:
        patterns = [
            f"{results_dir}/*exist*_hierarchy_results.json",
            f"{results_dir}/*EXIST2024*_hierarchy_results.json"
        ]
    else:  # MAMI
        patterns = [
            f"{results_dir}/*mami*_hierarchy_results.json",
            f"{results_dir}/*MAMI*_hierarchy_results.json"
        ]

    # De-duplicate, then sort: neither glob nor set iteration gives a stable
    # order, and row order should be reproducible between runs.
    found_files = sorted({path for pattern in patterns for path in glob.glob(pattern)})

    # Report label -> column prefix, in output column order
    if dataset_name == 'MAMI':
        label_mapping = {
            'non-misogynous': 'Non-misogyny',
            'shaming': 'Shaming',
            'stereotype': 'Stereotype',
            'objectification': 'Objectification',
            'violence': 'Violence'
        }
    else:  # EXIST2024
        label_mapping = {
            'non-sexist': 'Non-sexist',
            'ideological-inequality': 'Ideological-inequality',
            'stereotyping-dominance': 'Stereotyping-dominance',
            'objectification': 'Objectification',
            'sexual-violence': 'Sexual-violence',
            'misogyny-non-sexual-violence': 'Misogyny-non-sexual-violence'
        }

    coarse_multilabel_results = []

    for file_path in found_files:
        filename = os.path.basename(file_path)

        # Skip baseline files
        if 'baseline' in filename.lower():
            continue

        # Extract feature info for coarse-grained categories
        feature_type, method = extract_coarse_grained_feature_info(filename, dataset_name)
        if not (feature_type and method):
            continue

        data = load_json_results(file_path)
        if not data:
            continue

        per_label_metrics = data.get('per_label_metrics', {})

        result_row = {
            'Dataset': dataset_name,
            'Experiment Category': 'Coarse-Grained Feature Ablation',
            'Feature Type': feature_type,
            'Ablation Method': method,
        }

        # Add precision, recall, F1 for each label present in the report
        for label, mapped_label in label_mapping.items():
            if label in per_label_metrics:
                metrics = per_label_metrics[label]
                result_row[f'{mapped_label} P'] = metrics.get('precision', 0)
                result_row[f'{mapped_label} R'] = metrics.get('recall', 0)
                result_row[f'{mapped_label} F1'] = metrics.get('f1-score', 0)

        # Add macro average
        if 'macro avg' in per_label_metrics:
            macro_metrics = per_label_metrics['macro avg']
            result_row['Macro average P'] = macro_metrics.get('precision', 0)
            result_row['Macro average R'] = macro_metrics.get('recall', 0)
            result_row['Macro average F1'] = macro_metrics.get('f1-score', 0)

        coarse_multilabel_results.append(result_row)

    return coarse_multilabel_results


def extract_coarse_grained_feature_info(filename, dataset_name):
    """
    Extract feature type and method from coarse-grained multilabel filename.

    Focuses on the 4 coarse-grained categories: sentiment_pos, sentiment_neg, hate, function

    Parameters:
    - filename: Results file name
    - dataset_name: Dataset name; '<dataset_name>_' is stripped from the name

    Returns:
    - (feature_type, method): feature_type is 'CG_Pos_Sentiment',
      'CG_Neg_Sentiment', 'CG_Hate', 'CG_Function' or None; method is
      'Placeholder' (name contains 'mask'), 'Remove' (contains 'remove') or
      None. When several markers occur, the first one in the tables below wins.
    """
    # Strip the longer suffix first: removing '_results.json' first would leave
    # '_hierarchy' behind and the '_hierarchy_results.json' replacement could
    # never match.
    basename = filename.replace('_hierarchy_results.json', '').replace('_results.json', '')
    basename = basename.replace(f'{dataset_name}_', '').replace('svm_ablation_', '')
    basename = basename.replace('EXIST2024_', '').replace('MAMI_', '')  # Handle case variations

    # Ordered (substring, label) tables; order defines precedence.
    feature_markers = (
        ('sentiment_pos', 'CG_Pos_Sentiment'),
        ('sentiment_neg', 'CG_Neg_Sentiment'),
        ('hate', 'CG_Hate'),          # general hate category
        ('function', 'CG_Function'),  # general function category
    )
    method_markers = (
        ('mask', 'Placeholder'),
        ('remove', 'Remove'),
    )

    feature_type = next(
        (label for marker, label in feature_markers if marker in basename), None)
    method = next(
        (label for marker, label in method_markers if marker in basename), None)

    return feature_type, method


def export_coarse_grained_multilabel_results_separately():
    """
    Export coarse-grained multilabel results as separate CSV files for MAMI and EXIST2024.

    For each dataset the baseline row (if available) and the coarse-grained
    ablation rows are combined, sorted by feature type and ablation method,
    and written to
    evaluation/exported_results/coarse_grained_multilabel_results_<dataset>.csv
    with three-decimal floats. A dataset with no rows produces a warning and
    no file.

    Returns:
    - None
    """

    print("🔄 Collecting coarse-grained multilabel ablation results for CSV export...")

    datasets = ['MAMI', 'EXIST2024']

    # Create output directory
    output_dir = "evaluation/exported_results"
    os.makedirs(output_dir, exist_ok=True)

    for dataset in datasets:
        print(f"\n📊 Processing coarse-grained multilabel results for {dataset}...")

        dataset_results = []

        # Add baseline first
        add_baseline_results(dataset_results, dataset, 'multilabel')

        # Add coarse-grained feature ablation results
        coarse_results = collect_coarse_grained_multilabel_results(dataset)
        dataset_results.extend(coarse_results)

        print(f"   ✅ Added {len(coarse_results)} coarse-grained multilabel results for {dataset}")

        if not dataset_results:
            print(f"⚠️ No coarse-grained multilabel results found for {dataset}")
            continue

        # Create DataFrame
        df = pd.DataFrame(dataset_results)

        # Sort by feature type, then method. The baseline row has an empty
        # feature type and therefore sorts first. A temporary helper column
        # carries the NaN-free sort key.
        df = (
            df.assign(Feature_Type_Sort=df['Feature Type'].fillna(''))
            .sort_values(['Feature_Type_Sort', 'Ablation Method'], na_position='first')
            .drop('Feature_Type_Sort', axis=1)
        )

        # Export to CSV
        csv_path = f"{output_dir}/coarse_grained_multilabel_results_{dataset}.csv"
        df.to_csv(csv_path, index=False, float_format='%.3f')

        print(f"✅ Coarse-grained multilabel results for {dataset} exported to: {csv_path}")
        print(f"   📊 Total rows: {len(dataset_results)}")
        print(f"   📊 Columns: {list(df.columns)}")

        # Show feature types found. The baseline row stores '' (not NaN), so
        # filter on truthiness; notna() alone would list a blank feature type.
        feature_types = [ft for ft in df['Feature Type'].dropna().unique() if ft]
        if len(feature_types) > 0:
            print(f"   🎯 Feature types: {', '.join(feature_types)}")

    print(f"📁 Results saved in: {output_dir}/")
    print("📊 Separate CSV files created for MAMI and EXIST2024")
    print("🎯 Categories: CG_Pos_Sentiment, CG_Neg_Sentiment, CG_Hate, CG_Function")



# Write the coarse-grained multilabel CSV files when this cell is executed.
export_coarse_grained_multilabel_results_separately()

## Fine-grained

In [None]:
def collect_multilabel_results(dataset_name):
    """
    Collect fine-grained multilabel classification results.

    Scans evaluation/results/multi-label/SVM/fine-grained for *_results.json
    files of the given dataset, skips baseline files, and builds one row per
    file whose name yields both a feature type and a method (see
    extract_multilabel_feature_info).

    Parameters:
    - dataset_name: 'EXIST2024' / 'EXIST' select the EXIST patterns; any other
      value selects the MAMI patterns. The MAMI label set is used only when
      dataset_name == 'MAMI'.

    Returns:
    - list of row dicts with per-label and macro-average P/R/F1, in sorted
      file-path order.
    """
    results_dir = "evaluation/results/multi-label/SVM/fine-grained"

    # Search patterns
    if dataset_name in ["EXIST2024", "EXIST"]:
        patterns = [
            f"{results_dir}/*exist*_results.json",
            f"{results_dir}/*EXIST2024*_results.json"
        ]
    else:  # MAMI
        patterns = [
            f"{results_dir}/*mami*_results.json",
            f"{results_dir}/*MAMI*_results.json"
        ]

    # De-duplicate, then sort: neither glob nor set iteration gives a stable
    # order, and row order should be reproducible between runs.
    found_files = sorted({path for pattern in patterns for path in glob.glob(pattern)})

    # Report label -> column prefix, in output column order
    if dataset_name == 'MAMI':
        label_mapping = {
            'non-misogynous': 'Non-misogyny',
            'shaming': 'Shaming',
            'stereotype': 'Stereotype',
            'objectification': 'Objectification',
            'violence': 'Violence'
        }
    else:  # EXIST2024
        label_mapping = {
            'non-sexist': 'Non-sexist',
            'ideological-inequality': 'Ideological-inequality',
            'stereotyping-dominance': 'Stereotyping-dominance',
            'objectification': 'Objectification',
            'sexual-violence': 'Sexual-violence',
            'misogyny-non-sexual-violence': 'Misogyny-non-sexual-violence'
        }

    multilabel_results = []

    for file_path in found_files:
        filename = os.path.basename(file_path)

        # Skip baseline files
        if 'baseline' in filename.lower():
            continue

        # Extract feature info from multilabel filename
        feature_type, method = extract_multilabel_feature_info(filename, dataset_name)
        if not (feature_type and method):
            continue

        data = load_json_results(file_path)
        if not data:
            continue

        per_label_metrics = data.get('per_label_metrics', {})

        result_row = {
            'Dataset': dataset_name,
            'Experiment Category': 'Feature Ablation',
            'Feature Type': feature_type,
            'Ablation Method': method,
        }

        # Add precision, recall, F1 for each label present in the report
        for label, mapped_label in label_mapping.items():
            if label in per_label_metrics:
                metrics = per_label_metrics[label]
                result_row[f'{mapped_label} P'] = metrics.get('precision', 0)
                result_row[f'{mapped_label} R'] = metrics.get('recall', 0)
                result_row[f'{mapped_label} F1'] = metrics.get('f1-score', 0)

        # Add macro average
        if 'macro avg' in per_label_metrics:
            macro_metrics = per_label_metrics['macro avg']
            result_row['Macro average P'] = macro_metrics.get('precision', 0)
            result_row['Macro average R'] = macro_metrics.get('recall', 0)
            result_row['Macro average F1'] = macro_metrics.get('f1-score', 0)

        multilabel_results.append(result_row)

    return multilabel_results

def extract_multilabel_feature_info(filename, dataset_name):
    """Derive the ablated feature type and ablation method from a multilabel result filename.

    The filename is first reduced to a bare experiment tag by removing the
    result-file suffix, the dataset prefix and the ``svm_ablation_`` prefix.
    The tag is then scanned for known substrings.

    Args:
        filename: Name of a result file, e.g. ``MAMI_neg_anger_mask_results.json``.
        dataset_name: Dataset prefix to strip (``f'{dataset_name}_'``). The
            literal prefixes ``EXIST2024_`` and ``MAMI_`` are stripped as well.

    Returns:
        A ``(feature_type, method)`` tuple. ``feature_type`` is one of the
        ``Neg_*`` / ``All_Neg_Emotions`` / ``Func_*`` / ``Hate_*`` labels, or
        ``None`` when nothing matches. ``method`` is ``'Placeholder'`` when the
        tag contains ``mask``, ``'Remove'`` when it contains ``remove``,
        otherwise ``None``.
    """
    # Strip the longer suffix first. Removing '_results.json' first would turn
    # '_hierarchy_results.json' into a dangling '_hierarchy' that the second
    # replace can never match, and that leftover token could then be picked up
    # by the func/hate fallbacks below as if it were a category name.
    basename = filename.replace('_hierarchy_results.json', '').replace('_results.json', '')
    basename = basename.replace(f'{dataset_name}_', '').replace('svm_ablation_', '')
    basename = basename.replace('EXIST2024_', '').replace('MAMI_', '')  # Handle case variations

    def _token_after(marker):
        """Return the '_'-separated token following the first usable `marker` token, else None."""
        parts = basename.split('_')
        for idx, part in enumerate(parts[:-1]):
            if part == marker:
                return parts[idx + 1]
        return None

    # (substring, label) pairs; order matters because the first hit wins.
    emotion_markers = (
        ('all_neg_emotions', 'All_Neg_Emotions'),
        ('neg_sadness', 'Neg_Sadness'),
        ('neg_anger', 'Neg_Anger'),
        ('neg_fear', 'Neg_Fear'),
        ('neg_disgust', 'Neg_Disgust'),
    )
    # Substrings that route a tag into the function-word branch.
    function_word_triggers = (
        'func_', 'auxiliary_verbs', 'conjunctions', 'determiners', 'enumerators',
        'interjections', 'particles', 'prepositions', 'pronouns', 'qualifiers',
    )
    # Inside the function-word branch a bare 'auxiliary' is enough
    # ('auxiliary_verbs' contains it, so one check covers both spellings).
    function_word_markers = (
        ('auxiliary', 'Func_Auxiliary'),
        ('conjunctions', 'Func_Conjunctions'),
        ('determiners', 'Func_Determiners'),
        ('enumerators', 'Func_Enumerators'),
        ('interjections', 'Func_Interjections'),
        ('particles', 'Func_Particles'),
        ('prepositions', 'Func_Prepositions'),
        ('pronouns', 'Func_Pronouns'),
        ('qualifiers', 'Func_Qualifiers'),
    )
    hate_categories = (
        'an', 'asf', 'asm', 'cds', 'ddp', 'ddf', 'dmc', 'is', 'om',
        'or', 'pa', 'pr', 'ps', 'qas', 'rci', 're', 'svp',
    )

    # Feature type extraction for multilabel (more specific categories).
    # The three families are mutually exclusive: the first family whose
    # trigger matches decides the result, even if it ends up with None.
    feature_type = next(
        (label for marker, label in emotion_markers if marker in basename), None
    )
    if feature_type is None:
        if any(trigger in basename for trigger in function_word_triggers):
            feature_type = next(
                (label for marker, label in function_word_markers if marker in basename),
                None,
            )
            if feature_type is None:
                # Fallback for func_ pattern: use whatever token follows 'func'.
                func_category = _token_after('func')
                if func_category is not None:
                    feature_type = f'Func_{func_category.title()}'
        elif 'hate_' in basename:
            feature_type = next(
                (
                    f'Hate_{category.upper()}'
                    for category in hate_categories
                    if f'hate_{category}' in basename
                ),
                None,
            )
            if feature_type is None:
                # Fallback for hate_ pattern: use whatever token follows 'hate'.
                hate_category = _token_after('hate')
                if hate_category is not None:
                    feature_type = f'Hate_{hate_category.upper()}'

    # Method extraction
    if 'mask' in basename:
        method = 'Placeholder'
    elif 'remove' in basename:
        method = 'Remove'
    else:
        method = None

    return feature_type, method






In [None]:
def export_ablation_results_to_csv():
    """Gather all ablation result rows and write them to CSV files.

    Rows are collected for the 'MAMI' and 'EXIST2024' datasets in three
    passes (binary, binary plus random-word ablations, multilabel) and then
    written under ``evaluation/exported_results``:

    * ``binary_ablation_results.csv``
    * ``binary_ablation_results_random.csv``
    * ``multilabel_ablation_results_<dataset>.csv`` (one per dataset with rows)
    * ``multilabel_ablation_results_combined.csv``

    Returns:
        Tuple ``(binary_results, binary_results_with_random,
        multilabel_results_all)`` of the row lists that were exported.
    """
    helper_col = 'Feature_Type_Sort'

    def _sort_with_blank_feature_type(frame, sort_keys):
        # Sort on a temporary copy of 'Feature Type' in which missing values
        # are replaced by '' (so they order ahead of named feature types),
        # then discard the temporary column again.
        keyed = frame.assign(**{helper_col: frame['Feature Type'].fillna('')})
        return keyed.sort_values(sort_keys, na_position='first').drop(columns=helper_col)

    print("🔄 Collecting ablation results for CSV export...")

    datasets = ['MAMI', 'EXIST2024']

    # --- Pass 1: binary rows (baseline followed by feature ablations) ---
    print("\n📊 Collecting binary classification results (baseline + feature ablation)...")
    binary_results = []
    for dataset in datasets:
        print(f"   Processing {dataset}...")

        # Baseline goes in first, ablation rows follow it.
        add_baseline_results(binary_results, dataset, 'binary')
        ablation_rows = collect_binary_results(dataset)
        binary_results += ablation_rows

        print(f"   ✅ Added {len(ablation_rows)} feature ablation results for {dataset}")

    # --- Pass 2: the same binary rows plus the random-word ablation rows ---
    print("\n📊 Collecting binary classification results with random words (excluding function word random ablations)...")
    binary_results_with_random = list(binary_results)  # shallow copy of pass 1
    for dataset in datasets:
        print(f"   Processing random results for {dataset}...")

        random_rows = collect_random_including_results(dataset)
        binary_results_with_random += random_rows

        print(f"   ✅ Added {len(random_rows)} random word results for {dataset} (function word random ablations excluded)")

    # --- Pass 3: multilabel rows, kept separately per dataset ---
    print("\n📊 Collecting multilabel classification results...")
    multilabel_results_by_dataset = {}
    for dataset in datasets:
        print(f"   Processing {dataset}...")

        per_dataset_rows = []
        add_baseline_results(per_dataset_rows, dataset, 'multilabel')
        ablation_rows = collect_multilabel_results(dataset)
        per_dataset_rows += ablation_rows
        multilabel_results_by_dataset[dataset] = per_dataset_rows

        print(f"   ✅ Added {len(ablation_rows)} feature ablation results for {dataset}")

    # Make sure the target directory exists before writing anything.
    output_dir = "evaluation/exported_results"
    os.makedirs(output_dir, exist_ok=True)

    # --- Export: binary (baseline + feature ablation only) ---
    if not binary_results:
        print("\n⚠️ No binary results found to export")
    else:
        binary_df = pd.DataFrame(binary_results).sort_values(
            ['Dataset', 'Feature Type', 'Ablation Method'],
            na_position='first',
        )
        binary_csv_path = f"{output_dir}/binary_ablation_results.csv"
        binary_df.to_csv(binary_csv_path, index=False, float_format='%.3f')

        print(f"\n✅ Binary results (baseline + feature ablation) exported to: {binary_csv_path}")
        print(f"   📊 Total rows: {len(binary_results)}")
        print(f"   📊 Columns: {list(binary_df.columns)}")

    # --- Export: binary with random words ---
    if not binary_results_with_random:
        print("\n⚠️ No binary results with random words found to export")
    else:
        binary_random_df = _sort_with_blank_feature_type(
            pd.DataFrame(binary_results_with_random),
            ['Dataset', helper_col, 'Experiment Category', 'Ablation Method'],
        )
        binary_random_csv_path = f"{output_dir}/binary_ablation_results_random.csv"
        binary_random_df.to_csv(binary_random_csv_path, index=False, float_format='%.3f')

        print(f"\n✅ Binary results with random words exported to: {binary_random_csv_path}")
        print(f"   📊 Total rows: {len(binary_results_with_random)} (function word random ablations excluded)")
        print(f"   📊 Columns: {list(binary_random_df.columns)}")

    # --- Export: multilabel, one file per dataset ---
    multilabel_results_all = []
    for dataset, per_dataset_rows in multilabel_results_by_dataset.items():
        if not per_dataset_rows:
            print(f"\n⚠️ No multilabel results found for {dataset}")
            continue

        multilabel_df = _sort_with_blank_feature_type(
            pd.DataFrame(per_dataset_rows),
            [helper_col, 'Ablation Method'],
        )
        multilabel_csv_path = f"{output_dir}/multilabel_ablation_results_{dataset}.csv"
        multilabel_df.to_csv(multilabel_csv_path, index=False, float_format='%.3f')

        print(f"\n✅ Multilabel results for {dataset} exported to: {multilabel_csv_path}")
        print(f"   📊 Total rows: {len(per_dataset_rows)}")
        print(f"   📊 Columns: {list(multilabel_df.columns)}")

        # Accumulate for the combined file written below.
        multilabel_results_all += per_dataset_rows

    # --- Export: all multilabel rows in a single file (skipped when empty) ---
    if multilabel_results_all:
        combined_multilabel_df = _sort_with_blank_feature_type(
            pd.DataFrame(multilabel_results_all),
            ['Dataset', helper_col, 'Ablation Method'],
        )
        combined_multilabel_csv_path = f"{output_dir}/multilabel_ablation_results_combined.csv"
        combined_multilabel_df.to_csv(combined_multilabel_csv_path, index=False, float_format='%.3f')

        print(f"\n✅ Combined multilabel results exported to: {combined_multilabel_csv_path}")
        print(f"   📊 Total rows: {len(multilabel_results_all)}")

    # --- Summary ---
    print("\n🎉 EXPORT COMPLETED!")
    print(f"📁 Results saved in: {output_dir}/")
    print(f"📊 Binary results (baseline + feature ablation): {len(binary_results)} rows")
    print(f"📊 Binary results with random words: {len(binary_results_with_random)} rows (function word random ablations excluded)")

    for dataset, per_dataset_rows in multilabel_results_by_dataset.items():
        print(f"📊 Multilabel results for {dataset}: {len(per_dataset_rows)} rows")

    print(f"📊 Combined multilabel results: {len(multilabel_results_all)} rows")

    return binary_results, binary_results_with_random, multilabel_results_all

if __name__ == "__main__":
    # Run the export when this code executes as the main module. The three
    # returned row lists (binary, binary + random words, combined multilabel)
    # are bound to module-level names.
    binary_results, binary_results_with_random, multilabel_results = export_ablation_results_to_csv()