# Results Comparison: HEARTS Adaptation vs. Original

**Project:** HEARTS Adaptation - Gender Bias Detection  
**Purpose:** Compare results from the adapted models with those of the original HEARTS replication

This notebook handles:
1. Loading evaluation results from both projects
2. Comparing metrics across models
3. Visualizing performance differences
4. Generating a comparison report


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Set up paths
# NOTE: os.path.abspath('') resolves to the current working directory, so the
# project root is taken to be two levels above the directory the kernel runs in.
project_root = os.path.dirname(os.path.dirname(os.path.abspath('')))
results_dir = os.path.join(project_root, 'results')
# The original HEARTS checkout lives outside this project. The default below is
# machine-specific, so it can be overridden without editing the notebook by
# setting the HEARTS_ORIGINAL_DIR environment variable.
hearts_original_dir = (
    os.environ.get('HEARTS_ORIGINAL_DIR')
    or r'D:\Coursework\Project Replication\HEARTS-Text-Stereotype-Detection-main'
)

print(f"Project root: {project_root}")
print(f"Results directory: {results_dir}")
print(f"Original HEARTS directory: {hearts_original_dir}")


## Load Results

Load evaluation results from both the adapted project and the original HEARTS replication:


In [None]:
def load_classification_reports(results_dir, model_names=None, dataset_name='job_descriptions'):
    """
    Load classification reports for all models

    Reports are read from
    ``<results_dir>/<dataset_name>/classification_report_<model_name>.csv``.
    A status line is printed for every model that is loaded or missing.

    Parameters:
    -----------
    results_dir : str
        Directory containing results
    model_names : list or None
        List of model names to load (if None, auto-detect from the report
        files present in the dataset directory)
    dataset_name : str
        Sub-directory of results_dir that holds the report files

    Returns:
    --------
    results_dict : dict
        Dictionary mapping model names to classification reports
        (pandas DataFrames indexed by the first CSV column). Empty when
        nothing could be loaded.
    """
    prefix = 'classification_report_'
    suffix = '.csv'
    dataset_dir = os.path.join(results_dir, dataset_name)
    results_dict = {}

    if model_names is None:
        model_names = []
        if os.path.isdir(dataset_dir):
            # Strip only the leading prefix and the trailing extension.
            # str.replace would also remove these substrings from the middle
            # of a model name, yielding a name whose file does not exist.
            # Sorting makes the load order independent of os.listdir.
            model_names = sorted(
                f[len(prefix):-len(suffix)]
                for f in os.listdir(dataset_dir)
                if f.startswith(prefix) and f.endswith(suffix)
            )

    for model_name in model_names:
        report_path = os.path.join(dataset_dir, f'{prefix}{model_name}{suffix}')
        if os.path.isfile(report_path):
            results_dict[model_name] = pd.read_csv(report_path, index_col=0)
            print(f"Loaded results for: {model_name}")
        else:
            print(f"Warning: Results not found for {model_name} at {report_path}")

    return results_dict


def extract_metrics_from_report(report_df):
    """
    Extract key metrics from classification report

    Parameters:
    -----------
    report_df : pd.DataFrame
        Classification report with one row per label/average and one column
        per metric

    Returns:
    --------
    metrics : dict
        Dictionary with whichever of these could be found:
        - precision, recall, f1: taken from the 'macro avg' row (NaN for a
          column that is missing from that row)
        - accuracy: taken from the 'accuracy' row
        - balanced_accuracy: taken from the 'macro avg' row when the report
          has a 'balanced_accuracy' column
        Keys whose source row is absent are omitted rather than raising.
    """
    metrics = {}
    has_macro_avg = 'macro avg' in report_df.index

    # Get macro averages
    if has_macro_avg:
        macro_avg = report_df.loc['macro avg']
        metrics['precision'] = macro_avg.get('precision', np.nan)
        metrics['recall'] = macro_avg.get('recall', np.nan)
        metrics['f1'] = macro_avg.get('f1-score', np.nan)

    # Get accuracy. It is read from the 'f1-score' column; presumably the
    # reports come from scikit-learn's classification_report, where the
    # accuracy row repeats one value in every column -- verify against the
    # evaluation notebooks. Guard the column so a trimmed report cannot raise.
    if 'accuracy' in report_df.index and 'f1-score' in report_df.columns:
        metrics['accuracy'] = report_df.loc['accuracy', 'f1-score']

    # Balanced accuracy is stored on the 'macro avg' row, so both the column
    # and that row must exist before indexing (otherwise .loc raises KeyError).
    if has_macro_avg and 'balanced_accuracy' in report_df.columns:
        metrics['balanced_accuracy'] = report_df.loc['macro avg', 'balanced_accuracy']

    return metrics


# Adapted-project reports: model names are auto-detected inside results_dir
print("Loading adapted project results...")
adapted_results = load_classification_reports(results_dir)

# Original HEARTS reports (optional reference point)
print("\nLoading original HEARTS results...")
# NOTE: point this at wherever the original HEARTS run wrote its outputs
hearts_results_path = os.path.join(
    hearts_original_dir,
    'Model Training and Evaluation',
    'result_output_albertv2',
)
# Nothing is parsed from the HEARTS directory yet, so the mapping stays empty
# either way; only the status message depends on whether the path exists.
hearts_results = {}
if not os.path.exists(hearts_results_path):
    print("Original HEARTS results directory not found - update path if needed")
else:
    # TODO: read the classification reports once the HEARTS layout is known
    print("Original HEARTS results directory found")


## Compare Metrics

Create a comparison table of metrics across models:


In [None]:
def create_comparison_table(adapted_results, hearts_results=None):
    """
    Build one table row per model from both projects' classification reports

    Parameters:
    -----------
    adapted_results : dict
        Model name -> classification report for the adapted project
    hearts_results : dict
        Model name -> classification report for original HEARTS (optional;
        None or an empty dict contributes no rows)

    Returns:
    --------
    comparison_df : pd.DataFrame
        Rows carry 'Project', 'Model' and 'Task' followed by whatever
        extract_metrics_from_report returns for that report. Adapted rows
        come first, HEARTS rows after.
    """
    sources = (
        ('Adapted (Gender Bias)', 'Gender Bias Detection', adapted_results),
        ('Original HEARTS', 'Stereotype Detection', hearts_results if hearts_results else {}),
    )

    records = []
    for project_label, task_label, reports in sources:
        records.extend(
            {
                'Project': project_label,
                'Model': name,
                'Task': task_label,
                **extract_metrics_from_report(report),
            }
            for name, report in reports.items()
        )

    return pd.DataFrame(records)


# Build, display and persist the comparison table
if not adapted_results:
    print("No results found. Please run evaluation notebooks first.")
else:
    comparison_df = create_comparison_table(adapted_results, hearts_results)
    # Banner: blank line, rule, title, rule
    print("\n" + "=" * 80, "RESULTS COMPARISON", "=" * 80, sep="\n")
    print(comparison_df.to_string(index=False))

    # Write the table next to the other results
    comparison_path = os.path.join(results_dir, 'results_comparison.csv')
    comparison_df.to_csv(comparison_path, index=False)
    print(f"\nComparison saved to: {comparison_path}")


In [None]:
def plot_metrics_comparison(comparison_df, save_path=None):
    """
    Draw one bar chart per available metric, with one bar per table row

    Parameters:
    -----------
    comparison_df : pd.DataFrame
        Comparison table; needs a 'Model' column and at least one of
        'precision', 'recall', 'f1' for anything to be drawn
    save_path : str or None
        Where to write the figure (skipped when falsy)

    Returns nothing; prints a message and returns early when there is no
    data or no plottable metric.
    """
    if comparison_df.empty:
        print("No data to plot")
        return

    # Global matplotlib/seaborn styling (applied before the metric check)
    plt.style.use('seaborn-v0_8-darkgrid')
    sns.set_palette("husl")

    # Keep only the metrics that actually exist as columns
    available_metrics = [
        name for name in ('precision', 'recall', 'f1')
        if name in comparison_df.columns
    ]
    if not available_metrics:
        print("No metrics available for plotting")
        return

    # squeeze=False always yields a 2-D grid, so the single row can be
    # iterated the same way whether there is one panel or several
    n_metrics = len(available_metrics)
    _, axes_grid = plt.subplots(
        1, n_metrics, figsize=(6 * n_metrics, 6), squeeze=False
    )

    x_pos = np.arange(len(comparison_df))
    model_labels = comparison_df['Model']

    for ax, metric in zip(axes_grid[0], available_metrics):
        bars = ax.bar(x_pos, comparison_df[metric], alpha=0.7, edgecolor='black')

        # Titles and axes
        ax.set_title(f'{metric.capitalize()} Comparison', fontsize=14, fontweight='bold')
        ax.set_xlabel('Model', fontsize=12)
        ax.set_ylabel(metric.capitalize(), fontsize=12)
        ax.set_xticks(x_pos)
        ax.set_xticklabels(model_labels, rotation=45, ha='right')
        ax.set_ylim(0, 1)
        ax.grid(axis='y', alpha=0.3)

        # Print each bar's value just above its top edge
        for bar in bars:
            height = bar.get_height()
            centre_x = bar.get_x() + bar.get_width() / 2.
            ax.text(centre_x, height, f'{height:.3f}',
                    ha='center', va='bottom', fontsize=10)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Figure saved to: {save_path}")

    plt.show()


# Plot only when the comparison cell has produced a non-empty table
if 'comparison_df' not in locals() or comparison_df.empty:
    print("No comparison data available. Please run the comparison cell above first.")
else:
    plot_path = os.path.join(results_dir, 'metrics_comparison.png')
    plot_metrics_comparison(comparison_df, save_path=plot_path)


## Summary Report

Generate a summary report comparing the adapted project with the original HEARTS replication:


In [None]:
def _model_metric_lines(rows_df, metric_labels):
    """Return the report lines for every model row: a 'Model:' line, then one line per metric that is present and not missing."""
    lines = []
    for _, row in rows_df.iterrows():
        lines.append(f"\nModel: {row['Model']}")
        for column, label in metric_labels:
            # Rows from reports that lacked a metric hold NaN after the
            # DataFrame is assembled; leave those out instead of printing "nan".
            if column in row and pd.notna(row[column]):
                lines.append(f"  {label}: {row[column]:.4f}")
    return lines


def generate_summary_report(comparison_df, output_path=None):
    """
    Generate a text summary report

    Parameters:
    -----------
    comparison_df : pd.DataFrame
        Comparison table with 'Project' and 'Model' columns plus any of the
        metric columns 'precision', 'recall', 'f1', 'balanced_accuracy'
    output_path : str or None
        Path to save report (written as UTF-8; skipped when falsy)

    Returns:
    --------
    report_text : str
        The full report, or a short "no results" message when the table is
        empty (in which case nothing is written to output_path)
    """
    if comparison_df.empty:
        return "No results available for summary."

    rule = "=" * 80
    core_metrics = [
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('f1', 'F1-Score'),
    ]

    report_lines = [
        rule,
        "HEARTS ADAPTATION - RESULTS SUMMARY",
        rule,
        "",
        "Project: Gender Bias Detection in Job Descriptions",
        "SDG Alignment: SDG 5 (Gender Equality) & SDG 8 (Decent Work)",
        "",
        rule,
        "MODEL PERFORMANCE",
        rule,
        "",
    ]

    # Add adapted project results (these additionally list balanced accuracy)
    adapted_df = comparison_df[comparison_df['Project'] == 'Adapted (Gender Bias)']
    if not adapted_df.empty:
        report_lines.append("Adapted Project Results:")
        report_lines.append("-" * 80)
        report_lines.extend(
            _model_metric_lines(
                adapted_df,
                core_metrics + [('balanced_accuracy', 'Balanced Accuracy')],
            )
        )

    # Add original HEARTS results if available
    hearts_df = comparison_df[comparison_df['Project'] == 'Original HEARTS']
    if not hearts_df.empty:
        report_lines.append("\n" + rule)
        report_lines.append("Original HEARTS Results (for reference):")
        report_lines.append("-" * 80)
        report_lines.extend(_model_metric_lines(hearts_df, core_metrics))

    report_lines.append("\n" + rule)
    report_lines.append("END OF REPORT")
    report_lines.append(rule)

    report_text = "\n".join(report_lines)

    if output_path:
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # can fail on model names containing non-ASCII characters.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(report_text)
        print(f"Summary report saved to: {output_path}")

    return report_text


# Write and echo the summary only when a non-empty comparison table exists
if 'comparison_df' not in locals() or comparison_df.empty:
    print("No comparison data available. Please run the comparison cell above first.")
else:
    report_path = os.path.join(results_dir, 'summary_report.txt')
    summary = generate_summary_report(comparison_df, output_path=report_path)
    print(summary)
