# NER Model Evaluation Results Visualization

This notebook provides comprehensive visualizations for the NER model evaluation results, comparing TITLE and NO-TITLE models across different datasets.

## Visualizations Included:
1. **Performance Comparison Charts** - F1, Precision, Recall comparisons
2. **Model Efficiency Analysis** - Inference time and model size comparisons
3. **Dataset-specific Performance** - Detailed breakdown by dataset
4. **Aggregated Statistics** - Summary statistics and trends
5. **Radar Charts** - Multi-dimensional performance comparison
6. **Statistical Analysis** - Confidence intervals for the mean of each metric

In [15]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from pathlib import Path
import warnings
from datetime import datetime
import os

# Configure plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Set up paths.
# The project root defaults to the original checkout location; set the
# NER_PROJECT_ROOT environment variable to run the notebook from another machine.
PROJECT_ROOT = Path(os.environ.get('NER_PROJECT_ROOT', '/home/daniel-dorigo/Desktop/NER-proper-names'))
RESULTS_PATH = PROJECT_ROOT / 'evaluation_results'
CHARTS_PATH = PROJECT_ROOT / 'charts'
# parents=True: do not fail when the project directory itself does not exist yet.
CHARTS_PATH.mkdir(parents=True, exist_ok=True)

print("✅ Libraries imported successfully!")
print(f"📊 Charts will be saved to: {CHARTS_PATH}")

✅ Libraries imported successfully!
📊 Charts will be saved to: /home/daniel-dorigo/Desktop/NER-proper-names/charts


In [16]:
# Load the Latest Evaluation Results
def load_latest_results():
    """Read the newest detailed-evaluation CSV found in ``RESULTS_PATH``.

    "Newest" is the file with the greatest modification time among those
    matching ``detailed_evaluation_results_*.csv``. After loading, a short
    summary (row count, model types, datasets) is printed.

    Returns:
        tuple: ``(frame, stem)`` -- the loaded DataFrame and the CSV file
        name without its extension.

    Raises:
        FileNotFoundError: If no matching CSV exists in ``RESULTS_PATH``.
    """
    candidates = [path for path in RESULTS_PATH.glob("detailed_evaluation_results_*.csv")]
    if len(candidates) == 0:
        raise FileNotFoundError("No evaluation results found!")

    def _modified_at(path):
        # Modification time is the recency criterion
        return path.stat().st_mtime

    newest = max(candidates, key=_modified_at)
    print(f"📂 Loading results from: {newest.name}")

    frame = pd.read_csv(newest)
    print(f"📊 Loaded {len(frame)} evaluation records")
    print(f"📈 Models: {frame['model_type'].unique()}")
    print(f"📋 Datasets: {frame['dataset'].unique()}")

    return frame, newest.stem

# Load data: the newest results CSV plus its file stem. The stem (not a bare
# timestamp, despite the variable name) is embedded in every chart/report
# file name written by the cells below.
df_results, results_timestamp = load_latest_results()

# Display basic info
print("\n🔍 Data Overview:")
print(f"  Shape: {df_results.shape}")
print(f"  Columns: {list(df_results.columns)}")
print("\n📊 Sample Data:")
# Columns shown in the preview table
display_cols = ['dataset', 'model_type', 'person_f1', 'person_precision', 'person_recall', 'token_accuracy']
# Trailing bare expression so the notebook renders the preview as a table
df_results[display_cols].head()

📂 Loading results from: detailed_evaluation_results_20250611_004238.csv
📊 Loaded 8 evaluation records
📈 Models: ['TITLE' 'NO-TITLE']
📋 Datasets: ['conllpp_train' 'KAGGLE' 'ritter' 'WNUT']

🔍 Data Overview:
  Shape: (8, 12)
  Columns: ['dataset', 'model_type', 'num_samples', 'inference_time_seconds', 'inference_time_per_sample', 'person_precision', 'person_recall', 'person_f1', 'entity_f1_macro', 'token_accuracy', 'timestamp', 'model_size_mb']

📊 Sample Data:


Unnamed: 0,dataset,model_type,person_f1,person_precision,person_recall,token_accuracy
0,conllpp_train,TITLE,0.883776,0.838224,0.934564,0.988447
1,KAGGLE,TITLE,0.719136,0.647808,0.808116,0.964158
2,ritter,TITLE,0.245322,0.144608,0.808219,0.930461
3,WNUT,TITLE,0.377111,0.266578,0.644231,0.947684
4,conllpp_train,NO-TITLE,0.894456,0.876106,0.913591,0.991795


In [17]:
# Calculate Aggregated Statistics
def calculate_aggregated_stats(df):
    """Collapse per-dataset evaluation rows into one summary row per model type.

    Args:
        df: DataFrame with the columns ``model_type``, ``num_samples``,
            ``person_f1``, ``person_precision``, ``person_recall``,
            ``token_accuracy``, ``inference_time_per_sample`` and
            ``model_size_mb``.

    Returns:
        pandas.DataFrame: One row per distinct ``model_type`` (in order of
        first appearance) holding the row count, the summed sample count,
        mean/std of each metric (plus min/max for ``person_f1``), the mean
        per-sample inference time multiplied by 1000, and the
        ``model_size_mb`` of the first row for that model.
    """
    # Which reductions to apply to each metric column, in output order
    reducers = {
        'person_f1': ('mean', 'std', 'min', 'max'),
        'person_precision': ('mean', 'std'),
        'person_recall': ('mean', 'std'),
        'token_accuracy': ('mean', 'std'),
    }

    summary_rows = []
    for label in df['model_type'].unique():
        subset = df[df['model_type'] == label]

        row = {
            'model_type': label,
            'num_datasets': len(subset),
            'total_samples': subset['num_samples'].sum(),
        }
        for column, reductions in reducers.items():
            for reduction in reductions:
                row[f'{column}_{reduction}'] = getattr(subset[column], reduction)()

        # Seconds -> milliseconds
        row['avg_inference_time_per_sample'] = subset['inference_time_per_sample'].mean() * 1000
        row['model_size_mb'] = subset['model_size_mb'].iloc[0]
        summary_rows.append(row)

    return pd.DataFrame(summary_rows)

# One summary row per model type; later chart cells read this global
df_aggregated = calculate_aggregated_stats(df_results)
print("📈 Aggregated Statistics:")
# Trailing bare expression so the notebook renders the summary table
df_aggregated

📈 Aggregated Statistics:


Unnamed: 0,model_type,num_datasets,total_samples,person_f1_mean,person_f1_std,person_f1_min,person_f1_max,person_precision_mean,person_precision_std,person_recall_mean,person_recall_std,token_accuracy_mean,token_accuracy_std,avg_inference_time_per_sample,model_size_mb
0,TITLE,4,7104,0.556336,0.295845,0.245322,0.883776,0.474304,0.323724,0.798783,0.119022,0.957687,0.024694,44.530326,484.540474
1,NO-TITLE,4,7104,0.58432,0.284219,0.283619,0.894456,0.512445,0.325427,0.781903,0.130244,0.968749,0.021517,46.470687,484.528656


In [18]:
# 1. Performance Comparison Bar Charts
def create_performance_comparison():
    """Build a 2x2 grid of bar charts, one panel per metric.

    Reads the module-level ``df_results``. Each panel shows the metric per
    dataset with one bar series per model type; only the series in the
    top-left panel contribute legend entries. Every y-axis is fixed to
    ``[0, 1]``.

    Returns:
        plotly.graph_objects.Figure: The assembled figure.
    """
    # Grid position -> metric column plotted there
    panels = {
        (1, 1): 'person_f1',
        (1, 2): 'person_precision',
        (2, 1): 'person_recall',
        (2, 2): 'token_accuracy',
    }
    palette = {'TITLE': '#1f77b4', 'NO-TITLE': '#ff7f0e'}

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('F1 Score by Dataset', 'Precision by Dataset',
                        'Recall by Dataset', 'Token Accuracy by Dataset'),
        specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
    )

    for (row, col), metric in panels.items():
        in_first_panel = (row == 1 and col == 1)
        for model_type in df_results['model_type'].unique():
            rows_for_model = df_results[df_results['model_type'] == model_type]
            bars = go.Bar(
                x=rows_for_model['dataset'],
                y=rows_for_model[metric],
                name=f'{model_type} Model',
                marker_color=palette[model_type],
                showlegend=in_first_panel  # one legend entry per model
            )
            fig.add_trace(bars, row=row, col=col)

    fig.update_layout(
        height=800,
        title_text="NER Model Performance Comparison Across Datasets",
        title_x=0.5,
        showlegend=True,
        template="plotly_white"
    )

    # Same [0, 1] scale on every panel
    for row, col in panels:
        fig.update_yaxes(range=[0, 1], row=row, col=col)

    return fig

# Create and display the chart
performance_fig = create_performance_comparison()
performance_fig.show()

# Save the chart as interactive HTML and as a static PNG; both file names embed
# results_timestamp so they can be matched to the results CSV they came from.
# NOTE(review): static image export presumably needs plotly's image backend (e.g. kaleido) — confirm it is installed.
performance_fig.write_html(CHARTS_PATH / f"performance_comparison_{results_timestamp}.html")
performance_fig.write_image(CHARTS_PATH / f"performance_comparison_{results_timestamp}.png", width=1200, height=800)
print("✅ Performance comparison chart saved!")

✅ Performance comparison chart saved!


In [19]:
# 2. Aggregated Model Comparison
def create_aggregated_comparison():
    """Create aggregated model comparison with error bars.

    Reads the module-level ``df_aggregated`` and draws, for every model
    type, one bar per metric (mean across datasets) with the standard
    deviation as an error bar and a ``mean±std`` text label.

    Returns:
        plotly.graph_objects.Figure: Grouped bar chart of the four metrics.
    """
    metrics = ['person_f1_mean', 'person_precision_mean', 'person_recall_mean', 'token_accuracy_mean']
    metric_names = ['F1 Score', 'Precision', 'Recall', 'Token Accuracy']
    error_metrics = ['person_f1_std', 'person_precision_std', 'person_recall_std', 'token_accuracy_std']

    # Same model colours as the other charts in this notebook; unknown model
    # types fall back to plotly's default colour cycle.
    colors = {'TITLE': '#1f77b4', 'NO-TITLE': '#ff7f0e'}

    fig = go.Figure()

    for model_type in df_aggregated['model_type']:
        model_data = df_aggregated[df_aggregated['model_type'] == model_type].iloc[0]

        values = [model_data[metric] for metric in metrics]
        errors = [model_data[error] for error in error_metrics]

        fig.add_trace(go.Bar(
            x=metric_names,
            y=values,
            error_y=dict(type='data', array=errors, visible=True),
            name=f'{model_type} Model',
            marker_color=colors.get(model_type),
            text=[f'{v:.3f}±{e:.3f}' for v, e in zip(values, errors)],
            textposition='outside'
        ))

    fig.update_layout(
        title="Aggregated Model Performance Comparison (Mean ± Std)",
        xaxis_title="Metrics",
        yaxis_title="Score",
        # Headroom above 1.0 for the error bars and outside text labels
        yaxis=dict(range=[0, 1.1]),
        barmode='group',
        template="plotly_white",
        height=600
    )

    return fig

# Build the chart and render it inline
aggregated_fig = create_aggregated_comparison()
aggregated_fig.show()

# Save the chart as interactive HTML and static PNG; file names embed results_timestamp
aggregated_fig.write_html(CHARTS_PATH / f"aggregated_comparison_{results_timestamp}.html")
aggregated_fig.write_image(CHARTS_PATH / f"aggregated_comparison_{results_timestamp}.png", width=800, height=600)
print("✅ Aggregated comparison chart saved!")

✅ Aggregated comparison chart saved!


In [20]:
# 3. Model Efficiency Analysis
def create_efficiency_analysis():
    """Create model efficiency analysis (speed vs performance).

    Reads the module-level ``df_results`` and draws a bubble chart with one
    marker per (model, dataset) row: x is the per-sample inference time in
    milliseconds, y is the person F1 score, and the marker size is the
    model size in MB divided by 10.

    Returns:
        plotly.graph_objects.Figure: The bubble chart.
    """
    fig = go.Figure()

    colors = {'TITLE': '#1f77b4', 'NO-TITLE': '#ff7f0e'}

    for model_type in df_results['model_type'].unique():
        model_data = df_results[df_results['model_type'] == model_type]

        fig.add_trace(go.Scatter(
            x=model_data['inference_time_per_sample'] * 1000,  # Convert to ms
            y=model_data['person_f1'],
            mode='markers',
            marker=dict(
                size=model_data['model_size_mb'] / 10,  # Scale for visibility
                color=colors[model_type],
                opacity=0.7,
                line=dict(width=2, color='white')
            ),
            name=f'{model_type} Model',
            text=model_data['dataset'],
            # Hover templates cannot do arithmetic (the old
            # "%{marker.size*10}" never evaluated), so the unscaled model
            # size is handed over through customdata instead.
            customdata=model_data['model_size_mb'],
            hovertemplate=(
                '<b>%{text}</b><br>' +
                'F1 Score: %{y:.3f}<br>' +
                'Inference Time: %{x:.1f}ms<br>' +
                'Model Size: %{customdata:.1f}MB<br>' +
                '<extra></extra>'
            )
        ))

    fig.update_layout(
        title="Model Efficiency Analysis: Performance vs Speed",
        xaxis_title="Inference Time per Sample (ms)",
        yaxis_title="F1 Score",
        template="plotly_white",
        height=600,
        annotations=[
            dict(text="Bubble size represents model size",
                 x=0.02, y=0.98, xref="paper", yref="paper",
                 showarrow=False, font=dict(size=12))
        ]
    )

    return fig

# Build the chart and render it inline
efficiency_fig = create_efficiency_analysis()
efficiency_fig.show()

# Save the chart as interactive HTML and static PNG; file names embed results_timestamp
efficiency_fig.write_html(CHARTS_PATH / f"efficiency_analysis_{results_timestamp}.html")
efficiency_fig.write_image(CHARTS_PATH / f"efficiency_analysis_{results_timestamp}.png", width=800, height=600)
print("✅ Efficiency analysis chart saved!")

✅ Efficiency analysis chart saved!


In [21]:
# 4. Radar Chart for Multi-dimensional Comparison
def create_radar_chart():
    """Create radar chart for multi-dimensional model comparison.

    Reads the module-level ``df_aggregated`` and draws one filled polygon
    per model type over six axes: the four mean quality metrics plus two
    derived scores scaled to ``[0, 1]``:

    * Speed: ``1 - time / slowest_time``. The slowest model therefore
      scores 0 and faster models score proportionally higher.
    * Efficiency: ``smallest_size / size`` (a normalised ``1/size``). The
      smallest model scores 1 and larger models score proportionally lower.

    Returns:
        plotly.graph_objects.Figure: The radar chart.
    """
    categories = ['F1 Score', 'Precision', 'Recall', 'Token Accuracy', 'Speed (1-normalized)', 'Efficiency (1/size)']

    # Reference values are the same for every model, so compute them once
    slowest_time = df_aggregated['avg_inference_time_per_sample'].max()
    smallest_size = df_aggregated['model_size_mb'].min()

    fig = go.Figure()

    for model_type in df_aggregated['model_type']:
        model_data = df_aggregated[df_aggregated['model_type'] == model_type].iloc[0]

        # Normalize speed (lower time is better, so invert); guard a zero
        # reference so the axis shows 0 instead of NaN.
        if slowest_time > 0:
            normalized_speed = 1 - (model_data['avg_inference_time_per_sample'] / slowest_time)
        else:
            normalized_speed = 0.0

        # Normalize efficiency as relative 1/size (higher is better). The
        # previous "1 - size / max_size" contradicted the axis label and
        # scored near-identical models at ~0.
        model_size = model_data['model_size_mb']
        normalized_efficiency = smallest_size / model_size if model_size > 0 else 0.0

        values = [
            model_data['person_f1_mean'],
            model_data['person_precision_mean'],
            model_data['person_recall_mean'],
            model_data['token_accuracy_mean'],
            normalized_speed,
            normalized_efficiency
        ]

        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=categories,
            fill='toself',
            name=f'{model_type} Model',
            opacity=0.6
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),
        title="Multi-dimensional Model Comparison (Radar Chart)",
        template="plotly_white",
        height=600
    )

    return fig

# Build the chart and render it inline
radar_fig = create_radar_chart()
radar_fig.show()

# Save the chart as interactive HTML and static PNG; file names embed results_timestamp
radar_fig.write_html(CHARTS_PATH / f"radar_comparison_{results_timestamp}.html")
radar_fig.write_image(CHARTS_PATH / f"radar_comparison_{results_timestamp}.png", width=800, height=600)
print("✅ Radar chart saved!")

✅ Radar chart saved!


In [22]:
# 5. Distribution Analysis
def create_distribution_analysis():
    """Build a 2x2 grid of overlaid histograms, one panel per metric.

    Reads the module-level ``df_results``. Each panel overlays one
    histogram per model type (10 bins requested); colours are taken from a
    two-entry palette by the model's position in order of first appearance.
    Only the top-left panel contributes legend entries.

    Returns:
        plotly.graph_objects.Figure: The assembled figure.
    """
    palette = ['#1f77b4', '#ff7f0e']
    # (metric column, row, col) for each panel
    layout = [
        ('person_f1', 1, 1),
        ('person_precision', 1, 2),
        ('person_recall', 2, 1),
        ('token_accuracy', 2, 2),
    ]

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('F1 Score Distribution', 'Precision Distribution',
                        'Recall Distribution', 'Token Accuracy Distribution'),
        specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
    )

    for metric, row, col in layout:
        in_first_panel = (row == 1 and col == 1)
        for position, model_type in enumerate(df_results['model_type'].unique()):
            rows_for_model = df_results[df_results['model_type'] == model_type]
            histogram = go.Histogram(
                x=rows_for_model[metric],
                name=f'{model_type}',
                opacity=0.7,
                nbinsx=10,
                marker_color=palette[position],
                showlegend=in_first_panel
            )
            fig.add_trace(histogram, row=row, col=col)

    fig.update_layout(
        height=800,
        title_text="Distribution of Performance Metrics Across Datasets",
        title_x=0.5,
        template="plotly_white",
        barmode='overlay'
    )

    return fig

# Build the chart and render it inline
distribution_fig = create_distribution_analysis()
distribution_fig.show()

# Save the chart as interactive HTML and static PNG; file names embed results_timestamp
distribution_fig.write_html(CHARTS_PATH / f"distribution_analysis_{results_timestamp}.html")
distribution_fig.write_image(CHARTS_PATH / f"distribution_analysis_{results_timestamp}.png", width=1200, height=800)
print("✅ Distribution analysis chart saved!")

✅ Distribution analysis chart saved!


In [23]:
# 6. Dataset-specific Performance Heatmap
def create_performance_heatmap():
    """Create a heatmap showing performance across datasets and models.

    Reads the module-level ``df_results`` and draws a 2x2 grid of heatmaps
    (datasets on the y-axis, model types on the x-axis), one per metric,
    with each cell annotated by its value rounded to three decimals.

    All panels share the fixed colour range ``[0, 1]`` so that the single
    colour bar (drawn for the top-right panel only) is valid for every
    panel.

    Returns:
        plotly.graph_objects.Figure: The assembled figure.

    Raises:
        ValueError: From ``DataFrame.pivot`` if ``df_results`` contains
            more than one row for the same (dataset, model_type) pair.
    """
    metrics = ['person_f1', 'person_precision', 'person_recall', 'token_accuracy']

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('F1 Score', 'Precision', 'Recall', 'Token Accuracy'),
        specs=[[{"type": "heatmap"}, {"type": "heatmap"}],
               [{"type": "heatmap"}, {"type": "heatmap"}]]
    )

    positions = [(1, 1), (1, 2), (2, 1), (2, 2)]

    for metric, (row, col) in zip(metrics, positions):
        # Reshape to a datasets x models grid for this metric
        pivot_data = df_results.pivot(index='dataset', columns='model_type', values=metric)

        fig.add_trace(
            go.Heatmap(
                z=pivot_data.values,
                x=pivot_data.columns,
                y=pivot_data.index,
                colorscale='RdYlBu_r',
                # Without a fixed range each panel auto-scales to its own
                # data and the one visible colour bar would be wrong for
                # the other three panels.
                zmin=0,
                zmax=1,
                showscale=(row == 1 and col == 2),
                text=np.round(pivot_data.values, 3),
                texttemplate="%{text}",
                textfont={"size": 10}
            ),
            row=row, col=col
        )

    fig.update_layout(
        height=800,
        title_text="Performance Heatmap: Models vs Datasets",
        title_x=0.5,
        template="plotly_white"
    )

    return fig

# Build the chart and render it inline
heatmap_fig = create_performance_heatmap()
heatmap_fig.show()

# Save the chart as interactive HTML and static PNG; file names embed results_timestamp
heatmap_fig.write_html(CHARTS_PATH / f"performance_heatmap_{results_timestamp}.html")
heatmap_fig.write_image(CHARTS_PATH / f"performance_heatmap_{results_timestamp}.png", width=1200, height=800)
print("✅ Performance heatmap saved!")

✅ Performance heatmap saved!


In [None]:
# 7. Statistical Analysis and Confidence Intervals
def create_statistical_analysis(confidence=0.90):
    """Create statistical analysis with confidence intervals.

    Reads the module-level ``df_results`` and, for each metric, plots the
    mean across datasets per model type with a two-sided Student-t
    confidence interval as asymmetric error bars. The confidence level is
    stated in every subplot title and in the figure title, all derived
    from ``confidence`` so they cannot disagree.

    Args:
        confidence: Confidence level of the intervals, in ``(0, 1)``.
            Defaults to 0.90.

    Returns:
        plotly.graph_objects.Figure: 2x2 grid, one panel per metric.

    Raises:
        ImportError: If scipy is not installed (it is imported lazily so
            the caller can skip this chart).
    """
    from scipy import stats

    def calculate_ci(data):
        """Return ``(lower, upper)`` of the t-interval for the mean of ``data``."""
        n = len(data)
        mean = np.mean(data)
        if n < 2:
            # A single observation has no spread estimate; collapse the
            # interval onto the mean instead of returning NaN bounds.
            return mean, mean
        se = stats.sem(data)
        h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
        return mean - h, mean + h

    ci_label = f'{confidence:.0%}'

    metrics = ['person_f1', 'person_precision', 'person_recall', 'token_accuracy']
    metric_names = ['F1 Score', 'Precision', 'Recall', 'Token Accuracy']

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[f'{name} ({ci_label} CI)' for name in metric_names]
    )

    positions = [(1, 1), (1, 2), (2, 1), (2, 2)]
    colors = {'TITLE': '#1f77b4', 'NO-TITLE': '#ff7f0e'}

    for metric, (row, col) in zip(metrics, positions):
        for model_type in df_results['model_type'].unique():
            model_data = df_results[df_results['model_type'] == model_type]
            data = model_data[metric]

            mean_val = np.mean(data)
            ci_lower, ci_upper = calculate_ci(data)

            fig.add_trace(
                go.Scatter(
                    x=[model_type],
                    y=[mean_val],
                    error_y=dict(
                        type='data',
                        symmetric=False,
                        array=[ci_upper - mean_val],
                        arrayminus=[mean_val - ci_lower]
                    ),
                    mode='markers',
                    marker=dict(size=15, color=colors[model_type]),
                    name=model_type,
                    showlegend=(row == 1 and col == 1),
                    text=f'{mean_val:.3f} [{ci_lower:.3f}, {ci_upper:.3f}]',
                    hovertemplate='%{text}<extra></extra>'
                ),
                row=row, col=col
            )

    fig.update_layout(
        height=800,
        # Previously hard-coded as "95%" although the intervals were 90%
        title_text=f"Statistical Analysis with {ci_label} Confidence Intervals",
        title_x=0.5,
        template="plotly_white"
    )

    return fig

# scipy is imported inside create_statistical_analysis(), so a missing scipy
# surfaces here as ImportError; in that case this chart is skipped (nothing is installed).
try:
    statistical_fig = create_statistical_analysis()
    statistical_fig.show()
    
    # Save the chart as interactive HTML and static PNG; file names embed results_timestamp
    statistical_fig.write_html(CHARTS_PATH / f"statistical_analysis_{results_timestamp}.html")
    statistical_fig.write_image(CHARTS_PATH / f"statistical_analysis_{results_timestamp}.png", width=1200, height=800)
    print("✅ Statistical analysis chart saved!")
except ImportError:
    print("⚠️ Scipy not available. Skipping statistical analysis.")

✅ Statistical analysis chart saved!


In [25]:
# 8. Summary Dashboard
def create_summary_dashboard():
    """Assemble the six-panel overview dashboard.

    Reads the module-level ``df_aggregated`` and ``df_results``. Panels,
    row by row: mean F1 per model, mean inference time per model, F1 box
    plot across datasets, model size, precision-vs-recall scatter, and the
    standard deviation of F1 per model. No trace contributes a legend
    entry.

    Returns:
        plotly.graph_objects.Figure: The 3x2 dashboard.
    """
    from plotly.subplots import make_subplots
    import plotly.graph_objects as go

    palette = {'TITLE': '#1f77b4', 'NO-TITLE': '#ff7f0e'}

    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=('Model Performance Overview', 'Inference Speed Comparison',
                       'Dataset Performance Range', 'Model Size Comparison',
                       'Precision vs Recall', 'Performance Consistency'),
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "box"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    def add_bar_panel(column, name_suffix, text_template, row, col):
        # One single-bar trace per model, labelled with the formatted value
        for model_type in df_aggregated['model_type']:
            summary = df_aggregated[df_aggregated['model_type'] == model_type].iloc[0]
            value = summary[column]
            fig.add_trace(
                go.Bar(
                    x=[model_type],
                    y=[value],
                    name=f'{model_type} {name_suffix}',
                    marker_color=palette[model_type],
                    showlegend=False,
                    text=text_template.format(value),
                    textposition='outside'
                ),
                row=row, col=col
            )

    # 1. Model Performance Overview (F1 Score)
    add_bar_panel('person_f1_mean', 'F1', '{:.3f}', row=1, col=1)

    # 2. Inference Speed Comparison
    add_bar_panel('avg_inference_time_per_sample', 'Speed', '{:.1f}ms', row=1, col=2)

    # 3. Dataset Performance Range (Box plot)
    for model_type in df_results['model_type'].unique():
        per_dataset = df_results[df_results['model_type'] == model_type]
        box = go.Box(
            y=per_dataset['person_f1'],
            name=model_type,
            marker_color=palette[model_type],
            showlegend=False
        )
        fig.add_trace(box, row=2, col=1)

    # 4. Model Size Comparison
    add_bar_panel('model_size_mb', 'Size', '{:.1f}MB', row=2, col=2)

    # 5. Precision vs Recall
    for model_type in df_results['model_type'].unique():
        per_dataset = df_results[df_results['model_type'] == model_type]
        points = go.Scatter(
            x=per_dataset['person_precision'],
            y=per_dataset['person_recall'],
            mode='markers',
            name=model_type,
            marker=dict(size=10, color=palette[model_type]),
            showlegend=False,
            text=per_dataset['dataset'],
            hovertemplate='%{text}<br>Precision: %{x:.3f}<br>Recall: %{y:.3f}<extra></extra>'
        )
        fig.add_trace(points, row=3, col=1)

    # 6. Performance Consistency (Standard Deviation)
    add_bar_panel('person_f1_std', 'Consistency', '{:.3f}', row=3, col=2)

    fig.update_layout(
        height=1200,
        title_text="NER Model Evaluation Dashboard",
        title_x=0.5,
        template="plotly_white"
    )

    # Axis titles, keyed by grid position
    y_titles = {
        (1, 1): "F1 Score",
        (1, 2): "Time (ms)",
        (2, 1): "F1 Score",
        (2, 2): "Size (MB)",
        (3, 1): "Recall",
        (3, 2): "Std Dev",
    }
    for (row, col), label in y_titles.items():
        fig.update_yaxes(title_text=label, row=row, col=col)
    fig.update_xaxes(title_text="Precision", row=3, col=1)

    return fig

# Build the dashboard and render it inline
dashboard_fig = create_summary_dashboard()
dashboard_fig.show()

# Save the dashboard as interactive HTML and static PNG; file names embed results_timestamp
dashboard_fig.write_html(CHARTS_PATH / f"summary_dashboard_{results_timestamp}.html")
dashboard_fig.write_image(CHARTS_PATH / f"summary_dashboard_{results_timestamp}.png", width=1400, height=1200)
print("✅ Summary dashboard saved!")

✅ Summary dashboard saved!


In [26]:
# 9. Generate Summary Report
def generate_summary_report():
    """Generate a comprehensive text summary of the results.

    Reads the module-level ``df_results`` and ``df_aggregated``, builds a
    Markdown-formatted report (coverage, per-model aggregate metrics, the
    best dataset per model by F1, and -- when exactly the TITLE and
    NO-TITLE models are present -- a head-to-head comparison), writes it to
    ``CHARTS_PATH / f"evaluation_report_{results_timestamp}.txt"`` as
    UTF-8, and prints the destination path.

    Returns:
        str: The full report text.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    report = f"""
# NER Model Evaluation Summary Report
Generated: {timestamp}

## Overall Results

### Dataset Coverage:
- Total evaluations: {len(df_results)}
- Unique datasets: {len(df_results['dataset'].unique())}
- Model types evaluated: {', '.join(df_results['model_type'].unique())}

### Key Findings:

"""

    # Per-model aggregate metrics
    for model_type in df_aggregated['model_type']:
        model_data = df_aggregated[df_aggregated['model_type'] == model_type].iloc[0]

        report += f"""
#### {model_type} Model Performance:
- **F1 Score**: {model_data['person_f1_mean']:.3f} ± {model_data['person_f1_std']:.3f} (range: {model_data['person_f1_min']:.3f} - {model_data['person_f1_max']:.3f})
- **Precision**: {model_data['person_precision_mean']:.3f} ± {model_data['person_precision_std']:.3f}
- **Recall**: {model_data['person_recall_mean']:.3f} ± {model_data['person_recall_std']:.3f}
- **Token Accuracy**: {model_data['token_accuracy_mean']:.3f} ± {model_data['token_accuracy_std']:.3f}
- **Model Size**: {model_data['model_size_mb']:.1f} MB
- **Avg Inference Time**: {model_data['avg_inference_time_per_sample']:.1f} ms per sample
- **Datasets Evaluated**: {model_data['num_datasets']}
- **Total Samples**: {model_data['total_samples']:,}
"""

    # Best dataset (highest F1) for each model
    report += "\n### Best Performing Datasets by Model:\n"

    for model_type in df_results['model_type'].unique():
        model_data = df_results[df_results['model_type'] == model_type]
        best_dataset = model_data.loc[model_data['person_f1'].idxmax()]

        report += f"""
**{model_type} Model - Best Dataset: {best_dataset['dataset']}**
- F1 Score: {best_dataset['person_f1']:.3f}
- Precision: {best_dataset['person_precision']:.3f}
- Recall: {best_dataset['person_recall']:.3f}
- Samples: {best_dataset['num_samples']:,}
"""

    # Head-to-head comparison. Require both expected model names, not just
    # "two rows", so differently named models skip this section instead of
    # raising IndexError on the lookups below.
    present_models = set(df_aggregated['model_type'])
    if len(df_aggregated) == 2 and {'TITLE', 'NO-TITLE'} <= present_models:
        title_data = df_aggregated[df_aggregated['model_type'] == 'TITLE'].iloc[0]
        no_title_data = df_aggregated[df_aggregated['model_type'] == 'NO-TITLE'].iloc[0]

        f1_diff = title_data['person_f1_mean'] - no_title_data['person_f1_mean']
        speed_diff = title_data['avg_inference_time_per_sample'] - no_title_data['avg_inference_time_per_sample']

        # Ties resolve to NO-TITLE in both comparisons
        better_f1 = "TITLE" if f1_diff > 0 else "NO-TITLE"
        faster = "TITLE" if speed_diff < 0 else "NO-TITLE"

        # NOTE(review): "Nearly identical" is stated unconditionally and only
        # the TITLE size is printed; the two sizes are not actually compared.
        report += f"""
### Model Comparison Summary:
- **Better F1 Performance**: {better_f1} model ({abs(f1_diff):.3f} points difference)
- **Faster Inference**: {faster} model ({abs(speed_diff):.1f} ms difference)
- **Model Sizes**: Nearly identical (~{title_data['model_size_mb']:.1f} MB)

### Recommendations:
- For highest accuracy: Use {better_f1} model
- For fastest inference: Use {faster} model
- Both models show similar efficiency in terms of size/performance ratio
"""

    report += """
### Generated Visualizations:
The following charts have been saved to the charts directory:
1. Performance Comparison Chart
2. Aggregated Model Comparison
3. Efficiency Analysis (Speed vs Performance)
4. Multi-dimensional Radar Chart
5. Distribution Analysis
6. Performance Heatmap
7. Statistical Analysis with Confidence Intervals
8. Comprehensive Summary Dashboard

All charts are available in both HTML (interactive) and PNG (static) formats.
"""

    # Save the report. The text contains "±", so pin the encoding rather
    # than depend on the platform default.
    report_path = CHARTS_PATH / f"evaluation_report_{results_timestamp}.txt"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"📄 Summary report saved to: {report_path}")
    return report

# Build the report (this also writes it to the charts directory), then echo it under a banner
summary_report = generate_summary_report()
print("\n" + "="*60)
print("SUMMARY REPORT")
print("="*60)
print(summary_report)

📄 Summary report saved to: /home/daniel-dorigo/Desktop/NER-proper-names/charts/evaluation_report_detailed_evaluation_results_20250611_004238.txt

SUMMARY REPORT

# NER Model Evaluation Summary Report
Generated: 2025-06-11 10:58:30

## Overall Results

### Dataset Coverage:
- Total evaluations: 8
- Unique datasets: 4
- Model types evaluated: TITLE, NO-TITLE

### Key Findings:


#### TITLE Model Performance:
- **F1 Score**: 0.556 ± 0.296 (range: 0.245 - 0.884)
- **Precision**: 0.474 ± 0.324
- **Recall**: 0.799 ± 0.119
- **Token Accuracy**: 0.958 ± 0.025
- **Model Size**: 484.5 MB
- **Avg Inference Time**: 44.5 ms per sample
- **Datasets Evaluated**: 4
- **Total Samples**: 7,104

#### NO-TITLE Model Performance:
- **F1 Score**: 0.584 ± 0.284 (range: 0.284 - 0.894)
- **Precision**: 0.512 ± 0.325
- **Recall**: 0.782 ± 0.130
- **Token Accuracy**: 0.969 ± 0.022
- **Model Size**: 484.5 MB
- **Avg Inference Time**: 46.5 ms per sample
- **Datasets Evaluated**: 4
- **Total Samples**: 7,104

### B

In [27]:
# 10. Final Summary and File List
print("🎉 Visualization Pipeline Complete!")
print("=" * 50)

# List every file in the charts directory whose name contains the current
# results stem, i.e. the outputs belonging to this results CSV.
print(f"\n📁 Generated files in {CHARTS_PATH}:")
chart_files = list(CHARTS_PATH.glob(f"*{results_timestamp}*"))
for file_path in sorted(chart_files):
    file_size = file_path.stat().st_size / 1024  # Size in KB
    print(f"  📊 {file_path.name} ({file_size:.1f} KB)")

# Totals, broken down by file extension
print(f"\n📈 Total files generated: {len(chart_files)}")
print(f"📊 Interactive HTML files: {len([f for f in chart_files if f.suffix == '.html'])}")
print(f"🖼️  Static PNG images: {len([f for f in chart_files if f.suffix == '.png'])}")
print(f"📄 Text reports: {len([f for f in chart_files if f.suffix == '.txt'])}")

print(f"\n💡 Next Steps:")
print(f"  1. Open the HTML files in your browser for interactive exploration")
print(f"  2. Use PNG files for presentations or reports")
print(f"  3. Review the text summary report for key insights")
print(f"  4. Share the summary dashboard for a comprehensive overview")

# Paths are rebuilt from the naming pattern used when saving; existence is not checked here
print(f"\n🔗 Quick Access:")
print(f"  Dashboard: {CHARTS_PATH / f'summary_dashboard_{results_timestamp}.html'}")
print(f"  Report: {CHARTS_PATH / f'evaluation_report_{results_timestamp}.txt'}")

🎉 Visualization Pipeline Complete!

📁 Generated files in /home/daniel-dorigo/Desktop/NER-proper-names/charts:
  📊 aggregated_comparison_detailed_evaluation_results_20250611_004238.html (4553.2 KB)
  📊 aggregated_comparison_detailed_evaluation_results_20250611_004238.png (47.8 KB)
  📊 distribution_analysis_detailed_evaluation_results_20250611_004238.html (4555.0 KB)
  📊 distribution_analysis_detailed_evaluation_results_20250611_004238.png (68.0 KB)
  📊 efficiency_analysis_detailed_evaluation_results_20250611_004238.html (4553.8 KB)
  📊 efficiency_analysis_detailed_evaluation_results_20250611_004238.png (48.2 KB)
  📊 evaluation_report_detailed_evaluation_results_20250611_004238.txt (1.9 KB)
  📊 performance_comparison_detailed_evaluation_results_20250611_004238.html (4555.2 KB)
  📊 performance_comparison_detailed_evaluation_results_20250611_004238.png (75.1 KB)
  📊 performance_heatmap_detailed_evaluation_results_20250611_004238.html (4556.2 KB)
  📊 performance_heatmap_detailed_evaluation_