# LLM Multilingual Safety Evaluation - Analysis Dashboard

This notebook provides interactive visualizations and analysis for LLM safety evaluation results.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence library warnings so they do not clutter cell output.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# plotting/data libraries; consider narrowing it by category.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship the un-prefixed name. Use whichever one this
# installation provides so the first cell does not fail on a version mismatch.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither style sheet is registered: let seaborn apply its own darkgrid.
    sns.set_style('darkgrid')
sns.set_palette('husl')

In [None]:
# Read the evaluation export into the notebook-wide DataFrame `df`.
# Point `results_file` at your own results CSV before running.
results_file = '../results/evaluation_latest.csv'
df = pd.read_csv(results_file)

# Quick summary of what was loaded: row count, column names, and the
# distinct languages / domains present in the file.
print(f"Loaded {df.shape[0]} evaluation results")
print(f"\nColumns: {list(df.columns)}")
print(f"\nLanguages: {df['language'].unique()}")
print(f"Domains: {df['domain'].unique()}")

## 1. Overall Safety Metrics

In [None]:
# Calculate overall metrics
#
# Compliance score at or above which an evaluation counts as a pass.
# NOTE(review): the gauge below and the radar chart later in this notebook
# plot scores on a 0-100 axis, while this cut-off looks like a 0-1 value.
# Confirm the scale of `compliance_score`; if it is 0-100 this is probably
# meant to be 70.
PASS_THRESHOLD = 0.7

total_evaluations = len(df)
passed_evaluations = int((df['compliance_score'] >= PASS_THRESHOLD).sum())

overall_metrics = {
    'Total Evaluations': total_evaluations,
    'Average Safety Score': df['safety_score'].mean(),
    'Average Compliance Score': df['compliance_score'].mean(),
    'Average Cultural Score': df['cultural_score'].mean(),
    # int(...) keeps these as plain Python ints so they print and serialise
    # exactly like the len()-based counts they replace.
    'Critical Failures': int((df['risk_level'] == 'CRITICAL').sum()),
    'High Risk Count': int((df['risk_level'] == 'HIGH').sum()),
    # Guard the division: an empty results file yields NaN instead of a
    # ZeroDivisionError that would abort the notebook.
    'Pass Rate': (
        passed_evaluations / total_evaluations * 100
        if total_evaluations
        else float('nan')
    ),
}

# Display metrics: a single gauge for the mean safety score, with the delta
# measured against 80 and a red threshold marker at 90.
fig = go.Figure()
fig.add_trace(go.Indicator(
    mode="number+gauge+delta",
    value=overall_metrics['Average Safety Score'],
    domain={'x': [0, 1], 'y': [0, 1]},
    title={'text': "Overall Safety Score"},
    delta={'reference': 80},
    gauge={
        'axis': {'range': [None, 100]},
        'bar': {'color': "darkblue"},
        # Background colour bands of the gauge.
        'steps': [
            {'range': [0, 40], 'color': "lightgray"},
            {'range': [40, 60], 'color': "yellow"},
            {'range': [60, 80], 'color': "lightgreen"},
            {'range': [80, 100], 'color': "green"}],
        'threshold': {
            'line': {'color': "red", 'width': 4},
            'thickness': 0.75,
            'value': 90}}))

fig.update_layout(height=400)
fig.show()

# Display all metrics (floats with two decimals, counts as-is)
for metric, value in overall_metrics.items():
    print(f"{metric}: {value:.2f}" if isinstance(value, float) else f"{metric}: {value}")

## 2. Risk Level Distribution

In [None]:
# Number of evaluations per risk level (also reused by the export cell).
risk_counts = df['risk_level'].value_counts()

# Fixed colour for each known risk level.
colors = {
    'LOW': '#2ecc71',
    'MEDIUM': '#f39c12',
    'HIGH': '#e74c3c',
    'CRITICAL': '#c0392b',
}

# Donut chart: a pie with a hole, one slice per risk level.
risk_level_names = risk_counts.index
fig = px.pie(
    names=risk_level_names,
    values=risk_counts.to_numpy(),
    color=risk_level_names,
    color_discrete_map=colors,
    hole=0.4,
    title='Risk Level Distribution',
)

fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(height=500)
fig.show()

## 3. Performance by Language

In [None]:
# Per-language aggregates: mean/std/count of the safety score, mean compliance
# and cultural scores, and the number of CRITICAL rows.
lang_stats = (
    df.groupby('language')
    .agg({
        'safety_score': ['mean', 'std', 'count'],
        'compliance_score': 'mean',
        'cultural_score': 'mean',
        'risk_level': lambda x: (x == 'CRITICAL').sum(),
    })
    .round(2)
)

# The aggregation yields two-level column labels; join each pair with '_'
# (e.g. 'safety_score_mean') and turn the language index back into a column.
lang_stats.columns = [
    f"{score_name}_{stat_name}".strip()
    for score_name, stat_name in lang_stats.columns
]
lang_stats = lang_stats.reset_index()

# One grouped bar series per score. Only the safety score carries error bars
# (its per-language standard deviation).
bar_series = [
    ('Safety Score', 'safety_score_mean', 'lightblue', 'safety_score_std'),
    ('Compliance Score', 'compliance_score_mean', 'lightgreen', None),
    ('Cultural Score', 'cultural_score_mean', 'lightcoral', None),
]

fig = go.Figure()
for series_label, mean_column, bar_color, std_column in bar_series:
    bar_kwargs = dict(
        name=series_label,
        x=lang_stats['language'],
        y=lang_stats[mean_column],
        marker_color=bar_color,
    )
    if std_column is not None:
        bar_kwargs['error_y'] = dict(type='data', array=lang_stats[std_column])
    fig.add_trace(go.Bar(**bar_kwargs))

fig.update_layout(
    title='Performance Scores by Language',
    xaxis_title='Language',
    yaxis_title='Score',
    barmode='group',
    height=600,
)
fig.show()

# Show the underlying numbers as a table below the chart.
print("\nDetailed Language Statistics:")
display(lang_stats)

## 4. Performance by Domain

In [None]:
# Mean of each score per domain, rounded to two decimals
# (also reused by the export cell).
domain_stats = (
    df.groupby('domain')
    .agg(dict.fromkeys(
        ['safety_score', 'compliance_score', 'cultural_score', 'risk_score'],
        'mean',
    ))
    .round(2)
)

# Radar chart: one filled polygon per score, with the domains as the spokes.
categories = domain_stats.index.tolist()
radar_series = (
    ('safety_score', 'Safety Score'),
    ('compliance_score', 'Compliance Score'),
    ('cultural_score', 'Cultural Score'),
)

fig = go.Figure()
for score_column, trace_label in radar_series:
    fig.add_trace(go.Scatterpolar(
        r=domain_stats[score_column],
        theta=categories,
        fill='toself',
        name=trace_label,
    ))

# The radial axis is fixed to 0-100 so the polygons share one scale.
radial_axis = dict(visible=True, range=[0, 100])
fig.update_layout(
    polar=dict(radialaxis=radial_axis),
    showlegend=True,
    title="Domain Performance Radar Chart",
    height=600,
)

fig.show()

## 5. Risk Heatmap

In [None]:
# Mean risk score for every (language, domain) pair, rounded to two decimals.
risk_pivot = pd.pivot_table(
    df,
    index='language',
    columns='domain',
    values='risk_score',
    aggfunc='mean',
).round(2)

# Heatmap of the pivot. 'RdYlGn_r' is the reversed red-yellow-green scale,
# so higher risk scores are drawn towards red.
heatmap_options = dict(
    title="Risk Score Heatmap by Language and Domain",
    labels=dict(x="Domain", y="Language", color="Risk Score"),
    color_continuous_scale='RdYlGn_r',
    aspect="auto",
    text_auto=True,
)
fig = px.imshow(risk_pivot, **heatmap_options)

fig.update_layout(height=800)
fig.show()

## 6. Safety Score Distribution

In [None]:
# 2x2 grid of safety-score distributions: overall histogram, box plots per
# language, violin plots per domain, and box plots per risk level.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=('Overall Distribution', 'By Language', 'By Domain', 'By Risk Level'),
    vertical_spacing=0.15,
    horizontal_spacing=0.1,
)

safety_scores = df['safety_score']

# Top-left: histogram of all scores.
fig.add_trace(
    go.Histogram(x=safety_scores, nbinsx=30, name='Overall'),
    row=1, col=1,
)

# Top-right: one box per language, in order of first appearance in the data.
for language in df['language'].unique():
    language_scores = safety_scores[df['language'] == language]
    fig.add_trace(go.Box(y=language_scores, name=language), row=1, col=2)

# Bottom-left: one violin per domain.
for domain in df['domain'].unique():
    domain_scores = safety_scores[df['domain'] == domain]
    fig.add_trace(go.Violin(y=domain_scores, name=domain), row=2, col=1)

# Bottom-right: one box per risk level, in fixed severity order; levels that
# do not occur in the data are skipped.
observed_levels = df['risk_level'].values
for risk_level in ['LOW', 'MEDIUM', 'HIGH', 'CRITICAL']:
    if risk_level not in observed_levels:
        continue
    level_scores = safety_scores[df['risk_level'] == risk_level]
    fig.add_trace(go.Box(y=level_scores, name=risk_level), row=2, col=2)

fig.update_layout(title_text="Safety Score Distributions", height=1000, showlegend=False)
fig.show()

## 7. Critical Failures Analysis

In [None]:
# Analyze critical failures
# Work on a copy of the CRITICAL rows so nothing here touches `df`.
critical_failures = df[df['risk_level'] == 'CRITICAL'].copy()

if len(critical_failures) > 0:
    # Critical failures by language and domain: count of rows with a
    # scenario_id per (language, domain) pair, 0 where there are none.
    critical_pivot = critical_failures.pivot_table(
        index='language',
        columns='domain',
        values='scenario_id',
        aggfunc='count',
        fill_value=0
    )

    # Create heatmap of critical failures
    fig = px.imshow(
        critical_pivot,
        labels=dict(x="Domain", y="Language", color="Critical Failures"),
        title="Critical Failures by Language and Domain",
        color_continuous_scale='Reds',
        text_auto=True
    )

    fig.update_layout(height=600)
    fig.show()

    # Display top critical scenarios.
    # Rank by risk_score before truncating; without the sort, "Top 10" would
    # simply be the first ten CRITICAL rows in file order. A stable sort keeps
    # file order among ties.
    # NOTE(review): assumes a higher risk_score means a worse result, which
    # matches the reversed red-green scale used for the risk heatmap - confirm.
    print("\nTop 10 Critical Failure Scenarios:")
    critical_summary = (
        critical_failures
        .sort_values('risk_score', ascending=False, kind='mergesort')
        [['scenario_id', 'language', 'domain', 'risk_score']]
        .head(10)
    )
    display(critical_summary)
else:
    print("No critical failures found!")

## 8. Model Comparison (if multiple models)

In [None]:
# Model comparison only makes sense when the results contain a 'model' column
# with at least two distinct values; otherwise just say so.
if 'model' not in df.columns or df['model'].nunique() <= 1:
    print("Single model evaluation - no comparison available")
else:
    # Mean safety score for every (model, language) pair, as a flat table.
    model_comparison = (
        df.groupby(['model', 'language'])['safety_score']
        .mean()
        .reset_index()
    )

    # One line per model across the languages.
    line_options = dict(
        x='language',
        y='safety_score',
        color='model',
        markers=True,
        title='Model Performance Comparison by Language',
    )
    fig = px.line(model_comparison, **line_options)
    fig.update_layout(height=600)
    fig.show()

    # Per-model summary: safety mean/std, mean compliance and cultural scores,
    # and the number of CRITICAL rows.
    per_model_aggregations = {
        'safety_score': ['mean', 'std'],
        'compliance_score': 'mean',
        'cultural_score': 'mean',
        'risk_level': lambda x: (x == 'CRITICAL').sum(),
    }
    model_stats = df.groupby('model').agg(per_model_aggregations).round(2)

    print("\nModel Performance Summary:")
    display(model_stats)

## 9. Temporal Analysis (if timestamps available)

In [None]:
# Temporal view, only possible when the results carry a 'timestamp' column.
if 'timestamp' not in df.columns:
    print("No timestamp data available for temporal analysis")
else:
    # Note: this rewrites `timestamp` as datetimes and adds an `hour` column
    # on the shared `df`, so both remain visible to any cell run afterwards.
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour

    # Per hour of day: mean safety score and number of evaluations.
    hourly_stats = (
        df.groupby('hour')
        .agg(
            safety_score=('safety_score', 'mean'),
            scenario_id=('scenario_id', 'count'),
        )
        .reset_index()
    )

    # Dual-axis chart: score as a line on the primary axis, evaluation count
    # as bars on the secondary axis.
    hours = hourly_stats['hour']
    score_trace = go.Scatter(
        x=hours,
        y=hourly_stats['safety_score'],
        name="Avg Safety Score",
        line=dict(color="blue"),
    )
    count_trace = go.Bar(
        x=hours,
        y=hourly_stats['scenario_id'],
        name="Evaluation Count",
        marker_color="lightgray",
    )

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(score_trace, secondary_y=False)
    fig.add_trace(count_trace, secondary_y=True)

    fig.update_xaxes(title_text="Hour of Day")
    fig.update_yaxes(title_text="Safety Score", secondary_y=False)
    fig.update_yaxes(title_text="Count", secondary_y=True)
    fig.update_layout(title="Evaluation Performance Over Time", height=500)

    fig.show()

## 10. Export Analysis Summary

In [None]:
# Create analysis summary
import json

# Count critical failures straight from `df` instead of probing locals() for
# a variable left behind by the critical-failures cell. The old fallback
# silently wrote 0 whenever that cell had not been run, even if the data
# contained CRITICAL rows.
critical_failure_count = int((df['risk_level'] == 'CRITICAL').sum())

# Everything below except the count above is taken from variables built in
# earlier cells (overall_metrics, risk_counts, lang_stats, domain_stats), so
# those cells must have been run first.
analysis_summary = {
    # Time at which this summary was generated.
    'evaluation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_evaluations': len(df),
    'languages_tested': df['language'].unique().tolist(),
    'domains_tested': df['domain'].unique().tolist(),
    'overall_metrics': overall_metrics,
    'risk_distribution': risk_counts.to_dict(),
    'language_performance': lang_stats.to_dict('records'),
    'domain_performance': domain_stats.to_dict(),
    'critical_failure_count': critical_failure_count
}

# Save summary
# default=str stringifies any value json cannot encode natively; the explicit
# encoding keeps the file independent of the platform default.
summary_path = '../results/analysis_summary.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(analysis_summary, f, indent=2, default=str)

print(f"Analysis summary saved to {summary_path}")
print("\nAnalysis complete!")