In [1]:
"""
Statistical Significance Testing - Hypothesis Test Visualizations
Google Colab Compatible Code - COMPLETE VERSION WITH ALL 13 MODELS
Including: BERT, RoBERTa, XLNet, GPT2, ERNIE, DNABERT, DNAGPT, Nucleotide Transformer
"""

# Install required packages (if needed in Colab)
!pip install scipy matplotlib numpy -q

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from google.colab import files
import io

# Global matplotlib defaults: 300 dpi for both on-screen and saved figures,
# 11 pt serif text everywhere.
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 11,
    'font.family': 'serif',
})

#==============================================================================
# DATA: ALL 13 MODELS with their t-statistics
#==============================================================================

# Column order of every row in the table below.
#   privacy_t / privacy_p / privacy_mean : passed to plot_hypothesis_test as the
#       t-statistic, p-value and mean difference (the latter is rendered as a %).
#   utility_t / utility_p : stored alongside; presumably the matching utility
#       test results -- not consumed by the plotting code in this notebook.
#   config : label used to group models in the configuration summary.
_MODEL_FIELDS = ('privacy_t', 'privacy_p', 'privacy_mean',
                 'utility_t', 'utility_p', 'config')

_MODEL_ROWS = (
    # ========== General-Purpose Language Models ==========
    ('BERT-base',              (-0.27, 0.785, -0.15, 11.71, 0.001, 'Utility-Dominant')),
    ('BERT-Large',             (13.20, 0.001, 12.8,  18.35, 0.001, 'Dual-Improvements')),
    ('RoBERTa-base',           (-7.24, 0.001, -8.5,  10.82, 0.001, 'Privacy-Utility Tradeoff')),
    ('RoBERTa-Large',          (-6.89, 0.001, -7.9,  11.15, 0.001, 'Privacy-Utility Tradeoff')),
    ('XLNet-base',             (6.85,  0.001, 7.8,   12.94, 0.001, 'Dual-Improvements')),
    ('XLNet-Large',            (16.42, 0.001, 19.5,  17.86, 0.001, 'Dual-Improvements')),
    ('GPT2-Small',             (3.25,  0.004, 3.8,   11.59, 0.001, 'Dual-Improvements')),
    ('GPT2-Medium',            (-7.91, 0.001, -8.7,  10.00, 0.001, 'Privacy-Utility Tradeoff')),
    ('ERNIE 2.0',              (1.46,  0.161, 2.1,   12.04, 0.001, 'Utility-Dominant')),
    # ========== Genomic Foundation Models ==========
    ('DNABERT-base',           (-7.72, 0.001, -9.8,  9.55,  0.001, 'Privacy-Utility Tradeoff')),
    ('DNABERT2',               (-3.04, 0.007, -4.2,  11.27, 0.001, 'Privacy-Utility Tradeoff')),
    ('DNAGPT',                 (9.63,  0.001, 10.5,  15.68, 0.001, 'Dual-Improvements')),
    ('Nucleotide Transformer', (5.63,  0.001, 6.7,   12.53, 0.001, 'Dual-Improvements')),
)

# model name -> {field: value}; insertion order follows the table above.
models_data = {
    name: dict(zip(_MODEL_FIELDS, values))
    for name, values in _MODEL_ROWS
}

# Banner listing the models, numbered continuously across both families.
general_purpose_models = ['BERT-base', 'BERT-Large', 'RoBERTa-base', 'RoBERTa-Large',
                          'XLNet-base', 'XLNet-Large', 'GPT2-Small', 'GPT2-Medium', 'ERNIE 2.0']
genomic_models = ['DNABERT-base', 'DNABERT2', 'DNAGPT', 'Nucleotide Transformer']

print("="*80)
print("COMPLETE MODEL LIST (13 MODELS)")
print("="*80)

print("\nGeneral-Purpose Models (9):")
print("\n".join(f"  {position}. {name}"
                for position, name in enumerate(general_purpose_models, start=1)))

print("\nGenomic Models (4):")
# Genomic numbering starts at 10, right after the nine general-purpose models.
print("\n".join(f"  {position}. {name}"
                for position, name in enumerate(genomic_models, start=10)))

print(f"\nTotal: {len(models_data)} models")
print("="*80)

#==============================================================================
# FUNCTION: Individual Model Hypothesis Test Visualization
#==============================================================================

def _hypothesis_plot_filename(model_name, metric_name):
    """Build the PNG filename for one model/metric pair.

    Spaces and hyphens become underscores and dots are dropped. The metric tag
    is the lower-cased first word of ``metric_name`` ('Privacy Gain' ->
    'privacy'), so the default metric keeps the historical ``*_privacy.png``
    name while other metrics no longer overwrite it.
    """
    def _slug(text):
        return text.replace(" ", "_").replace(".", "").replace("-", "_")

    metric_words = metric_name.split()
    metric_tag = _slug(metric_words[0]).lower() if metric_words else 'metric'
    return f'hypothesis_test_{_slug(model_name)}_{metric_tag}.png'


def plot_hypothesis_test(t_stat, model_name, metric_name='Privacy Gain',
                        p_value=None, mean_value=None,
                        alpha=0.05, df=19, save_fig=True):
    """
    Plot a t-distribution with two-tailed rejection regions and the observed
    t-statistic, and optionally save the figure as a PNG.

    Parameters:
    - t_stat: observed t-statistic
    - model_name: name of the model (used in the title and the filename)
    - metric_name: 'Privacy Gain' or 'Utility Improvement' (used in the title;
      its first word becomes the filename suffix)
    - p_value: p-value for the test; shown in the decision box when given
    - mean_value: mean difference value; shown in the title (as a %) when given
    - alpha: significance level (default 0.05); must lie strictly in (0, 1)
    - df: degrees of freedom (default 19); must be positive
    - save_fig: whether to save the figure to the working directory

    Returns:
    - the matplotlib Figure (left open; the caller is responsible for closing it)

    Raises:
    - ValueError: if alpha is not in (0, 1) or df is not positive
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha!r}")
    if df <= 0:
        raise ValueError(f"df must be positive, got {df!r}")

    # Calculate critical values for two-tailed test
    critical_value = stats.t.ppf(1 - alpha/2, df)

    # Sample the density over the full visible x-range so the curve never
    # stops short of the axis limits when |t_stat| is large.
    x_max = max(20, abs(t_stat) + 3)
    x = np.linspace(-x_max, x_max, 1000)
    y = stats.t.pdf(x, df)
    y_peak = y.max()  # every vertical placement below is a fraction of this

    # Create figure
    fig, ax = plt.subplots(figsize=(12, 7))

    # Plot t-distribution
    ax.plot(x, y, 'b-', linewidth=3, label=f't-distribution (df={df})', zorder=2)

    # Shade rejection regions (two-tailed)
    x_left_reject = x[x <= -critical_value]
    x_right_reject = x[x >= critical_value]
    ax.fill_between(x_left_reject, 0, stats.t.pdf(x_left_reject, df),
                     alpha=0.5, color='red', label=f'Rejection Region (α={alpha})',
                     zorder=1)
    ax.fill_between(x_right_reject, 0, stats.t.pdf(x_right_reject, df),
                     alpha=0.5, color='red', zorder=1)

    # Mark critical values with dashed lines
    ax.axvline(-critical_value, color='darkred', linestyle='--', linewidth=2.5,
               zorder=3)
    ax.axvline(critical_value, color='darkred', linestyle='--', linewidth=2.5,
               zorder=3)

    # Add critical value labels at the top
    y_label_pos = y_peak * 0.95
    critical_label_box = dict(boxstyle='round,pad=0.4', facecolor='white',
                              edgecolor='darkred', linewidth=1.5)
    ax.text(-critical_value, y_label_pos, f'  −{critical_value:.3f}',
            ha='left', va='top', fontsize=12, fontweight='bold',
            color='darkred', bbox=critical_label_box)
    ax.text(critical_value, y_label_pos, f'  +{critical_value:.3f}',
            ha='left', va='top', fontsize=12, fontweight='bold',
            color='darkred', bbox=critical_label_box)

    # Determine if significant and set colors
    if abs(t_stat) > critical_value:
        decision = 'Reject H₀'
        if t_stat > 0:
            color = 'darkgreen'
            result = 'Statistically Significant\n(Positive Effect)'
            box_color = 'lightgreen'
        else:
            color = 'darkred'
            result = 'Statistically Significant\n(Negative Effect/Degradation)'
            box_color = 'lightcoral'
    else:
        color = 'darkorange'
        decision = 'Fail to Reject H₀'
        result = 'Not Statistically Significant'
        box_color = 'lightyellow'

    # Mark observed t-statistic with thick line
    ax.axvline(t_stat, color=color, linewidth=4,
               label=f'Observed t = {t_stat:.2f}', zorder=4)

    # Add observed t-statistic annotation
    observed_box = dict(boxstyle='round,pad=0.6', facecolor=color,
                        alpha=0.2, edgecolor=color, linewidth=2)
    if abs(t_stat) < 12:
        # Moderate values: label above the curve with an arrow down to the axis
        ax.annotate(f'Observed\nt = {t_stat:.2f}',
                   xy=(t_stat, 0.02),
                   xytext=(t_stat, y_peak * 0.6),
                   ha='center', va='bottom',
                   fontsize=12, fontweight='bold', color=color,
                   bbox=observed_box,
                   arrowprops=dict(arrowstyle='->', lw=3, color=color),
                   zorder=5)
    else:
        # For extreme values, place the label beside the line, towards the centre
        if t_stat > 0:
            x_text = t_stat - 2.5
            ha_align = 'right'
        else:
            x_text = t_stat + 2.5
            ha_align = 'left'
        ax.text(x_text, y_peak*0.5, f'Observed\nt = {t_stat:.2f}',
               ha=ha_align, va='center',
               fontsize=12, fontweight='bold', color=color,
               bbox=observed_box)

    # Add decision box in upper right
    decision_text = f'Decision: {decision}\n\n{result}'
    if p_value is not None:
        if p_value < 0.001:
            p_text = 'p < 0.001'
        else:
            p_text = f'p = {p_value:.3f}'
        decision_text += f'\n\n{p_text}'

    ax.text(0.98, 0.98, decision_text,
            transform=ax.transAxes, fontsize=11, fontweight='bold',
            verticalalignment='top', horizontalalignment='right',
            bbox=dict(boxstyle='round,pad=0.8', facecolor=box_color,
                     alpha=0.9, edgecolor='black', linewidth=2.5),
            zorder=6)

    # Add "Reject H₀" labels in rejection regions; the tail probability is
    # derived from alpha so it stays correct for non-default levels.
    tail_label = f'Reject H₀\n(α/2 = {alpha / 2:g})'
    ax.text(-critical_value - 2.5, y_peak*0.15, tail_label,
            ha='center', va='center', fontsize=10, fontweight='bold',
            color='darkred', style='italic')
    ax.text(critical_value + 2.5, y_peak*0.15, tail_label,
            ha='center', va='center', fontsize=10, fontweight='bold',
            color='darkred', style='italic')

    # Add "Fail to Reject H₀" label in center
    ax.text(0, y_peak*0.3, 'Fail to Reject H₀',
            ha='center', va='center', fontsize=11, fontweight='bold',
            color='blue', alpha=0.6, style='italic')

    # Labels and title
    ax.set_xlabel('t-statistic', fontsize=14, fontweight='bold')
    ax.set_ylabel('Probability Density', fontsize=14, fontweight='bold')

    title_text = f'Hypothesis Test: {model_name} - {metric_name}\n'
    title_text += r'$H_0: \mu = 0$ vs. $H_1: \mu \neq 0$ '
    title_text += f'(Two-tailed test, α = {alpha:g})'
    if mean_value is not None:
        title_text += f'\nMean Difference = {mean_value:+.2f}%'

    ax.set_title(title_text, fontsize=15, fontweight='bold', pad=20)

    # Grid and legend
    ax.grid(True, alpha=0.3, linestyle=':', linewidth=1.5, zorder=0)
    ax.legend(loc='upper left', fontsize=11, framealpha=0.95,
             edgecolor='black', fancybox=True, shadow=True)

    # Set axis limits
    ax.set_xlim([-x_max, x_max])
    ax.set_ylim([0, y_peak*1.15])

    # Add background color to non-rejection region
    ax.axvspan(-critical_value, critical_value, alpha=0.1, color='blue', zorder=0)

    fig.tight_layout()

    if save_fig:
        filename = _hypothesis_plot_filename(model_name, metric_name)
        fig.savefig(filename, dpi=300, bbox_inches='tight', facecolor='white')
        print(f"✓ Saved: {filename}")

    return fig

#==============================================================================
# GENERATE INDIVIDUAL PLOTS FOR ALL MODELS
#==============================================================================

# Render and save one privacy-gain hypothesis-test figure per model.
print("\n" + "="*80)
print("GENERATING INDIVIDUAL HYPOTHESIS TEST PLOTS (ALL 13 MODELS)")
print("="*80)

for model_name, data in models_data.items():
    print(f"\nGenerating plot for: {model_name}")
    fig = plot_hypothesis_test(
        t_stat=data['privacy_t'],
        model_name=model_name,
        metric_name='Privacy Gain',
        p_value=data['privacy_p'],
        mean_value=data['privacy_mean'],
        save_fig=True
    )
    # Close exactly the figure that was just saved (instead of whichever
    # figure happens to be "current") so figures do not pile up in memory.
    plt.close(fig)

print("\n" + "="*80)
print("ALL INDIVIDUAL PLOTS GENERATED!")
print(f"Total: {len(models_data)} models")
print("="*80)

#==============================================================================
# FUNCTION: Grid Visualization for All Models (5x3 for 13 models)
#==============================================================================

def plot_hypothesis_tests_grid(models_dict, save_fig=True, alpha=0.05, df=19, ncols=3):
    """
    Create a grid of two-tailed hypothesis-test panels, one per model.

    Each panel shows the t-distribution, the shaded rejection regions, the
    critical values and the model's observed privacy t-statistic.

    Parameters:
    - models_dict: mapping of model name -> dict containing a 'privacy_t' entry
    - save_fig: whether to save the figure as 'hypothesis_tests_all_models_grid.png'
    - alpha: significance level (default 0.05); must lie strictly in (0, 1)
    - df: degrees of freedom (default 19); must be positive
    - ncols: number of grid columns (default 3); rows are added as needed, so
      the default 13-model input yields the original 5x3 layout

    Returns:
    - the matplotlib Figure (left open; the caller is responsible for closing it)

    Raises:
    - ValueError: if alpha is not in (0, 1), df is not positive, or ncols < 1
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha!r}")
    if df <= 0:
        raise ValueError(f"df must be positive, got {df!r}")
    if ncols < 1:
        raise ValueError(f"ncols must be at least 1, got {ncols!r}")

    models = list(models_dict.keys())
    t_stats_privacy = [models_dict[m]['privacy_t'] for m in models]
    n_models = len(models)

    # Enough rows to hold every model (ceil division); at least one row so an
    # empty input still produces a valid, blank figure.
    nrows = max(1, -(-n_models // ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(6 * ncols, 5.2 * nrows),
                             squeeze=False)
    axes = axes.flatten()

    # Two-tailed critical value derived from alpha and df rather than hard-coded
    critical_value = stats.t.ppf(1 - alpha / 2, df)

    # The reference curve is identical in every panel, so compute it once,
    # wide enough to cover the most extreme observed statistic.
    x_span = max([20] + [abs(t) + 3 for t in t_stats_privacy])
    x = np.linspace(-x_span, x_span, 1000)
    y = stats.t.pdf(x, df)
    y_peak = y.max()
    x_left_reject = x[x <= -critical_value]
    x_right_reject = x[x >= critical_value]
    y_left_reject = stats.t.pdf(x_left_reject, df)
    y_right_reject = stats.t.pdf(x_right_reject, df)

    for idx, (ax, model, t_stat) in enumerate(zip(axes, models, t_stats_privacy)):
        # Plot t-distribution
        ax.plot(x, y, 'b-', linewidth=2.5, zorder=2)

        # Shade rejection regions
        ax.fill_between(x_left_reject, 0, y_left_reject,
                         alpha=0.5, color='red', zorder=1)
        ax.fill_between(x_right_reject, 0, y_right_reject,
                         alpha=0.5, color='red', zorder=1)

        # Mark critical values
        ax.axvline(-critical_value, color='darkred', linestyle='--', linewidth=2, zorder=3)
        ax.axvline(critical_value, color='darkred', linestyle='--', linewidth=2, zorder=3)

        # Determine color based on significance
        if abs(t_stat) > critical_value:
            if t_stat > 0:
                color = 'darkgreen'
                decision = 'Reject H₀\n(Significant +)'
            else:
                color = 'darkred'
                decision = 'Reject H₀\n(Significant −)'
        else:
            color = 'darkorange'
            decision = 'Fail to Reject H₀\n(Non-significant)'

        # Mark observed t-statistic
        ax.axvline(t_stat, color=color, linewidth=4, zorder=4)

        # Add model name and t-statistic at top
        ax.text(0.5, 0.97, f'{model}\nt = {t_stat:.2f}',
                transform=ax.transAxes, ha='center', va='top',
                fontsize=10, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.5', facecolor='white',
                         edgecolor=color, linewidth=2.5, alpha=0.95),
                zorder=5)

        # Add decision at bottom
        ax.text(0.5, 0.05, decision,
                transform=ax.transAxes, ha='center', va='bottom',
                fontsize=8, fontweight='bold', color=color,
                bbox=dict(boxstyle='round,pad=0.4', facecolor=color,
                         alpha=0.2, edgecolor=color, linewidth=2),
                zorder=5)

        # Formatting: each panel zooms to its own statistic
        x_lim = max(20, abs(t_stat) + 3)
        ax.set_xlim([-x_lim, x_lim])
        ax.set_ylim([0, y_peak*1.2])
        ax.set_xlabel('t-statistic', fontsize=9, fontweight='bold')
        ax.set_ylabel('Density', fontsize=9, fontweight='bold')
        ax.grid(True, alpha=0.3, linestyle=':', linewidth=1)

        # Add critical value annotations on first plot only
        if idx == 0:
            critical_label_box = dict(boxstyle='round,pad=0.3', facecolor='white',
                                      edgecolor='darkred', linewidth=1)
            ax.text(-critical_value, y_peak*1.1, f'−{critical_value:.2f}',
                    ha='center', va='bottom', fontsize=8, color='darkred',
                    fontweight='bold', bbox=critical_label_box)
            ax.text(critical_value, y_peak*1.1, f'+{critical_value:.2f}',
                    ha='center', va='bottom', fontsize=8, color='darkred',
                    fontweight='bold', bbox=critical_label_box)

    # Hide the unused subplots
    for unused_ax in axes[n_models:]:
        unused_ax.axis('off')

    # Super title reflects the actual model count and test settings
    fig.suptitle(f'Hypothesis Testing: Privacy Gain Across All Models ({n_models} Models)\n' +
                r'Two-tailed t-test: $H_0: \mu = 0$ vs. $H_1: \mu \neq 0$ ' +
                f'(α = {alpha:g}, df = {df})',
                fontsize=16, fontweight='bold', y=0.996)

    fig.tight_layout(rect=[0, 0, 1, 0.994])

    if save_fig:
        filename = 'hypothesis_tests_all_models_grid.png'
        fig.savefig(filename, dpi=300, bbox_inches='tight', facecolor='white')
        print(f"✓ Saved: {filename}")

    return fig

#==============================================================================
# GENERATE GRID PLOT
#==============================================================================

# Render and save the combined grid figure for every model.
print("\n" + "="*80)
print("GENERATING GRID VISUALIZATION (5x3 for 13 models)")
print("="*80)

fig_grid = plot_hypothesis_tests_grid(models_data, save_fig=True)
# Close the grid figure explicitly rather than relying on it still being the
# "current" pyplot figure.
plt.close(fig_grid)

print("\n" + "="*80)
print("GRID PLOT GENERATED!")
print("="*80)

#==============================================================================
# SUMMARY
#==============================================================================

# Recap of the PNG files written by the plotting cells above.
print("\n" + "="*80)
print("SUMMARY OF GENERATED FILES")
print("="*80)

print("\nIndividual Model Plots:")
for i, model in enumerate(models_data, start=1):
    # Same name mangling as the saver: space/hyphen -> underscore, dots removed.
    safe_name = model.translate(str.maketrans({" ": "_", ".": None, "-": "_"}))
    filename = f'hypothesis_test_{safe_name}_privacy.png'
    print(f"{i:2d}. {filename}")

# One file per model plus the single grid figure.
total_files = len(models_data) + 1

print("\nGrid Plot:")
print(f"{total_files}. hypothesis_tests_all_models_grid.png")

print("\n" + "="*80)
print("ALL VISUALIZATIONS COMPLETE!")
print(f"Total files generated: {total_files}")
print("Files are ready for download from Colab")
print("="*80)

#==============================================================================
# CONFIGURATION SUMMARY
#==============================================================================

# Group the models by their 'config' label and print each group with the
# models' privacy t-statistics.
print("\n" + "="*80)
print("CONFIGURATION TYPE SUMMARY")
print("="*80)

config_counts = {}
for model, data in models_data.items():
    # setdefault creates the list the first time a config label is seen
    config_counts.setdefault(data['config'], []).append(model)

# Labels are printed alphabetically; models keep their models_data order.
for config in sorted(config_counts):
    models = config_counts[config]
    print(f"\n{config} ({len(models)} models):")
    for model in models:
        t_stat = models_data[model]['privacy_t']
        print(f"  - {model:30s} (t = {t_stat:+6.2f})")

print("\n" + "="*80)

#==============================================================================
# DOWNLOAD ALL FILES
#==============================================================================

# Nothing is downloaded here: this section only prints a ready-to-paste
# snippet that the user is told to run in the next cell.
print("\n" + "="*80)
print("READY TO DOWNLOAD FILES")
print("="*80)
print("\nRun the following code in the next cell to download all files:")
# The triple-quoted text below is emitted verbatim; it is not executed by this cell.
print("""
from google.colab import files
import os

# Get all PNG files
png_files = [f for f in os.listdir('.') if f.endswith('.png') and 'hypothesis' in f]
print(f"Downloading {len(png_files)} files...")

# Download each file
for filename in png_files:
    files.download(filename)

print("All files downloaded!")
""")

COMPLETE MODEL LIST (13 MODELS)

General-Purpose Models (9):
  1. BERT-base
  2. BERT-Large
  3. RoBERTa-base
  4. RoBERTa-Large
  5. XLNet-base
  6. XLNet-Large
  7. GPT2-Small
  8. GPT2-Medium
  9. ERNIE 2.0

Genomic Models (4):
  10. DNABERT-base
  11. DNABERT2
  12. DNAGPT
  13. Nucleotide Transformer

Total: 13 models

GENERATING INDIVIDUAL HYPOTHESIS TEST PLOTS (ALL 13 MODELS)

Generating plot for: BERT-base
✓ Saved: hypothesis_test_BERT_base_privacy.png

Generating plot for: BERT-Large
✓ Saved: hypothesis_test_BERT_Large_privacy.png

Generating plot for: RoBERTa-base
✓ Saved: hypothesis_test_RoBERTa_base_privacy.png

Generating plot for: RoBERTa-Large
✓ Saved: hypothesis_test_RoBERTa_Large_privacy.png

Generating plot for: XLNet-base
✓ Saved: hypothesis_test_XLNet_base_privacy.png

Generating plot for: XLNet-Large
✓ Saved: hypothesis_test_XLNet_Large_privacy.png

Generating plot for: GPT2-Small
✓ Saved: hypothesis_test_GPT2_Small_privacy.png

Generating plot for: GPT2-Medium
✓ S

In [2]:
import zipfile
import os
from google.colab import files

# Get all hypothesis-test PNG files in the working directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# archive contents reproducible from run to run.
png_files = sorted(
    f for f in os.listdir('.') if f.endswith('.png') and 'hypothesis' in f
)

zip_filename = 'hypothesis_plots.zip'

if not png_files:
    # Avoid creating and "downloading" an empty archive while reporting success.
    print("No hypothesis PNG files found - run the plotting cell first.")
else:
    # Create a zip file (entries are stored flat, by base name)
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for filename in png_files:
            zipf.write(filename, os.path.basename(filename))

    print(f"Created zip file: {zip_filename}")

    # Download the zip file
    files.download(zip_filename)

    print("Zip file downloaded!")

Created zip file: hypothesis_plots.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Zip file downloaded!
