# Scientific Analysis: Impact of Neighborhood Size on NCA Models

This notebook performs a comprehensive scientific analysis to determine whether it makes sense to use a `neighborhood_size` greater than 3, and to characterize how models trained with different neighborhood sizes differ from one another.

## Analysis Objectives:
1. **Performance Evaluation**: Comparison of biological metrics across different neighborhood sizes
2. **Statistical Tests**: Verification of statistical significance of differences
3. **Trend Analysis**: Identification of patterns and improvements/degradations
4. **Computational Complexity**: Analysis of computational cost vs. benefits
5. **Interactive Visualizations**: Plotly charts for in-depth exploration


In [None]:
import sys
import os
from pathlib import Path

# Locate the project root so the project's own packages can be imported.
# notebook_dir is the kernel's current working directory, which is not
# always notebooks/ (it can be the project root or a deeper folder), so walk
# upwards until a directory containing the experiments/ package is found.
notebook_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (notebook_dir, *notebook_dir.parents)
        if (candidate / 'experiments').is_dir()
    ),
    notebook_dir.parent,  # fallback: assume the kernel was started in notebooks/
)
# Put both locations at the front of the module search path
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / 'experiments'))

# Import the analyzer
from experiments.analyze_neighborhood_sizes import NeighborhoodSizeAnalyzer

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Confirm that the setup finished and show the project root in use
print("Imports completed!", f"Project root: {project_root}", sep="\n")


## Configuration

Define the parameters for the analysis:


In [None]:
# Configuration
# All paths are absolute and derived from the project root.
# Step up one level only when the working directory itself is notebooks/.
# Testing the final path component (rather than searching the whole path
# string) avoids false matches on ancestors such as
# ".../jupyter_notebooks/<project>", which would wrongly select the parent.
if notebook_dir.name == 'notebooks':
    project_root = notebook_dir.parent
else:
    project_root = notebook_dir

RESULTS_DIR = str(project_root / "experiments" / "results_extended")
HISTORIES_PATH = str(project_root / "histories.npy")
DEVICE = "auto"  # "auto", "cuda", "mps", or "cpu"
N_EVALUATIONS = 10  # Number of evaluations for stochastic models
NEIGHBORHOOD_SIZES = [3, 4, 5, 6, 7]
FORCE_RECOMPUTE = False  # If True, re-evaluate even if CSV files exist

# Echo the effective configuration so a run can be audited afterwards
print(f"Notebook directory: {notebook_dir}")
print(f"Project root: {project_root}")
print(f"Results directory: {RESULTS_DIR}")
print(f"Histories path: {HISTORIES_PATH}")
print(f"Device: {DEVICE}")
print(f"Neighborhood sizes: {NEIGHBORHOOD_SIZES}")
print(f"Number of evaluations: {N_EVALUATIONS}")
# Surface wrong paths here rather than deep inside the analyzer
print(f"Paths exist: RESULTS_DIR={os.path.exists(RESULTS_DIR)}, HISTORIES={os.path.exists(HISTORIES_PATH)}")


## Analyzer Initialization

Create the analyzer instance and load/evaluate the models:


In [None]:
# Build the analyzer from the configuration constants
analyzer_options = dict(
    results_dir=RESULTS_DIR,
    histories_path=HISTORIES_PATH,
    device=DEVICE,
    n_evaluations=N_EVALUATIONS,
)
analyzer = NeighborhoodSizeAnalyzer(**analyzer_options)

# Load the models for every requested neighborhood size, or evaluate them;
# FORCE_RECOMPUTE is forwarded unchanged
analyzer.load_or_evaluate_models(neighborhood_sizes=NEIGHBORHOOD_SIZES, force_recompute=FORCE_RECOMPUTE)

print("\n Models loaded/evaluated successfully!")


## Data Exploration

Examine the metrics data:


In [None]:
# Turn the analyzer's results into a DataFrame of metrics
df = analyzer.parse_metrics()

# Quick look at the size, the columns and the first rows
n_preview_rows = 10
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst data:")
df.head(n_preview_rows)


In [None]:
# Descriptive statistics by model and neighborhood size
print("Descriptive statistics by model:")
print("="*60)

# Aggregate numeric metric columns only: label columns such as 'Model Type'
# cannot be averaged, and asking for their mean/std raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them).
# The grouping key itself is excluded from the aggregated columns.
numeric_metric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col != 'Neighborhood Size'
]

for model_type in df['Model Type'].unique():
    print(f"\n{model_type}:")
    model_data = df[df['Model Type'] == model_type]
    print(model_data.groupby('Neighborhood Size')[numeric_metric_cols].agg(['mean', 'std']))


## Statistical Tests

Perform statistical tests to verify the significance of differences:


In [None]:
# Run the analyzer's statistical tests
stat_results = analyzer.statistical_tests()

# Pretty-print the nested results: metric -> model type -> test outcomes
import json
banner = "="*60
print("\n" + banner)
print("STATISTICAL TEST RESULTS")
print(banner)

for metric, model_results in stat_results.items():
    print("\n" + banner)
    print(f"METRIC: {metric}")
    print(banner)
    for model_type, results in model_results.items():
        # Entries without an omnibus test result are not reported
        if 'kruskal_wallis' not in results:
            continue

        kw = results['kruskal_wallis']
        kw_p = kw['p_value']
        # Star notation for the 0.001 / 0.01 / 0.05 thresholds
        if kw_p < 0.001:
            significance = '***'
        elif kw_p < 0.01:
            significance = '**'
        elif kw_p < 0.05:
            significance = '*'
        else:
            significance = '(not significant)'
        print(f"\n  {model_type}:")
        print(f"    Kruskal-Wallis: H={kw['statistic']:.4f}, p={kw_p:.6f} {significance}")

        # Nothing more to print when pairwise results are absent or empty
        if 'pairwise' not in results or not results['pairwise']:
            continue

        print("    Significant pairwise comparisons:")
        for pair, pair_result in results['pairwise'].items():
            # Only pairs flagged as significant are listed
            if not pair_result['significant']:
                continue
            # Keys have the form "<size>_vs_<size>"
            nb1, nb2 = pair.split('_vs_')
            pair_p = pair_result['p_value']
            if pair_p < 0.001:
                sig = '***'
            elif pair_p < 0.01:
                sig = '**'
            else:
                sig = '*'
            print(f"      NB{nb1} vs NB{nb2}: p={pair_p:.6f} {sig}")


## Trend Analysis

Analyze performance trends as neighborhood size varies:


In [None]:
# Ask the analyzer for the performance trend table
trend_df = analyzer.performance_trend_analysis()

# Heading, then the table itself as the cell's output
print("Trend Analysis:", "="*60, sep="\n")
trend_df


In [None]:
# List the NB=3 -> NB=7 change for every (model, metric) pair, smallest first
print("\nImprovements from NB=3 to NB=7:")
print("="*60)
improvements = (
    trend_df[['Model Type', 'Metric', 'Improvement_3_to_7']]
    .copy()
    .dropna()
    .sort_values('Improvement_3_to_7')
)

for _, row in improvements.iterrows():
    improvement = row['Improvement_3_to_7']
    # Strictly positive values are labelled as improvements; everything else
    # (including exactly zero) is labelled as a degradation
    if improvement > 0:
        direction = "IMPROVEMENT"
    else:
        direction = "DEGRADATION"
    print(f"{row['Model Type']} - {row['Metric']}: {improvement:.2f}% ({direction})")


## Computational Complexity Analysis

Measure the computational cost for each neighborhood size:


In [None]:
# Ask the analyzer for the computational complexity table (n_samples=5)
complexity_df = analyzer.computational_complexity_analysis(n_samples=5)

# Heading, then the table itself as the cell's output
print("Computational Complexity:", "="*60, sep="\n")
complexity_df


In [None]:
# Plot absolute and normalized timings against neighborhood size
nb_sizes = complexity_df['Neighborhood Size']

# Mean time per neighborhood size, with the standard deviation as error bars
mean_time_trace = go.Scatter(
    x=nb_sizes,
    y=complexity_df['Mean Time (s)'],
    mode='lines+markers',
    name='Mean time (s)',
    error_y=dict(type='data', array=complexity_df['Std Time (s)'], visible=True),
    line=dict(width=3, color='blue'),
    marker=dict(size=12),
)

# Normalized time, drawn dashed on the same axis
normalized_time_trace = go.Scatter(
    x=nb_sizes,
    y=complexity_df['Normalized Time'],
    mode='lines+markers',
    name='Normalized time (vs NB=3)',
    line=dict(width=3, color='red', dash='dash'),
    marker=dict(size=12),
)

fig = go.Figure()
fig.add_trace(mean_time_trace)
fig.add_trace(normalized_time_trace)

complexity_layout = dict(
    title='Computational Complexity vs Neighborhood Size',
    xaxis_title='Neighborhood Size',
    yaxis_title='Time (s) / Normalization Factor',
    width=1000,
    height=600,
    template='plotly_white',
    hovermode='x unified',
)
fig.update_layout(**complexity_layout)

fig.show()


## Interactive Visualizations

Create interactive visualizations with Plotly:


In [None]:
# Let the analyzer produce its visualizations
analyzer.create_visualizations()

# Tell the user where to look for the output
done_message = "\n Visualizations created! Check the analysis_plots/ folder"
print(done_message)


## Custom Visualizations in the Notebook

Create interactive visualizations directly in the notebook:


In [None]:
# Interactive dashboard with all metrics: one subplot per metric (2 x 3 grid),
# one line per model type showing mean +/- std for each neighborhood size.
metric_cols = ['KL Divergence', 'Chi-Square', 'Categorical MMD', 
              'Tumor Size Diff', 'Border Size Diff', 'Spatial Variance Diff']

fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=metric_cols,
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

colors = px.colors.qualitative.Set2
df_parsed = analyzer.parse_metrics()

# Model types that already own a legend entry. Tracking first appearance
# (instead of tying the legend to the first metric) keeps the legend
# complete even when the first metric is missing and therefore skipped.
models_in_legend = set()

for idx, metric in enumerate(metric_cols):
    if metric not in df_parsed.columns:
        continue
    
    # Map the flat metric index onto the 1-based 2 x 3 subplot grid
    row = (idx // 3) + 1
    col = (idx % 3) + 1
    
    for model_idx, model_type in enumerate(df_parsed['Model Type'].unique()):
        model_data = df_parsed[df_parsed['Model Type'] == model_type]
        grouped = model_data.groupby('Neighborhood Size')[metric].agg(['mean', 'std'])
        
        sizes = grouped.index.values
        means = grouped['mean'].values
        stds = grouped['std'].values
        
        # Same color for a given model in every subplot
        color = colors[model_idx % len(colors)]
        
        first_trace_for_model = model_type not in models_in_legend
        models_in_legend.add(model_type)
        
        fig.add_trace(
            go.Scatter(
                x=sizes,
                y=means,
                mode='lines+markers',
                name=model_type,
                # One legend entry per model; the shared legendgroup makes a
                # legend click toggle that model in all subplots at once
                legendgroup=str(model_type),
                line=dict(color=color, width=2),
                marker=dict(size=8, color=color),
                error_y=dict(type='data', array=stds, visible=True),
                showlegend=first_trace_for_model,
                hovertemplate=f'<b>{model_type}</b><br>' +
                            'Neighborhood Size: %{x}<br>' +
                            f'{metric}: %{{y:.4f}}<br>' +
                            '<extra></extra>'
            ),
            row=row, col=col
        )

fig.update_layout(
    title_text="Complete Dashboard: Performance by Neighborhood Size",
    height=1000,
    width=1800,
    font=dict(size=10),
    title_font_size=18,
    template='plotly_white'
)

fig.show()


In [None]:
# Box plot of one metric, split by neighborhood size and colored by model type
metric = 'KL Divergence'  # Change this metric to explore others

axis_labels = {'Neighborhood Size': 'Neighborhood Size', metric: metric}
fig = px.box(
    df_parsed,
    x='Neighborhood Size',
    y=metric,
    color='Model Type',
    title=f'{metric} by Neighborhood Size and Model Type',
    labels=axis_labels,
)

# Figure size and typography
box_layout = dict(
    width=1200,
    height=700,
    font=dict(size=12),
    title_font_size=16,
    template='plotly_white',
)
fig.update_layout(**box_layout)

fig.show()


## Cost-Benefit Analysis

Compare performance improvement with computational cost:


In [None]:
# Cost-benefit analysis: improvement vs complexity
# For each model, compute the average relative change of the metrics between
# NB=3 and NB=7 and relate it to the normalized computational cost of NB=7.

cost_benefit_columns = [
    'Model Type',
    'Avg Improvement (%)',
    'Computational Cost (x)',
    'Efficiency (Improvement/Cost)',
]
cost_benefit_analysis = []

# Computational complexity (normalized to NB=3). The value does not depend on
# the model, so it is looked up once instead of once per loop iteration.
# Falls back to 1 when no NB=7 timing is available.
nb7_normalized_time = complexity_df.loc[complexity_df['Neighborhood Size'] == 7, 'Normalized Time']
complexity_nb7 = nb7_normalized_time.iloc[0] if len(nb7_normalized_time) > 0 else 1

for model_type in df_parsed['Model Type'].unique():
    model_data = df_parsed[df_parsed['Model Type'] == model_type]
    
    nb3_data = model_data[model_data['Neighborhood Size'] == 3]
    nb7_data = model_data[model_data['Neighborhood Size'] == 7]
    
    # A model needs data at both neighborhood sizes to be compared
    if len(nb3_data) > 0 and len(nb7_data) > 0:
        # Kept under its own name so the `improvements` table built in the
        # trend-analysis cell is not overwritten by a plain list
        metric_improvements = []
        for metric in metric_cols:
            if metric in model_data.columns:
                mean3 = nb3_data[metric].mean()
                mean7 = nb7_data[metric].mean()
                # Relative change needs a positive baseline; metrics whose
                # NB=3 mean is zero, negative or NaN are left out
                if mean3 > 0:
                    # Positive when the metric's mean decreased from NB=3 to NB=7
                    metric_improvements.append((mean3 - mean7) / mean3 * 100)
        
        avg_improvement = np.mean(metric_improvements) if metric_improvements else 0
        
        cost_benefit_analysis.append({
            'Model Type': model_type,
            'Avg Improvement (%)': avg_improvement,
            'Computational Cost (x)': complexity_nb7,
            'Efficiency (Improvement/Cost)': avg_improvement / complexity_nb7 if complexity_nb7 > 0 else 0
        })

# Explicit columns keep the table well-formed even when no model qualified
cost_benefit_df = pd.DataFrame(cost_benefit_analysis, columns=cost_benefit_columns)
print("Cost-Benefit Analysis (NB=3 vs NB=7):")
print("="*60)
cost_benefit_df


In [None]:
# Scatter plot of the cost-benefit table: one labelled marker per model type
fig = go.Figure()

for _, row in cost_benefit_df.iterrows():
    model_name = row['Model Type']
    cost = row['Computational Cost (x)']
    gain = row['Avg Improvement (%)']
    efficiency = row['Efficiency (Improvement/Cost)']

    # Hover box with the three figures for this model
    hover_text = (
        f"<b>{model_name}</b><br>"
        f"Improvement: {gain:.2f}%<br>"
        f"Cost: {cost:.2f}x<br>"
        f"Efficiency: {efficiency:.2f}<br>"
        "<extra></extra>"
    )

    point = go.Scatter(
        x=[cost],
        y=[gain],
        mode='markers+text',
        name=model_name,
        marker=dict(size=15),
        text=[model_name],
        textposition="top center",
        hovertemplate=hover_text,
    )
    fig.add_trace(point)

cost_benefit_layout = dict(
    title='Cost-Benefit Analysis: Improvement vs Computational Complexity',
    xaxis_title='Computational Cost (normalized to NB=3)',
    yaxis_title='Average Performance Improvement (%)',
    width=1000,
    height=700,
    template='plotly_white',
    hovermode='closest',
)
fig.update_layout(**cost_benefit_layout)

# Dashed guides at zero improvement and at the NB=3 baseline cost
reference_style = dict(line_dash="dash", line_color="gray")
fig.add_hline(y=0, annotation_text="No improvement", **reference_style)
fig.add_vline(x=1, annotation_text="Base cost (NB=3)", **reference_style)

fig.show()


## Complete Report Generation

Generate the complete textual report:


In [None]:
# Let the analyzer write its full text report
analyzer.generate_report()

# Tell the user which file to open
report_message = "\n Report generated! Check the file neighborhood_size_analysis_report.txt"
print(report_message)


## Conclusions and Recommendations

Summary of main results:

In [None]:
# Find the best configuration for each metric
print("="*60)
print("BEST CONFIGURATIONS BY METRIC")
print("="*60)

for metric in metric_cols:
    if metric not in df_parsed.columns:
        continue
    
    # A configuration is a (model type, neighborhood size) pair. Rank the
    # configurations by their mean over all rows rather than by the single
    # lowest row, which for repeated evaluations is just the luckiest run.
    # Groups whose mean is NaN are ignored.
    config_means = (
        df_parsed.groupby(['Model Type', 'Neighborhood Size'])[metric]
        .mean()
        .dropna()
    )
    if config_means.empty:
        # No usable values for this metric
        continue
    
    # The smallest mean is reported as best (every metric listed here is
    # treated as lower-is-better)
    best_model, best_nb = config_means.idxmin()
    print(f"\n{metric}:")
    print(f"  Best: {best_model} with NB={best_nb}")
    print(f"  Value: {config_means.min():.4f}")

print("\n" + "="*60)
print("RECOMMENDATIONS")
print("="*60)
print("""
1. Analyze statistical tests to determine if differences are significant
2. Consider the trade-off between performance improvement and computational cost
3. Verify if larger neighborhood sizes provide consistent improvements
4. Evaluate if the improvement justifies the increase in computational cost
5. Consider using a larger neighborhood size only if:
   - Statistical tests show significant differences
   - The improvement is consistent across all metrics
   - The computational cost is acceptable for your use case
""")