### Correlations

In [6]:
import pandas as pd
import numpy as np
from scipy import stats
import os

def analyze_file_correlations(data, name=None):
    """Compute Spearman correlations between 'complexity' and the other numeric columns.

    Args:
        data: Either a path to a CSV file (``str`` or ``os.PathLike``) or an
            already-loaded ``pandas.DataFrame``.
        name: Optional label used in the printed warning/error messages;
            ``'dataset'`` is used when it is omitted or empty.

    Returns:
        A ``(correlations, n_rows)`` tuple. ``correlations`` maps each feature
        column name to its Spearman coefficient against ``'complexity'``.
        The ``'image_id'`` column, non-numeric columns, and columns whose
        coefficient is NaN are left out. ``n_rows`` is ``len(df)``, i.e. the
        full row count, not the number of non-NaN pairs used for a given
        feature. If ``'complexity'`` is missing or non-numeric, or any error
        occurs, a message is printed and ``(None, 0)`` is returned.
    """
    label = name or 'dataset'
    try:
        # A path (plain string or path object) is loaded; anything else is
        # treated as an already-loaded DataFrame.
        if isinstance(data, (str, os.PathLike)):
            df = pd.read_csv(data)
        else:
            df = data

        if 'complexity' not in df.columns:
            print(f"\nWarning: 'complexity' not found in {label}")
            return None, 0

        target = df['complexity']
        if not pd.api.types.is_numeric_dtype(target):
            # Ranking a non-numeric target against numeric features is not a
            # meaningful correlation, so treat the dataset as unusable.
            print(f"\nWarning: 'complexity' is not numeric in {label}")
            return None, 0

        correlations = {}
        for column in df.columns:
            if column in ('complexity', 'image_id'):
                continue

            values = df[column]
            # Skip text/categorical/datetime columns: passing them to
            # spearmanr either raises (which would discard the whole dataset
            # through the handler below) or ranks strings lexicographically.
            if not pd.api.types.is_numeric_dtype(values):
                continue

            # Compute Spearman correlation (ignoring NaNs)
            correlation, _ = stats.spearmanr(target, values, nan_policy='omit')
            if not np.isnan(correlation):
                correlations[column] = correlation

        return correlations, len(df)

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip
        # this dataset instead of aborting the whole aggregation.
        print(f"\nError processing {label}: {e}")
        return None, 0

def aggregate_correlations(all_files):
    """Aggregate per-dataset Spearman correlations into per-feature summary statistics.

    Each path in ``all_files`` is read with ``pandas.read_csv`` (unreadable
    files are reported and skipped), correlations against ``'complexity'``
    are computed per dataset via ``analyze_file_correlations``, and the
    results are summarised per feature.

    Args:
        all_files: Iterable of CSV file paths, one per dataset.

    Returns:
        A ``pandas.DataFrame`` indexed by feature name with the columns
        ``mean_correlation``, ``std_correlation``,
        ``weighted_mean_correlation``, ``weighted_std_correlation``,
        ``num_datasets``, ``dataset_coverage``, ``total_samples``,
        ``abs_mean_correlation`` and ``abs_weighted_mean_correlation``.
        The weighted statistics use each dataset's row count as its weight.
        If no dataset yields any correlation, an empty frame with these same
        columns is returned.
    """
    stat_columns = [
        'mean_correlation',
        'std_correlation',
        'weighted_mean_correlation',
        'weighted_std_correlation',
        'num_datasets',
        'dataset_coverage',
        'total_samples',
    ]
    abs_columns = ['abs_mean_correlation', 'abs_weighted_mean_correlation']

    # Load every readable file as a (DataFrame, dataset_name) pair.
    all_datasets = []
    for file in all_files:
        try:
            df = pd.read_csv(file)
            dataset_name = os.path.basename(file)
            all_datasets.append((df, dataset_name))
        except Exception as e:
            # Best-effort: one unreadable file must not abort the aggregation.
            print(f"Error reading {file}: {e}")

    # feature -> list of (correlation, sample size, dataset name)
    all_correlations = {}
    for data, name in all_datasets:
        correlations, n_samples = analyze_file_correlations(data, name)
        if not correlations:
            continue
        for feature, correlation in correlations.items():
            all_correlations.setdefault(feature, []).append((correlation, n_samples, name))

    if not all_correlations:
        # Nothing to summarise. Return an empty frame that still has the
        # expected columns so callers can sort and print it without a
        # KeyError.
        return pd.DataFrame(columns=stat_columns + abs_columns, dtype=float)

    # Compute statistics for each feature across datasets
    feature_stats = {}
    total_datasets = len(all_datasets)
    for feature, corr_list in all_correlations.items():
        corr_values = [item[0] for item in corr_list]
        sample_sizes = [item[1] for item in corr_list]
        total_samples = sum(sample_sizes)

        # Unweighted mean and (population) standard deviation over datasets
        mean_corr = np.mean(corr_values)
        std_corr = np.std(corr_values) if len(corr_values) > 1 else 0

        # Weighted mean and standard deviation, weights = share of all rows
        weights = [n / total_samples for n in sample_sizes]
        weighted_mean = sum(c * w for c, w in zip(corr_values, weights))
        weighted_var = sum(w * (x - weighted_mean) ** 2 for x, w in zip(corr_values, weights))
        weighted_std = np.sqrt(weighted_var)

        feature_stats[feature] = {
            'mean_correlation': mean_corr,
            'std_correlation': std_corr,
            'weighted_mean_correlation': weighted_mean,
            'weighted_std_correlation': weighted_std,
            'num_datasets': len(corr_list),
            # Fraction of the successfully loaded datasets that have this feature
            'dataset_coverage': len(corr_list) / total_datasets,
            'total_samples': total_samples,
        }

    # One row per feature, plus absolute-value columns used for sorting
    stats_df = pd.DataFrame.from_dict(feature_stats, orient='index')
    stats_df['abs_mean_correlation'] = stats_df['mean_correlation'].abs()
    stats_df['abs_weighted_mean_correlation'] = stats_df['weighted_mean_correlation'].abs()

    return stats_df

def print_unweighted_features_table(stats_df):
    """Print the per-feature table of unweighted (simple average) statistics.

    Args:
        stats_df: DataFrame indexed by feature name that provides the columns
            ``mean_correlation``, ``std_correlation``, ``num_datasets`` and
            ``abs_mean_correlation``.

    Rows are printed in descending order of ``abs_mean_correlation``; feature
    names longer than 30 characters are truncated to fit the column.
    """
    # Sort by absolute mean correlation (largest first)
    sorted_features = stats_df.sort_values('abs_mean_correlation', ascending=False)
    print("\nUnweighted Averages Over Datasets:")
    print("-" * 70)
    print(f"{'Feature':<30} | {'Mean Corr':>9} | {'Std Dev':>8} | {'Datasets':>8}")
    print("-" * 70)
    for feature, row in sorted_features.iterrows():
        # iterrows() yields each row as a single-dtype Series, so the integer
        # dataset count arrives as a float and would print as "11.0".
        num_datasets = int(row['num_datasets'])
        # str() keeps the slice valid for non-string index labels.
        label = str(feature)[:30]
        print(f"{label:<30} | {row['mean_correlation']:9.3f} | {row['std_correlation']:8.3f} | {num_datasets:8d}")

def print_weighted_features_table(stats_df):
    """Print the per-feature table of sample-size-weighted statistics.

    Args:
        stats_df: DataFrame indexed by feature name that provides the columns
            ``weighted_mean_correlation``, ``weighted_std_correlation``,
            ``total_samples`` and ``abs_weighted_mean_correlation``.

    Rows are printed in descending order of ``abs_weighted_mean_correlation``;
    feature names longer than 30 characters are truncated to fit the column.
    """
    # Sort by absolute weighted mean correlation (largest first)
    sorted_features = stats_df.sort_values('abs_weighted_mean_correlation', ascending=False)
    print("\nWeighted Averages Based on Dataset Sample Sizes:")
    print("-" * 90)
    print(f"{'Feature':<30} | {'Weighted Mean':>15} | {'Weighted Std':>12} | {'Total Samples':>15}")
    print("-" * 90)
    for feature, row in sorted_features.iterrows():
        # iterrows() yields each row as a single-dtype Series, so the integer
        # sample total arrives as a float and would print as "12,345.0".
        total_samples = int(row['total_samples'])
        # str() keeps the slice valid for non-string index labels.
        label = str(feature)[:30]
        print(f"{label:<30} | {row['weighted_mean_correlation']:15.3f} | {row['weighted_std_correlation']:12.3f} | {total_samples:15,d}")

# ----------------------------
# Example Usage
# ----------------------------

# Folder holding the per-dataset feature CSVs; change this if yours live elsewhere
features_folder = "../features"

# Full paths of the individual feature CSV files, one per dataset
feature_files = [
    os.path.join(features_folder, csv_name)
    for csv_name in (
        "RSIVL.csv",
        "VISC.csv",
        "IC9600 Abstract.csv",
        "IC9600 Paintings.csv",
        "IC9600 Scenes.csv",
        "SAVOIAS Objects.csv",
        "SAVOIAS Art.csv",
        "SAVOIAS Scenes.csv",
        "SAVOIAS Suprematism.csv",
        "SAVOIAS Interior Design.csv",
        "IC9600 Advertisement.csv",
        "IC9600 Architecture.csv",
        "IC9600 Person.csv",
        "IC9600 Transport.csv",
        "IC9600 Objects.csv",
        "SVG.csv",
    )
]

# Summarise the per-feature correlations across all of the files above
stats_df = aggregate_correlations(feature_files)

# Show the unweighted table first, then the sample-size-weighted one
print_unweighted_features_table(stats_df)
print_weighted_features_table(stats_df)



Unweighted Averages Over Datasets:
----------------------------------------------------------------------
Feature                        | Mean Corr |  Std Dev | Datasets
----------------------------------------------------------------------
IC9600                         |     0.885 |    0.064 |     11.0
# of SAM segmentations         |     0.702 |    0.090 |     11.0
MSG                            |     0.607 |    0.076 |     11.0
M6                             |     0.582 |    0.071 |     11.0
symmetry                       |    -0.581 |    0.083 |     11.0
edge density                   |     0.566 |    0.078 |     11.0
M4                             |    -0.550 |    0.102 |     11.0
M1                             |     0.537 |    0.089 |     11.0
M7                             |     0.534 |    0.095 |     11.0
# of FC-CLIP classes           |     0.526 |    0.175 |     11.0
clutter                        |     0.509 |    0.082 |     11.0
MUC6                           |     0.483

### Ablation (Permutation Tests)