# GTZAN Dataset Expressions Analysis

This notebook analyzes the expressions from the top 500 unique features for the GTZAN dataset (the code below loads the `top500` pickle files).

**Analysis Tasks:**
1. Extract base features from complex expressions
2. Create co-occurrence matrices showing how often base features appear together
3. Create performance matrices showing average test_auc for co-occurring features
4. Generate heatmap visualizations for both E23 and ALL62 feature sets

**Input Files:**
- GTZAN_ALL62_top500_unique_20250905_202835.pkl
- GTZAN_E23_top500_unique_20250905_202835.pkl

In [None]:
def _canonical_pair(a, b):
    """Return ``(a, b)`` as an order-independent tuple.

    The labels are ordered by their string form, so ``(x, y)`` and ``(y, x)``
    map to the same key even when the two labels have different types.
    """
    return tuple(sorted((a, b), key=str))


def _rank_all_cells(df: pd.DataFrame, *, symmetric: bool, include_diagonal: bool, dropna: bool=True) -> pd.DataFrame:
    """
    Compute GLOBAL dense ranks (1 = highest value) for the cells of a DataFrame.

    Args:
        df: Matrix whose cells are ranked; index labels become ``row`` and
            column labels become ``col``.
        symmetric: If True, (i, j) and (j, i) are collapsed into one canonical
            pair (see ``_canonical_pair``) and the MAX of the two values is kept.
        include_diagonal: If False, cells whose row label equals their column
            label are dropped before ranking (in both modes).
        dropna: If True, NaN cells are dropped before ranking. If False they
            are kept and receive a NaN rank (the rank column is then float).

    Returns:
        DataFrame with columns ``row``, ``col``, ``value``, ``rank`` sorted by
        ascending rank; an empty frame with those columns if no cell remains.
    """
    result_columns = ["row", "col", "value", "rank"]

    n_rows, n_cols = df.shape
    if n_rows == 0 or n_cols == 0:
        return pd.DataFrame(columns=result_columns)

    # Build the long (row, col, value) table by hand, in row-major order,
    # rather than via DataFrame.stack(dropna=...): the ``dropna`` keyword is
    # not accepted by the newer stack implementation in pandas.
    cells = pd.DataFrame({
        "row": df.index.repeat(n_cols).to_numpy(),
        "col": list(df.columns) * n_rows,
        "value": df.to_numpy().ravel(),
    })

    keep = pd.Series(True, index=cells.index)
    if dropna:
        keep &= cells["value"].notna()
    if not include_diagonal:
        keep &= cells["row"] != cells["col"]
    # reset_index yields a fresh frame, so later column assignments are safe
    cells = cells.loc[keep].reset_index(drop=True)

    if cells.empty:
        return pd.DataFrame(columns=result_columns)

    if symmetric:
        pairs = [_canonical_pair(r, c) for r, c in zip(cells["row"], cells["col"])]
        cells["row"] = [pair[0] for pair in pairs]
        cells["col"] = [pair[1] for pair in pairs]
        # One line per unordered pair, keeping the larger of the mirrored values
        cells = cells.groupby(["row", "col"], as_index=False, sort=True)["value"].max()

    ranks = cells["value"].rank(method="dense", ascending=False)
    # NaN values (only possible with dropna=False) have no rank; casting them
    # to int would raise, so the column stays float in that case.
    cells["rank"] = ranks.astype(int) if ranks.notna().all() else ranks

    return (cells[result_columns]
            .sort_values(["rank", "value"], ascending=[True, False])
            .reset_index(drop=True))

def top_m_by_df2_with_df1_rank(df1: pd.DataFrame, df2: pd.DataFrame, *,
                               m: int = 20,
                               symmetric: bool = False,
                               include_diagonal: bool = True) -> pd.DataFrame:
    """
    Show where df2's best cells sit in df1's ranking.

    Both matrices are ranked globally with ``_rank_all_cells`` (dense ranks,
    1 = highest value). The cells of df2 whose dense rank is <= ``m`` are kept
    (ties at rank ``m`` are therefore all included) and left-joined, on the
    (row, col) coordinate, with df1's rank and value.

    Args:
        df1: Secondary matrix, looked up at df2's top coordinates.
        df2: Primary matrix that drives the selection.
        m: Largest df2 dense rank to keep.
        symmetric: Forwarded to ``_rank_all_cells`` for both matrices.
        include_diagonal: Forwarded to ``_rank_all_cells`` for both matrices.

    Returns:
        DataFrame with columns rank_df2, row, col, value_df2, rank_df1,
        value_df1, df1_percentile, df2_percentile, ordered by rank_df2 and then
        rank_df1 (coordinates missing from df1 last). Each percentile is the
        dense rank divided by the number of ranked cells of that matrix.
    """
    rank_options = {"symmetric": symmetric, "include_diagonal": include_diagonal}

    secondary = _rank_all_cells(df1, **rank_options).rename(
        columns={"value": "value_df1", "rank": "rank_df1"}
    )
    primary = _rank_all_cells(df2, **rank_options).rename(
        columns={"value": "value_df2", "rank": "rank_df2"}
    )

    # Dense ranks mean every cell tied at rank m passes this filter
    leaders = primary.loc[primary["rank_df2"] <= m].copy()

    # Left join: a coordinate absent from df1's ranking keeps NaN rank/value
    joined = leaders.merge(secondary, how="left", on=["row", "col"])

    joined["df1_percentile"] = joined["rank_df1"] / len(secondary)
    joined["df2_percentile"] = joined["rank_df2"] / len(primary)

    column_order = [
        "rank_df2", "row", "col", "value_df2",
        "rank_df1", "value_df1", "df1_percentile", "df2_percentile",
    ]
    ordered = joined[column_order].sort_values(
        by=["rank_df2", "rank_df1"], ascending=[True, True], na_position="last"
    )
    return ordered.reset_index(drop=True)

# Ranking parameters shared by the four comparisons below:
# M is the largest dense rank kept, INCLUDE_DIAGONAL keeps row == col cells.
M = 200
INCLUDE_DIAGONAL = True

# Pairs 1 and 2 are ranked with ordered coordinates (symmetric=False);
# presumably these are the operation-by-feature matrices - confirm where they are built.
tbl_p1 = top_m_by_df2_with_df1_rank(e23_op_feat_cooccur, e23_op_feat_performance,
                                    m=M, symmetric=False, include_diagonal=INCLUDE_DIAGONAL)
tbl_p2 = top_m_by_df2_with_df1_rank(all62_op_feat_cooccur, all62_op_feat_performance,
                                    m=M, symmetric=False, include_diagonal=INCLUDE_DIAGONAL)

# Pairs 3 and 4 are feature-by-feature self-matrices, so mirrored cells are collapsed.
tbl_p3 = top_m_by_df2_with_df1_rank(e23_cooccur, e23_performance,
                                    m=M, symmetric=True, include_diagonal=INCLUDE_DIAGONAL)
tbl_p4 = top_m_by_df2_with_df1_rank(all62_cooccur, all62_performance,
                                    m=M, symmetric=True, include_diagonal=INCLUDE_DIAGONAL)

# Preview the leading rows of the ALL62 table from pair 2
display(tbl_p2.head(55))


In [None]:
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
import re
import warnings
from datetime import datetime
from collections import defaultdict, Counter
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

# Pandas display: no limit on columns or total width, cell text capped at 100 characters
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 100

# Matplotlib font sizes applied to every figure created afterwards
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 16

print("Libraries imported successfully")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Define Base Feature Sets and Setup Paths

In [None]:
# Base paths
# NOTE: absolute, machine-specific location; adjust BASE_PATH before running elsewhere.
BASE_PATH = Path('E:/Oxford/Extra/ICASSP/Draft_1/GTZAN-data')
FEATURES_PATH = BASE_PATH / 'Ref_Perceptual_features'
# Plots and CSV exports are written next to the data, directly under BASE_PATH.
OUTPUT_PATH = BASE_PATH

# Input files
# One pickle per feature set; the keys mirror the keys of FEATURE_SETS below.
INPUT_FILES = {
    'ALL62': FEATURES_PATH / 'GTZAN_ALL62_top500_unique_20250905_202835.pkl',
    'E23': FEATURES_PATH / 'GTZAN_E23_top500_unique_20250905_202835.pkl'
}

# Base feature definitions
# The list order matters: the matrix builders below index rows/columns by
# position in the list they are given.
# E23: 23 names.
E23 = [
    'Danceability', 'Loudness', 'Chords-Changes-Rate', 'Dynamic-Complexity',
    'Zerocrossingrate', 'Chords-Number-Rate', 'Pitch-Salience',
    'Spectral-Centroid', 'Spectral-Complexity', 'Spectral-Decrease',
    'Spectral-Energyband-High', 'Spectral-Energyband-Low',
    'Spectral-Energyband-Middle-High', 'Spectral-Energyband-Middle-Low',
    'Spectral-Entropy', 'Spectral-Flux', 'Spectral-Rolloff',
    'Spectral-Spread', 'Onset-Rate', 'Length', 'BPM', 'Beats-Loud',
    'Vocal-Instrumental'
]

# ML7: 7 names. Two of them contain a space; name matching elsewhere in this
# file strips every non-alphanumeric character before comparing.
ML7 = [
    'Melody', 'Articulation', 'Rhythm Complexity', 'Rhythm Stability',
    'Dissonance', 'Atonality', 'Mode'
]

# SYM32: 32 names.
SYM32 = [
    'Dominants', 'Subdominants', 'sub-sub', 'sub-dom', 'dom-sub',
    'dom-tonic', 'glob-sub', 'glob-dom', 'sub-sub-dom', 'sub-dom-sub',
    'dom-sub-dom', 'sub-dom-tonic', 'dom-tonic-sub', 'dom-sub-sub',
    'sub-sub-sub', 'glob-sub-glob', 'glob-dom-tonic', 'glob-sub-sub',
    'dom-dom', 'glob-glob', 'dom-dom-sub', 'glob-glob-dom',
    'glob-dom-glob', 'glob-glob-sub', 'dom-dom-tonic', 'glob-sub-dom',
    'dom-tonic-dom', 'glob-dom-sub', 'sub-dom-dom', 'dom-dom-dom',
    'glob-dom-dom', 'glob-glob-glob'
]

# Concatenation order (ML7, then SYM32, then E23) fixes the ALL62 matrix layout.
ALL62 = ML7 + SYM32 + E23  # full perceptual set

# Feature set configurations
# Maps each feature-set name to its list of base features.
FEATURE_SETS = {
    'E23': E23,
    'ALL62': ALL62
}

# Echo the resolved locations so a wrong BASE_PATH is spotted immediately
print(f"Base path: {BASE_PATH}")
print(f"Features path: {FEATURES_PATH}")
print(f"Output path: {OUTPUT_PATH}")

# One line per feature list with its length
print(f"\nFeature set sizes:")
print(
    f"  E23: {len(E23)} features",
    f"  ML7: {len(ML7)} features",
    f"  SYM32: {len(SYM32)} features",
    f"  ALL62: {len(ALL62)} features",
    sep="\n",
)

# Verify input files exist
print(f"\nInput file verification:")
for name, path in INPUT_FILES.items():
    print(f"✓ {name}: {path.name}" if path.exists() else f"✗ {name}: {path.name} (NOT FOUND)")

## 2. Base Feature Extraction Functions

In [None]:
def _canonicalize(s: str) -> str:
    """Return ``s`` lowercased with everything except ASCII a-z and 0-9 removed.

    Hyphens, underscores and spaces all disappear, so spelling variants of the
    same name compare equal.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz0123456789"
    return ''.join(ch for ch in s.lower() if ch in allowed)

def extract_base_features(expression, base_features):
    """
    Return the set of base features that appear in an expression.

    The expression is split into identifier tokens (a letter followed by
    letters, digits, ``-`` or ``_``). A token counts as a feature when its
    canonical form (see ``_canonicalize``) equals the canonical form of a name
    in ``base_features``, so only whole identifiers match, never substrings.

    Args:
        expression: Expression text; NaN, None and empty values give an empty set.
        base_features: Iterable of base feature names.

    Returns:
        set: Matching names, spelled as they appear in ``base_features``.
    """
    if pd.isna(expression) or not expression:
        return set()

    # canonical spelling -> spelling used in base_features
    lookup = {_canonicalize(name): name for name in base_features}

    # Identifiers only, so parentheses, commas and operators never join tokens
    identifiers = re.findall(r'[A-Za-z][A-Za-z0-9_-]*', str(expression))

    canonical_tokens = (_canonicalize(identifier) for identifier in identifiers)
    return {lookup[key] for key in canonical_tokens if key in lookup}


def test_feature_extraction():
    """Print the base features detected in a few sample expressions."""
    print("Testing feature extraction function...")

    # Vocabulary used for the demo (a subset of the real feature lists)
    test_features = [
        'Danceability', 'Beats-Loud', 'sub-sub-dom', 'glob-dom-dom',
        'sub-dom', 'dom-sub', 'sub-dom-dom', 'Spectral-Centroid',
        'BPM', 'Loudness', 'Mode',
    ]

    # Samples mix underscore and hyphen spellings, bare names and '|'-joined expressions
    test_cases = (
        "div(max(sub_sub_dom, glob_dom_dom), Danceability)",
        "min(log(Beats_Loud), sub_sub_dom)",
        "Danceability",
        "sub-dom-dom",
        "max(dom-sub, sub-dom)",
        "div(Spectral-Centroid, BPM)|sub(Loudness, Mode)",
    )

    for i, expr in enumerate(test_cases, start=1):
        found = extract_base_features(expr, test_features)
        print(f"\nTest {i}: {expr}")
        print(f"Found features: {sorted(list(found))}")


# Run the demo once when the cell executes
test_feature_extraction()

## 3. Co-occurrence Matrix Functions

In [None]:
def count_base_features(expression: str, base_features: list[str]) -> Counter:
    """
    Count how many times each base feature occurs in an expression.

    Tokens are matched to features by canonical form (see ``_canonicalize``).

    Args:
        expression: Expression text; NaN, None and empty values give an empty Counter.
        base_features: Base feature names; their spelling is used for the keys.

    Returns:
        Counter mapping each base feature found to its number of occurrences,
        with keys in order of first appearance in the expression.
    """
    if pd.isna(expression) or not expression:
        return Counter()

    # canonical spelling -> spelling used in base_features
    canon_to_orig = {_canonicalize(name): name for name in base_features}

    # Identifier tokens such as Beats_Loud, sub-sub-dom, Spectral-Centroid, BPM
    canonical_tokens = (
        _canonicalize(token)
        for token in re.findall(r'[A-Za-z][A-Za-z0-9_-]*', str(expression))
    )
    return Counter(
        canon_to_orig[key] for key in canonical_tokens if key in canon_to_orig
    )

# --- Co-occurrence matrix with corrected diagonal handling ---

def create_cooccurrence_matrix(df, base_features, feature_set_name):
    """
    Build the base-feature co-occurrence count matrix and print a short report.

    Off-diagonal cells count expressions in which both features appear (one per
    expression, mirrored into [i, j] and [j, i]). Diagonal cells count repeated
    use only: a feature occurring k >= 2 times in an expression adds k - 1.

    Args:
        df (pd.DataFrame): Rows with an 'expressions' column.
        base_features (list): Feature names; their order defines the matrix layout.
        feature_set_name (str): Label used in the printed report.

    Returns:
        pd.DataFrame: Integer matrix indexed and labelled by ``base_features``.
    """
    print(f"\n=== Creating Co-occurrence Matrix for {feature_set_name} ===")

    n_features = len(base_features)
    slot = {feature: i for i, feature in enumerate(base_features)}
    cooccur_matrix = np.zeros((n_features, n_features), dtype=int)

    total_expressions = 0
    expressions_with_any = 0

    for _, record in df.iterrows():
        total_expressions += 1

        tally = count_base_features(record['expressions'], base_features)
        if not tally:
            continue
        expressions_with_any += 1

        # Every unordered pair of distinct features present: +1 in both mirrored cells
        for first, second in combinations(tally, 2):
            a, b = slot[first], slot[second]
            cooccur_matrix[a, b] += 1
            cooccur_matrix[b, a] += 1

        # Diagonal: repeats only, a single occurrence contributes nothing
        for feature, k in tally.items():
            if k > 1:
                cooccur_matrix[slot[feature], slot[feature]] += k - 1

    cooccur_df = pd.DataFrame(cooccur_matrix, index=base_features, columns=base_features)

    print(f"✓ Processed {total_expressions} expressions")
    print(f"✓ {expressions_with_any} expressions contained base features")
    print(f"✓ Co-occurrence matrix shape: {cooccur_df.shape}")
    print(f"✓ Total co-occurrences: {cooccur_matrix.sum():,}")

    # Upper triangle only, so each unordered pair is listed once
    top_pairs = sorted(
        (
            (base_features[i], base_features[j], cooccur_matrix[i, j])
            for i in range(n_features)
            for j in range(i + 1, n_features)
            if cooccur_matrix[i, j] > 0
        ),
        key=lambda entry: entry[2],
        reverse=True,
    )
    print(f"\nTop 10 co-occurring feature pairs (off-diagonal):")
    for k, (f1, f2, c) in enumerate(top_pairs[:10], 1):
        print(f"  {k:2d}. {f1} + {f2}: {c} times")

    return cooccur_df

# --- Performance matrix aligned with the same diagonal logic ---

def create_performance_matrix(df, base_features, feature_set_name):
    """
    Build the matrix of average test_auc per co-occurring feature pair.

    Off-diagonal cells average test_auc over the expressions in which both
    features appear (each expression counted once). Diagonal cells only use
    expressions where the feature occurs k >= 2 times, weighted by k - 1.
    Cells without any contributing expression are 0. Rows whose test_auc is
    NaN are skipped.

    Args:
        df (pd.DataFrame): Rows with 'expressions' and 'test_auc' columns.
        base_features (list): Feature names; their order defines the matrix layout.
        feature_set_name (str): Label used in the printed report.

    Returns:
        pd.DataFrame: Float matrix indexed and labelled by ``base_features``.
    """
    print(f"\n=== Creating Performance Matrix for {feature_set_name} ===")

    n_features = len(base_features)
    slot = {feature: i for i, feature in enumerate(base_features)}
    perf_sum = np.zeros((n_features, n_features), dtype=float)
    perf_cnt = np.zeros((n_features, n_features), dtype=int)

    valid_expressions = 0

    for _, record in df.iterrows():
        expression = record['expressions']
        test_auc = record['test_auc']
        if pd.isna(test_auc):
            continue

        tally = count_base_features(expression, base_features)
        if not tally:
            continue
        valid_expressions += 1

        # Off-diagonal: one contribution per expression, mirrored
        for first, second in combinations(tally, 2):
            a, b = slot[first], slot[second]
            for cell in ((a, b), (b, a)):
                perf_sum[cell] += test_auc
                perf_cnt[cell] += 1

        # Diagonal: repeats only, weighted by the number of extra occurrences
        for feature, k in tally.items():
            if k > 1:
                d = slot[feature]
                perf_sum[d, d] += test_auc * (k - 1)
                perf_cnt[d, d] += k - 1

    # Cells with a zero count stay at 0 instead of dividing by zero
    perf_avg = np.divide(perf_sum, perf_cnt, out=np.zeros_like(perf_sum), where=perf_cnt != 0)
    perf_df = pd.DataFrame(perf_avg, index=base_features, columns=base_features)

    print(f"✓ Processed {valid_expressions} valid expressions with performance data")
    print(f"✓ Performance matrix shape: {perf_df.shape}")
    print(f"✓ Non-zero cells: {(perf_avg > 0).sum():,}")

    # Statistics over the cells that received at least one contribution
    nz = perf_avg[perf_cnt > 0]
    if nz.size > 0:
        print("\nPerformance statistics:")
        print(f"  Mean: {nz.mean():.6f}")
        print(f"  Std:  {nz.std():.6f}")
        print(f"  Min:  {nz.min():.6f}")
        print(f"  Max:  {nz.max():.6f}")

    return perf_df

## 4. Visualization Functions

In [None]:
def plot_cooccurrence_heatmap(matrix_df, feature_set_name, dataset_name, save_path=None):
    """
    Draw the co-occurrence matrix as an annotated heatmap, optionally saving it.

    Args:
        matrix_df (pd.DataFrame): Co-occurrence counts (square matrix).
        feature_set_name (str): Feature set label shown in the title.
        dataset_name (str): Dataset label shown in the title.
        save_path (Path): If given, the figure is written there before being shown.
    """
    # The square canvas grows with the number of features, never below 12 inches
    fig_size = max(12, len(matrix_df) * 0.4)
    plt.figure(figsize=(fig_size, fig_size))

    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='YlOrRd',
        square=True,
        linewidths=0.1,
        mask=(matrix_df == 0),  # zero cells are hidden
        cbar_kws={'label': 'Co-occurrence Count'},
    )
    sns.heatmap(matrix_df, **heatmap_options)

    plt.title(f'{dataset_name} {feature_set_name}: Base Feature Co-occurrence Matrix',
              fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Base Features', fontsize=14)
    plt.ylabel('Base Features', fontsize=14)

    # Slanted column labels, horizontal row labels
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✓ Co-occurrence heatmap saved: {save_path}")

    plt.show()


def plot_performance_heatmap(matrix_df, feature_set_name, dataset_name, save_path=None):
    """
    Draw the performance matrix as an annotated heatmap, optionally saving it.

    Args:
        matrix_df (pd.DataFrame): Average test_auc values (square matrix).
        feature_set_name (str): Feature set label shown in the title.
        dataset_name (str): Dataset label shown in the title.
        save_path (Path): If given, the figure is written there before being shown.
    """
    # The square canvas grows with the number of features, never below 12 inches
    fig_size = max(12, len(matrix_df) * 0.4)
    plt.figure(figsize=(fig_size, fig_size))

    # The colour range spans the strictly positive cells only; 0..1 when there are none
    cell_values = matrix_df.values
    non_zero_values = cell_values[cell_values > 0]
    if len(non_zero_values) > 0:
        vmin, vmax = non_zero_values.min(), non_zero_values.max()
    else:
        vmin, vmax = 0, 1

    heatmap_options = dict(
        annot=True,
        fmt='.4f',
        cmap='RdYlBu_r',
        square=True,
        linewidths=0.1,
        mask=(matrix_df == 0),  # zero cells are hidden
        vmin=vmin,
        vmax=vmax,
        cbar_kws={'label': 'Average test_auc'},
    )
    sns.heatmap(matrix_df, **heatmap_options)

    plt.title(f'{dataset_name} {feature_set_name}: Base Feature Performance Matrix',
              fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Base Features', fontsize=14)
    plt.ylabel('Base Features', fontsize=14)

    # Slanted column labels, horizontal row labels
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✓ Performance heatmap saved: {save_path}")

    plt.show()


print("Visualization functions defined successfully")

## 5. Load Data and Process E23 Feature Set

In [None]:
# Load E23 data
# Reads the E23 pickle, then builds and plots its co-occurrence and performance
# matrices. On any failure the matrices are left as empty DataFrames.
print("=" * 80)
print("PROCESSING E23 FEATURE SET")
print("=" * 80)

try:
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(INPUT_FILES['E23'], 'rb') as f:
        e23_df = pd.read_pickle(f)
    
    print(f"✓ Loaded E23 data: {len(e23_df)} rows, {len(e23_df.columns)} columns")
    # Reports presence of the two columns the matrix builders read; it does not enforce it.
    print(f"✓ Required columns present: {'expressions' in e23_df.columns}, {'test_auc' in e23_df.columns}")
    
    if 'expressions' in e23_df.columns:
        print(f"\nSample expressions:")
        for i, expr in enumerate(e23_df['expressions'].head(3), 1):
            print(f"  {i}. {expr}")
    
except Exception as e:
    # Any load problem is reported and replaced by an empty frame so the
    # branch below skips the analysis instead of crashing.
    print(f"✗ Error loading E23 data: {e}")
    e23_df = pd.DataFrame()

if not e23_df.empty:
    # Create co-occurrence matrix for E23
    e23_cooccur = create_cooccurrence_matrix(e23_df, E23, 'E23')
    
    # Create performance matrix for E23
    e23_performance = create_performance_matrix(e23_df, E23, 'E23')
    
    # Plot E23 co-occurrence heatmap
    cooccur_save_path = OUTPUT_PATH / 'GTZAN_E23_Cooccurrence_Matrix.png'
    plot_cooccurrence_heatmap(e23_cooccur, 'E23', 'GTZAN', cooccur_save_path)
    
    # Plot E23 performance heatmap
    perf_save_path = OUTPUT_PATH / 'GTZAN_E23_Performance_Matrix.png'
    plot_performance_heatmap(e23_performance, 'E23', 'GTZAN', perf_save_path)
    
else:
    print("⚠️ Skipping E23 analysis due to data loading issues")
    # Empty placeholders keep the summary cell's `.empty` checks working.
    e23_cooccur = pd.DataFrame()
    e23_performance = pd.DataFrame()

## 6. Process ALL62 Feature Set

In [None]:
# Load ALL62 data
# Reads the ALL62 pickle, then builds and plots its co-occurrence and performance
# matrices. On any failure the matrices are left as empty DataFrames.
print("\n" + "=" * 80)
print("PROCESSING ALL62 FEATURE SET")
print("=" * 80)

try:
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(INPUT_FILES['ALL62'], 'rb') as f:
        all62_df = pd.read_pickle(f)
    
    print(f"✓ Loaded ALL62 data: {len(all62_df)} rows, {len(all62_df.columns)} columns")
    # Reports presence of the two columns the matrix builders read; it does not enforce it.
    print(f"✓ Required columns present: {'expressions' in all62_df.columns}, {'test_auc' in all62_df.columns}")
    
    if 'expressions' in all62_df.columns:
        print(f"\nSample expressions:")
        for i, expr in enumerate(all62_df['expressions'].head(3), 1):
            print(f"  {i}. {expr}")
    
except Exception as e:
    # Any load problem is reported and replaced by an empty frame so the
    # branch below skips the analysis instead of crashing.
    print(f"✗ Error loading ALL62 data: {e}")
    all62_df = pd.DataFrame()

if not all62_df.empty:
    # Create co-occurrence matrix for ALL62
    all62_cooccur = create_cooccurrence_matrix(all62_df, ALL62, 'ALL62')
    
    # Create performance matrix for ALL62
    all62_performance = create_performance_matrix(all62_df, ALL62, 'ALL62')
    
    # Plot ALL62 co-occurrence heatmap
    cooccur_save_path = OUTPUT_PATH / 'GTZAN_ALL62_Cooccurrence_Matrix.png'
    plot_cooccurrence_heatmap(all62_cooccur, 'ALL62', 'GTZAN', cooccur_save_path)
    
    # Plot ALL62 performance heatmap
    perf_save_path = OUTPUT_PATH / 'GTZAN_ALL62_Performance_Matrix.png'
    plot_performance_heatmap(all62_performance, 'ALL62', 'GTZAN', perf_save_path)
    
else:
    print("⚠️ Skipping ALL62 analysis due to data loading issues")
    # Empty placeholders keep the summary cell's `.empty` checks working.
    all62_cooccur = pd.DataFrame()
    all62_performance = pd.DataFrame()

## 7. Summary and Export Results

In [None]:
# Final report: prints one summary row per feature set that was processed,
# lists the matrix plots found in OUTPUT_PATH and exports the matrices as CSV.
print("\n" + "=" * 80)
print("GTZAN EXPRESSIONS ANALYSIS SUMMARY")
print("=" * 80)

print(f"Analysis completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Output directory: {OUTPUT_PATH}")

# Summary statistics
# One dict per feature set; a set is skipped when its data or matrix is empty.
results_summary = []

if not e23_df.empty and not e23_cooccur.empty:
    # Sum over the whole matrix, so each off-diagonal pair is included twice
    # (once per mirrored cell).
    e23_total_cooccur = e23_cooccur.values.sum()
    e23_nonzero_cells = (e23_cooccur.values > 0).sum()
    # Mean over strictly positive cells only; NaN when there are none,
    # which is rendered as 'N/A' below.
    e23_avg_perf = e23_performance.values[e23_performance.values > 0].mean()
    
    results_summary.append({
        'Feature Set': 'E23',
        'Input Rows': len(e23_df),
        'Matrix Size': f'{len(E23)}x{len(E23)}',
        'Total Co-occurrences': f'{e23_total_cooccur:,}',
        'Active Cells': f'{e23_nonzero_cells:,}',
        'Avg Performance': f'{e23_avg_perf:.6f}' if not np.isnan(e23_avg_perf) else 'N/A',
        'Status': '✓ Success'
    })

if not all62_df.empty and not all62_cooccur.empty:
    # Same statistics as above, for the ALL62 matrices.
    all62_total_cooccur = all62_cooccur.values.sum()
    all62_nonzero_cells = (all62_cooccur.values > 0).sum()
    all62_avg_perf = all62_performance.values[all62_performance.values > 0].mean()
    
    results_summary.append({
        'Feature Set': 'ALL62',
        'Input Rows': len(all62_df),
        'Matrix Size': f'{len(ALL62)}x{len(ALL62)}',
        'Total Co-occurrences': f'{all62_total_cooccur:,}',
        'Active Cells': f'{all62_nonzero_cells:,}',
        'Avg Performance': f'{all62_avg_perf:.6f}' if not np.isnan(all62_avg_perf) else 'N/A',
        'Status': '✓ Success'
    })

if results_summary:
    summary_df = pd.DataFrame(results_summary)
    print("\nAnalysis Results:")
    print(summary_df.to_string(index=False))

# List output files
# The glob matches every 'GTZAN_*_Matrix.png' in OUTPUT_PATH, so files left by
# earlier runs are listed as well.
print(f"\nOutput Files Generated:")
output_files = list(OUTPUT_PATH.glob('GTZAN_*_Matrix.png'))
if output_files:
    for file_path in sorted(output_files):
        file_size = file_path.stat().st_size
        print(f"  {file_path.name} ({file_size:,} bytes)")
else:
    print("  No matrix plots generated")

# Save matrices as CSV for further analysis
# A single timestamp is shared by all CSV names written in this cell.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

if not e23_cooccur.empty:
    e23_cooccur_path = OUTPUT_PATH / f'GTZAN_E23_cooccurrence_matrix_{timestamp}.csv'
    e23_cooccur.to_csv(e23_cooccur_path)
    
    e23_perf_path = OUTPUT_PATH / f'GTZAN_E23_performance_matrix_{timestamp}.csv'
    e23_performance.to_csv(e23_perf_path)
    
    print(f"\n✓ E23 matrices saved as CSV files")

if not all62_cooccur.empty:
    all62_cooccur_path = OUTPUT_PATH / f'GTZAN_ALL62_cooccurrence_matrix_{timestamp}.csv'
    all62_cooccur.to_csv(all62_cooccur_path)
    
    all62_perf_path = OUTPUT_PATH / f'GTZAN_ALL62_performance_matrix_{timestamp}.csv'
    all62_performance.to_csv(all62_perf_path)
    
    print(f"✓ ALL62 matrices saved as CSV files")

print("\n" + "=" * 80)
print("GTZAN EXPRESSIONS ANALYSIS COMPLETED!")
print("=" * 80)

## 8. Operation vs Base Feature Analysis

This section analyzes how mathematical operations co-occur with base features.

**Analysis:**
1. Extract operations from expressions
2. Create operation-feature co-occurrence matrix (count)
3. Create operation-feature performance matrix (average test_auc)
4. Generate heatmap visualizations

In [None]:
# Define all operations from the genetic programming primitive set
OPERATIONS = [
    'add', 'sub', 'mul', 'div', 'log', 'sqrt', 'exp', 'abs', 'inv',
    'sin', 'cos', 'tan', 'sinh', 'cosh', 'tanh', 'pow', 'square',
    'min', 'max', 'sigmoid', 'relu', 'lrelu', 'swish', 'if_else'
]

def extract_operations(expression):
    """
    Return the operations that are called in a GP expression.

    An operation counts as present when its name, starting at a word boundary,
    is followed (optionally after whitespace) by an opening parenthesis. The
    expression is split on '|' and every non-empty part is searched.

    Args:
        expression (str): Expression string; NaN, None and empty values give an empty set.

    Returns:
        set: Names from ``OPERATIONS`` found in the expression.
    """
    if pd.isna(expression) or not expression:
        return set()

    # '|' separates several expressions stored in the same string
    segments = [
        piece.strip()
        for piece in str(expression).strip().split('|')
        if piece.strip()
    ]

    def _is_called(op):
        # Name at a word boundary, then optional whitespace and '('
        pattern = r'\b' + re.escape(op) + r'\s*\('
        return any(re.search(pattern, segment) for segment in segments)

    return {op for op in OPERATIONS if _is_called(op)}


def test_operation_extraction():
    """Print the operations detected in a few sample expressions."""
    print("Testing operation extraction function...")

    # Samples cover nested calls, activation functions and the multi-argument if_else
    test_cases = (
        "div(max(sub_sub_dom, glob_dom_dom), Danceability)",
        "min(log(Beats_Loud), sub_sub_dom)",
        "add(mul(Spectral_Centroid, sin(BPM)), sqrt(Loudness))",
        "sigmoid(relu(Danceability))",
        "if_else(sub(Mode, Atonality), pow(Length, 2), abs(Dissonance))",
    )

    for i, expr in enumerate(test_cases, start=1):
        found = extract_operations(expr)
        print(f"\nTest {i}: {expr}")
        print(f"Found operations: {sorted(list(found))}")


# Run the demo once when the cell executes, then report the vocabulary size
test_operation_extraction()
print(f"\nTotal operations to analyze: {len(OPERATIONS)}")

In [None]:
def create_operation_feature_cooccurrence_matrix(df, base_features, operations, feature_set_name):
    """
    Create co-occurrence matrix: operations (rows) vs base features (columns).

    Each expression adds 1 to every (operation, feature) cell for which both
    the operation and the feature occur in that expression (binary presence,
    not the number of occurrences).

    Args:
        df (pd.DataFrame): DataFrame with 'expressions' column
        base_features (list): List of base feature names (matrix columns)
        operations (list): List of operation names (matrix rows); operations
            detected in an expression but absent from this list are ignored
        feature_set_name (str): Name of the feature set for logging

    Returns:
        pd.DataFrame: Operation-feature co-occurrence matrix
    """
    print(f"\n=== Creating Operation-Feature Co-occurrence Matrix for {feature_set_name} ===")

    # Initialize matrix: operations (rows) x features (columns)
    n_operations = len(operations)
    n_features = len(base_features)
    cooccur_matrix = np.zeros((n_operations, n_features), dtype=int)

    # Create mappings
    operation_to_idx = {op: i for i, op in enumerate(operations)}
    feature_to_idx = {feat: i for i, feat in enumerate(base_features)}

    # Process each expression
    total_expressions = 0
    expressions_with_content = 0

    for _, row in df.iterrows():
        expression = row['expressions']
        total_expressions += 1

        # extract_operations() searches its own module-level operation list,
        # which may contain names missing from ``operations``; indexing those
        # directly would raise KeyError, so only the requested rows are kept.
        op_rows = [
            operation_to_idx[op]
            for op in extract_operations(expression)
            if op in operation_to_idx
        ]
        feat_cols = [
            feature_to_idx[feat]
            for feat in extract_base_features(expression, base_features)
        ]

        if op_rows and feat_cols:
            expressions_with_content += 1
            # +1 on the whole (operation x feature) sub-grid of this expression
            cooccur_matrix[np.ix_(op_rows, feat_cols)] += 1

    # Create DataFrame
    cooccur_df = pd.DataFrame(cooccur_matrix,
                              index=operations,
                              columns=base_features)

    print(f"✓ Processed {total_expressions} expressions")
    print(f"✓ {expressions_with_content} expressions had both operations and features")
    print(f"✓ Co-occurrence matrix shape: {cooccur_df.shape}")
    print(f"✓ Total co-occurrences: {cooccur_matrix.sum():,}")

    # Show top operation-feature pairs (stable sort keeps matrix order among ties)
    top_pairs = sorted(
        (
            (op, feat, cooccur_matrix[i, j])
            for i, op in enumerate(operations)
            for j, feat in enumerate(base_features)
            if cooccur_matrix[i, j] > 0
        ),
        key=lambda entry: entry[2],
        reverse=True,
    )

    print(f"\nTop 10 operation-feature co-occurrences:")
    for i, (op, feat, count) in enumerate(top_pairs[:10], 1):
        print(f"  {i:2d}. {op} + {feat}: {count} times")

    return cooccur_df


def create_operation_feature_performance_matrix(df, base_features, operations, feature_set_name):
    """
    Create performance matrix: operations (rows) vs base features (columns).
    Shows average test_auc for expressions containing each operation-feature combination.

    Rows whose test_auc is NaN are skipped. Cells with no contributing
    expression are 0.

    Args:
        df (pd.DataFrame): DataFrame with 'expressions' and 'test_auc' columns
        base_features (list): List of base feature names (matrix columns)
        operations (list): List of operation names (matrix rows); operations
            detected in an expression but absent from this list are ignored
        feature_set_name (str): Name of the feature set for logging

    Returns:
        pd.DataFrame: Operation-feature matrix of average test_auc
    """
    print(f"\n=== Creating Operation-Feature Performance Matrix for {feature_set_name} ===")

    # Initialize matrices for sum and count
    n_operations = len(operations)
    n_features = len(base_features)
    perf_sum_matrix = np.zeros((n_operations, n_features), dtype=float)
    perf_count_matrix = np.zeros((n_operations, n_features), dtype=int)

    # Create mappings
    operation_to_idx = {op: i for i, op in enumerate(operations)}
    feature_to_idx = {feat: i for i, feat in enumerate(base_features)}

    # Process each expression
    valid_expressions = 0

    for _, row in df.iterrows():
        expression = row['expressions']
        test_auc = row['test_auc']

        if pd.isna(test_auc):
            continue

        # extract_operations() searches its own module-level operation list,
        # which may contain names missing from ``operations``; indexing those
        # directly would raise KeyError, so only the requested rows are kept.
        op_rows = [
            operation_to_idx[op]
            for op in extract_operations(expression)
            if op in operation_to_idx
        ]
        feat_cols = [
            feature_to_idx[feat]
            for feat in extract_base_features(expression, base_features)
        ]

        if op_rows and feat_cols:
            valid_expressions += 1
            # Accumulate on the whole (operation x feature) sub-grid of this expression
            cells = np.ix_(op_rows, feat_cols)
            perf_sum_matrix[cells] += test_auc
            perf_count_matrix[cells] += 1

    # Calculate average performance; cells with a zero count stay at 0
    perf_avg_matrix = np.divide(perf_sum_matrix, perf_count_matrix,
                                out=np.zeros_like(perf_sum_matrix),
                                where=perf_count_matrix != 0)

    # Create DataFrame
    perf_df = pd.DataFrame(perf_avg_matrix,
                           index=operations,
                           columns=base_features)

    print(f"✓ Processed {valid_expressions} valid expressions with performance data")
    print(f"✓ Performance matrix shape: {perf_df.shape}")
    print(f"✓ Non-zero cells: {(perf_avg_matrix > 0).sum():,}")

    # Show statistics over the strictly positive cells
    non_zero_values = perf_avg_matrix[perf_avg_matrix > 0]
    if len(non_zero_values) > 0:
        print(f"\nPerformance statistics:")
        print(f"  Mean performance: {non_zero_values.mean():.6f}")
        print(f"  Std performance: {non_zero_values.std():.6f}")
        print(f"  Min performance: {non_zero_values.min():.6f}")
        print(f"  Max performance: {non_zero_values.max():.6f}")

    return perf_df

print("Operation-feature matrix functions defined successfully")

In [None]:
def plot_operation_feature_heatmap(matrix_df, matrix_type, feature_set_name, dataset_name, save_path=None):
    """
    Draw an annotated seaborn heatmap of an operation-by-feature matrix.

    Cells equal to zero are masked out. ``matrix_type == 'Cooccurrence'``
    selects integer annotations on the 'YlOrRd' palette; any other value is
    handled as a performance matrix, annotated to four decimals on
    'RdYlBu_r' with the colour range spanning the positive cells only.

    Args:
        matrix_df (pd.DataFrame): Matrix to plot; the y axis is labelled
            'Operations' and the x axis 'Base Features'.
        matrix_type (str): 'Cooccurrence' or 'Performance'; also shown in the title.
        feature_set_name (str): Feature-set label for the title (E23/ALL62).
        dataset_name (str): Dataset label for the title (GTZAN/MTG-Jamendo).
        save_path (Path): Where to write the figure; nothing is saved when falsy.
    """
    # Grow the canvas with the matrix, but never below 12 x 8 inches.
    canvas = (
        max(12, len(matrix_df.columns) * 0.4),
        max(8, len(matrix_df.index) * 0.3),
    )
    plt.figure(figsize=canvas)

    # Options shared by both matrix types; zero cells are hidden by the mask.
    heatmap_options = {
        'annot': True,
        'linewidths': 0.1,
        'mask': matrix_df == 0,
    }

    if matrix_type == 'Cooccurrence':
        heatmap_options.update(
            fmt='d',
            cmap='YlOrRd',
            cbar_kws={'label': 'Co-occurrence Count'},
        )
    else:
        # Performance: scale colours over the positive cells only, falling
        # back to the 0..1 range when there are none.
        raw = matrix_df.values
        positive = raw[raw > 0]
        has_positive = len(positive) > 0
        heatmap_options.update(
            fmt='.4f',
            cmap='RdYlBu_r',
            vmin=positive.min() if has_positive else 0,
            vmax=positive.max() if has_positive else 1,
            cbar_kws={'label': 'Average test_auc'},
        )

    sns.heatmap(matrix_df, **heatmap_options)

    plt.title(
        f'{dataset_name} {feature_set_name}: Operation-Feature {matrix_type} Matrix',
        fontsize=16,
        fontweight='bold',
        pad=20,
    )
    plt.xlabel('Base Features', fontsize=14)
    plt.ylabel('Operations', fontsize=14)

    # Slanted column labels stay legible when there are many features.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"✓ Operation-feature {matrix_type.lower()} heatmap saved: {save_path}")

    plt.show()

# Progress message emitted once this notebook cell has finished executing.
print("Operation-feature visualization function defined successfully")

### 8.1. Process E23 Operation-Feature Analysis

In [None]:
# E23 operation-feature analysis: build the co-occurrence and performance
# matrices, then plot and save one heatmap for each. When the E23 frame is
# empty, both results are set to empty DataFrames so later cells can test
# `.empty` on them.
if e23_df.empty:
    print("⚠️ Skipping E23 operation-feature analysis due to data loading issues")
    e23_op_feat_cooccur = pd.DataFrame()
    e23_op_feat_performance = pd.DataFrame()
else:
    print("=" * 60, "PROCESSING E23 OPERATION-FEATURE ANALYSIS", "=" * 60, sep="\n")

    # Matrices first (co-occurrence, then performance) ...
    e23_op_feat_cooccur = create_operation_feature_cooccurrence_matrix(e23_df, E23, OPERATIONS, 'E23')
    e23_op_feat_performance = create_operation_feature_performance_matrix(e23_df, E23, OPERATIONS, 'E23')

    # ... then the two heatmaps, in the same order.
    cooccur_save_path = OUTPUT_PATH / 'GTZAN_E23_Operation_Feature_Cooccurrence_Matrix.png'
    plot_operation_feature_heatmap(
        matrix_df=e23_op_feat_cooccur,
        matrix_type='Cooccurrence',
        feature_set_name='E23',
        dataset_name='GTZAN',
        save_path=cooccur_save_path,
    )

    perf_save_path = OUTPUT_PATH / 'GTZAN_E23_Operation_Feature_Performance_Matrix.png'
    plot_operation_feature_heatmap(
        matrix_df=e23_op_feat_performance,
        matrix_type='Performance',
        feature_set_name='E23',
        dataset_name='GTZAN',
        save_path=perf_save_path,
    )

### 8.2. Process ALL62 Operation-Feature Analysis

In [None]:
# ALL62 operation-feature analysis: build the co-occurrence and performance
# matrices, then plot and save one heatmap for each. When the ALL62 frame is
# empty, both results are set to empty DataFrames so later cells can test
# `.empty` on them.
if all62_df.empty:
    print("⚠️ Skipping ALL62 operation-feature analysis due to data loading issues")
    all62_op_feat_cooccur = pd.DataFrame()
    all62_op_feat_performance = pd.DataFrame()
else:
    print("\n" + "=" * 60, "PROCESSING ALL62 OPERATION-FEATURE ANALYSIS", "=" * 60, sep="\n")

    # Matrices first (co-occurrence, then performance) ...
    all62_op_feat_cooccur = create_operation_feature_cooccurrence_matrix(all62_df, ALL62, OPERATIONS, 'ALL62')
    all62_op_feat_performance = create_operation_feature_performance_matrix(all62_df, ALL62, OPERATIONS, 'ALL62')

    # ... then the two heatmaps, in the same order.
    cooccur_save_path = OUTPUT_PATH / 'GTZAN_ALL62_Operation_Feature_Cooccurrence_Matrix.png'
    plot_operation_feature_heatmap(
        matrix_df=all62_op_feat_cooccur,
        matrix_type='Cooccurrence',
        feature_set_name='ALL62',
        dataset_name='GTZAN',
        save_path=cooccur_save_path,
    )

    perf_save_path = OUTPUT_PATH / 'GTZAN_ALL62_Operation_Feature_Performance_Matrix.png'
    plot_operation_feature_heatmap(
        matrix_df=all62_op_feat_performance,
        matrix_type='Performance',
        feature_set_name='ALL62',
        dataset_name='GTZAN',
        save_path=perf_save_path,
    )

### 8.3. Export Operation-Feature Analysis Results

In [None]:
# Export step: write the operation-feature matrices to timestamped CSV files
# and list the operation-feature PNG plots found in OUTPUT_PATH.
print("\n" + "=" * 80, "OPERATION-FEATURE ANALYSIS SUMMARY", "=" * 80, sep="\n")

# One timestamp shared by every CSV written in this run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# The co-occurrence matrix being non-empty gates both CSVs of a feature set.
if not e23_op_feat_cooccur.empty:
    e23_op_cooccur_path = OUTPUT_PATH / f'GTZAN_E23_operation_feature_cooccurrence_{timestamp}.csv'
    e23_op_perf_path = OUTPUT_PATH / f'GTZAN_E23_operation_feature_performance_{timestamp}.csv'
    e23_op_feat_cooccur.to_csv(e23_op_cooccur_path)
    e23_op_feat_performance.to_csv(e23_op_perf_path)

    print(f"✓ E23 operation-feature matrices saved as CSV files")

if not all62_op_feat_cooccur.empty:
    all62_op_cooccur_path = OUTPUT_PATH / f'GTZAN_ALL62_operation_feature_cooccurrence_{timestamp}.csv'
    all62_op_perf_path = OUTPUT_PATH / f'GTZAN_ALL62_operation_feature_performance_{timestamp}.csv'
    all62_op_feat_cooccur.to_csv(all62_op_cooccur_path)
    all62_op_feat_performance.to_csv(all62_op_perf_path)

    print(f"✓ ALL62 operation-feature matrices saved as CSV files")

# Report every matching plot present in the output directory, with its size.
print(f"\nNew Operation-Feature Files Generated:")
op_feat_files = list(OUTPUT_PATH.glob('GTZAN_*_Operation_Feature_*.png'))
if not op_feat_files:
    print("  No operation-feature plots generated")
else:
    for file_path in sorted(op_feat_files):
        file_size = file_path.stat().st_size
        print(f"  {file_path.name} ({file_size:,} bytes)")

print("\n" + "=" * 80, "OPERATION-FEATURE ANALYSIS COMPLETED!", "=" * 80, sep="\n")

In [None]:
def _canonical_pair(a, b):
    """Return (a, b) as a 2-tuple ordered by the string form of each element.

    Comparing ``str(x)`` instead of the raw values keeps labels of different
    types comparable; elements whose string forms are equal keep their
    argument order.
    """
    first, second = sorted([a, b], key=str)
    return (first, second)

def get_top_k_cells(df: pd.DataFrame, k: int = 20, *,
                    dropna: bool = True,
                    keep_all_ties: bool = False,
                    symmetric: bool = False,
                    include_diagonal: bool = True) -> pd.DataFrame:
    """
    Return the k highest-valued cells of ``df`` with coordinates and dense ranks.

    Args:
        df: Matrix whose cells are ranked; index labels become ``row`` and
            column labels become ``col``.
        k: Number of cells to keep. ``k <= 0`` yields an empty result.
        dropna: Drop NaN cells before ranking. When False, NaN cells are kept
            and ranked after every real value.
        keep_all_ties: Also keep every cell whose value equals the k-th value,
            so the result may contain more than k rows.
        symmetric: Treat (i, j) and (j, i) as one cell. Duplicates are collapsed
            to the canonical pair from ``_canonical_pair`` using the MAX of the
            two values (robust to small asymmetries).
        include_diagonal: Only used when ``symmetric`` is True; set False to
            exclude (i, i) cells.

    Returns:
        DataFrame with columns ``rank, row, col, value`` ordered by rank, where
        rank 1 is the highest value and ties share a rank (dense ranking).
        The frame is empty (same columns) when there is nothing to rank.
    """
    empty = pd.DataFrame(columns=["rank", "row", "col", "value"])
    if df.empty or k <= 0:
        # k <= 0 previously wrapped around to index -1 in the tie handling
        # and returned every cell.
        return empty

    # Flatten to one record per cell in row-major order. This is built by hand
    # instead of DataFrame.stack(dropna=...): that keyword is deprecated since
    # pandas 2.1 and is rejected by the new stack implementation.
    coords = pd.MultiIndex.from_product([df.index, df.columns], names=["row", "col"])
    cells = pd.Series(df.to_numpy().ravel(), index=coords, name="value")
    if dropna:
        cells = cells.dropna()
    cells = cells.reset_index()

    if symmetric:
        if not include_diagonal:
            cells = cells[cells["row"] != cells["col"]]
        if cells.empty:
            return empty
        # A plain list comprehension (not DataFrame.apply) so the key column
        # is always a 1-D object Series of tuples.
        canon = pd.Series(
            [_canonical_pair(r, c) for r, c in zip(cells["row"], cells["col"])],
            index=cells.index,
            dtype=object,
        )
        # Aggregate duplicates by MAX (robust if matrix is not perfectly symmetric)
        cells = (cells.assign(canon=canon)
                      .groupby("canon", as_index=False)
                      .agg(value=("value", "max")))
        # Expand canon -> row, col (canonical order)
        cells["row"] = [pair[0] for pair in cells["canon"]]
        cells["col"] = [pair[1] for pair in cells["canon"]]
        cells = cells[["row", "col", "value"]]

    if cells.empty:
        return empty

    ordered = cells.sort_values("value", ascending=False)
    if keep_all_ties:
        kth_value = ordered["value"].iloc[min(k, len(ordered)) - 1]
        if pd.isna(kth_value):
            # The cut-off fell among NaN cells (only possible with
            # dropna=False): every real value is already inside the top k.
            top = ordered
        else:
            top = ordered[ordered["value"] >= kth_value]
    else:
        top = ordered.iloc[:k]

    # Dense rank: highest value => rank 1. NaN cells are ranked last so the
    # integer cast cannot fail.
    top = top.copy()
    top["rank"] = (top["value"]
                   .rank(method="dense", ascending=False, na_option="bottom")
                   .astype(int))
    return (top[["rank", "row", "col", "value"]]
            .sort_values(["rank", "value"], ascending=[True, False])
            .reset_index(drop=True))


def compare_pair(pair_name: str, df1: pd.DataFrame, df2: pd.DataFrame, k: int = 20, *,
                 keep_all_ties: bool = False,
                 symmetric: bool = False,
                 include_diagonal: bool = True,
                 validate_same_shape: bool = True) -> dict:
    """
    Compare the top-k cells of two matrices and report which coordinates overlap.

    Steps:
      1) Get top-k cells in each df via ``get_top_k_cells`` (respecting
         symmetry if needed).
      2) Find the coordinates present in both top-k lists.
         - If symmetric=True, overlap compares unordered pairs.
         - Otherwise, overlap compares ordered (row, col).
      3) Print a short summary (counts plus the first 10 overlap rows).

    Args:
        pair_name: Label used in the printed summary, the assertion messages
            and the returned dict.
        df1, df2: Matrices to compare.
        k: Number of top cells taken from each matrix.
        keep_all_ties: Forwarded to ``get_top_k_cells``.
        symmetric: Treat the matrices as self-matrices where (i, j) == (j, i).
        include_diagonal: Forwarded to ``get_top_k_cells``.
        validate_same_shape: Assert that both matrices carry the same row and
            column label sets (and, when symmetric, that rows and columns share
            one label set). These are ``assert`` statements, so they are
            skipped when Python runs with ``-O``.

    Returns:
        dict with keys ``pair_name``, ``top_df1``, ``top_df2`` and ``overlap``;
        ``overlap`` has columns row, col, rank_df1, value_df1, rank_df2,
        value_df2 and is sorted by rank_df1, then rank_df2.

    Raises:
        AssertionError: If ``validate_same_shape`` is True and the label sets
            do not match.
    """
    if validate_same_shape:
        assert set(df1.index) == set(df2.index), f"{pair_name}: row index sets differ"
        assert set(df1.columns) == set(df2.columns), f"{pair_name}: column index sets differ"
        if symmetric:
            # Self-matrix sanity: same label sets on rows and cols
            assert set(df1.index) == set(df1.columns), f"{pair_name}: df1 not self-matrix"
            assert set(df2.index) == set(df2.columns), f"{pair_name}: df2 not self-matrix"

    top1 = get_top_k_cells(df1, k=k, keep_all_ties=keep_all_ties,
                           symmetric=symmetric, include_diagonal=include_diagonal)
    top2 = get_top_k_cells(df2, k=k, keep_all_ties=keep_all_ties,
                           symmetric=symmetric, include_diagonal=include_diagonal)

    if not symmetric:
        # Ordered overlap
        overlap = top1.merge(top2, on=["row", "col"], suffixes=("_df1", "_df2"))
    else:
        # Unordered overlap via canonical key. The key is built with a list
        # comprehension because a row-wise DataFrame.apply on an empty top-k
        # frame returns a DataFrame, which cannot be stored in one column.
        def _with_canon(top: pd.DataFrame) -> pd.DataFrame:
            keyed = top.copy()
            keyed["canon"] = pd.Series(
                [_canonical_pair(r, c) for r, c in zip(keyed["row"], keyed["col"])],
                index=keyed.index,
                dtype=object,
            )
            return keyed

        overlap = _with_canon(top1).merge(_with_canon(top2), on="canon",
                                          suffixes=("_df1", "_df2"))
        # Expand canon into row/col in canonical order for display
        overlap["row"] = [pair[0] for pair in overlap["canon"]]
        overlap["col"] = [pair[1] for pair in overlap["canon"]]

    # Tidy columns and ordering (identical for both modes)
    overlap = overlap[["row", "col", "rank_df1", "value_df1", "rank_df2", "value_df2"]]
    overlap = overlap.sort_values(["rank_df1", "rank_df2"], ascending=[True, True]).reset_index(drop=True)

    # Compact summary
    print(f"\n=== {pair_name} ===")
    print(f"Top {k} in df1: {len(top1)} rows  |  Top {k} in df2: {len(top2)} rows")
    print(f"Overlap count (same {'unordered' if symmetric else 'ordered'} coordinates): {len(overlap)}")
    if len(overlap):
        print(overlap.head(10).to_string(index=False))

    return {
        "pair_name": pair_name,
        "top_df1": top1,
        "top_df2": top2,
        "overlap": overlap
    }


# -------------------------
# Apply to your four pairs
#   Pair 1/2: directional (not symmetric)
#   Pair 3/4: symmetric (self-matrix)  -> treat (i,j) == (j,i)
# -------------------------

# Settings shared by all four comparisons.
K = 110
KEEP_ALL_TIES = False
INCLUDE_DIAGONAL = True  # set False to drop (i,i) in symmetric matrices

# Each entry: (label, co-occurrence matrix, performance matrix, symmetric?)
pairs = [
    (
        "pair 1: e23_op_feat (cooccur vs performance)",
        e23_op_feat_cooccur,
        e23_op_feat_performance,
        False,
    ),
    (
        "pair 2: all62_op_feat (cooccur vs performance)",
        all62_op_feat_cooccur,
        all62_op_feat_performance,
        False,
    ),
    (
        "pair 3: e23 (cooccur vs performance)",
        e23_cooccur,
        e23_performance,
        True,
    ),
    (
        "pair 4: all62 (cooccur vs performance)",
        all62_cooccur,
        all62_performance,
        True,
    ),
]

# Run every comparison and keep the result dict under its label.
pair_results = {}
for name, A, B, is_symmetric in pairs:
    pair_results[name] = compare_pair(
        pair_name=name,
        df1=A,
        df2=B,
        k=K,
        keep_all_ties=KEEP_ALL_TIES,
        symmetric=is_symmetric,
        include_diagonal=INCLUDE_DIAGONAL,
    )

# Optional: save CSVs
# for name, res in pair_results.items():
#     tag = name.split(":")[0].replace(" ", "_")
#     res["top_df1"].to_csv(f"{tag}_top{K}_df1.csv", index=False)
#     res["top_df2"].to_csv(f"{tag}_top{K}_df2.csv", index=False)
#     res["overlap"].to_csv(f"{tag}_top{K}_overlap.csv", index=False)

In [None]:
def _canonical_pair(a, b):
    """Return (a, b) as a 2-tuple ordered by the string form of each element.

    Used as the unordered key for symmetric matrices; elements whose string
    forms are equal keep their argument order.
    """
    first, second = sorted([a, b], key=str)
    return (first, second)

def _rank_all_cells(df: pd.DataFrame, *, symmetric: bool, include_diagonal: bool, dropna: bool=True) -> pd.DataFrame:
    """
    Compute GLOBAL dense ranks (1 = highest) for ALL cells of a DataFrame.

    Args:
        df: Matrix whose cells are ranked; index labels become ``row`` and
            column labels become ``col``.
        symmetric: Collapse (i, j) and (j, i) into the canonical pair from
            ``_canonical_pair`` and keep the MAX of the two values.
        include_diagonal: Only used when ``symmetric`` is True; set False to
            exclude (i, i) cells.
        dropna: Drop NaN cells before ranking. When False, NaN cells are kept
            and ranked after every real value.

    Returns:
        DataFrame with columns ``row, col, value, rank`` sorted by rank and
        then by descending value; empty (same columns) when there is nothing
        to rank.
    """
    empty = pd.DataFrame(columns=["row", "col", "value", "rank"])
    if df.empty:
        return empty

    # Flatten to one record per cell in row-major order. This is built by hand
    # instead of DataFrame.stack(dropna=...): that keyword is deprecated since
    # pandas 2.1 and is rejected by the new stack implementation.
    coords = pd.MultiIndex.from_product([df.index, df.columns], names=["row", "col"])
    cells = pd.Series(df.to_numpy().ravel(), index=coords, name="value")
    if dropna:
        cells = cells.dropna()
    cells = cells.reset_index()

    if symmetric:
        if not include_diagonal:
            cells = cells[cells["row"] != cells["col"]]
        if cells.empty:
            return empty
        # A plain list comprehension (not DataFrame.apply) so the key column
        # is always a 1-D object Series of tuples.
        canon = pd.Series(
            [_canonical_pair(r, c) for r, c in zip(cells["row"], cells["col"])],
            index=cells.index,
            dtype=object,
        )
        cells = (cells.assign(canon=canon)
                      .groupby("canon", as_index=False)
                      .agg(value=("value", "max")))
        cells["row"] = [pair[0] for pair in cells["canon"]]
        cells["col"] = [pair[1] for pair in cells["canon"]]

    if cells.empty:
        return empty

    # Work on an independent copy so adding the rank column never writes
    # into a slice of another frame.
    out = cells[["row", "col", "value"]].copy()
    # NaN cells (dropna=False) are ranked last so the integer cast cannot fail.
    out["rank"] = (out["value"]
                   .rank(method="dense", ascending=False, na_option="bottom")
                   .astype(int))
    return out.sort_values(["rank", "value"], ascending=[True, False]).reset_index(drop=True)

def top_m_by_df2_with_df1_rank(df1: pd.DataFrame, df2: pd.DataFrame, *,
                               m: int = 20,
                               symmetric: bool = False,
                               include_diagonal: bool = True) -> pd.DataFrame:
    """
    List df2's best-ranked cells next to df1's rank and value for the same cell.

    Both matrices are ranked over all of their cells with ``_rank_all_cells``
    (dense ranks, 1 = highest). Every df2 cell whose rank is <= ``m`` is kept,
    so all cells tied at a kept rank are included, and df1's rank/value are
    attached with a left join on (row, col). Cells missing from df1's ranking
    get NaN in the df1 columns.

    Args:
        df1: Matrix whose ranks are looked up.
        df2: Matrix that drives the selection.
        m: Largest df2 dense rank to keep.
        symmetric: Forwarded to ``_rank_all_cells`` (unordered coordinates).
        include_diagonal: Forwarded to ``_rank_all_cells``.

    Returns:
        DataFrame with columns rank_df2, row, col, value_df2, rank_df1,
        value_df1, df1_percentile, df2_percentile, sorted by rank_df2 and then
        rank_df1 (missing df1 ranks last). Each percentile column is the dense
        rank divided by the number of ranked cells of that matrix, so it only
        reaches 1 when no values are tied.
    """
    # Rank df1 first, then df2, tagging the value/rank columns with the source.
    ranked = {}
    for tag, frame in (("df1", df1), ("df2", df2)):
        table = _rank_all_cells(frame, symmetric=symmetric, include_diagonal=include_diagonal)
        ranked[tag] = table.rename(columns={"value": f"value_{tag}", "rank": f"rank_{tag}"})
    lookup, driver = ranked["df1"], ranked["df2"]

    # df2's global top-m; dense ranks bring in ties automatically.
    selected = driver.loc[driver["rank_df2"] <= m].copy()

    # Attach df1's global rank/value for the same coordinates.
    joined = selected.merge(lookup, how="left", on=["row", "col"])

    # Rank relative to the number of ranked cells (smaller is better).
    joined["df1_percentile"] = joined["rank_df1"] / len(lookup)
    joined["df2_percentile"] = joined["rank_df2"] / len(driver)

    column_order = ["rank_df2", "row", "col", "value_df2",
                    "rank_df1", "value_df1", "df1_percentile", "df2_percentile"]
    joined = joined[column_order]
    joined = joined.sort_values(["rank_df2", "rank_df1"], ascending=[True, True], na_position="last")
    return joined.reset_index(drop=True)

# Choose m
# Largest df2 dense rank kept in each table, and the diagonal setting shared
# by all four calls.
M = 200
INCLUDE_DIAGONAL = True

# Pair 1 (not symmetric): E23 operation-feature matrices
tbl_p1 = top_m_by_df2_with_df1_rank(
    df1=e23_op_feat_cooccur,
    df2=e23_op_feat_performance,
    m=M,
    symmetric=False,
    include_diagonal=INCLUDE_DIAGONAL,
)

# Pair 2 (not symmetric): ALL62 operation-feature matrices
tbl_p2 = top_m_by_df2_with_df1_rank(
    df1=all62_op_feat_cooccur,
    df2=all62_op_feat_performance,
    m=M,
    symmetric=False,
    include_diagonal=INCLUDE_DIAGONAL,
)

# Pair 3 (self-matrix, symmetric): E23 feature-feature matrices
tbl_p3 = top_m_by_df2_with_df1_rank(
    df1=e23_cooccur,
    df2=e23_performance,
    m=M,
    symmetric=True,
    include_diagonal=INCLUDE_DIAGONAL,
)

# Pair 4 (self-matrix, symmetric): ALL62 feature-feature matrices
tbl_p4 = top_m_by_df2_with_df1_rank(
    df1=all62_cooccur,
    df2=all62_performance,
    m=M,
    symmetric=True,
    include_diagonal=INCLUDE_DIAGONAL,
)

# Show the first 55 rows of the pair-2 table in the notebook output.
display(tbl_p2.head(55))


## 9. Improved Heatmap Visualizations

This section uses the enhanced heatmap helpers imported from `improved_heatmap_functions`, which provide professional styling, half-diagonal layouts, and customizable options based on current best practices.

**Key Improvements:**
- Half-diagonal heatmaps combining co-occurrence and performance data
- Professional styling with larger fonts (14pt minimum) 
- Customizable text annotations
- Better colorbar scaling and proportions
- Publication-ready quality

In [None]:
# Import improved heatmap functions
# NOTE(review): the directory below is an absolute, machine-specific path and
# is appended to sys.path on every run of this cell; the import only resolves
# where that directory exists. Consider making it configurable.
import sys
sys.path.append('E:/Oxford/Extra/ICASSP/Draft_1/Codes')

# Plotting helpers used by the section 9 cells below.
from improved_heatmap_functions import (
    plot_half_diagonal_heatmap,
    plot_combined_triangular_heatmap, 
    plot_enhanced_heatmap,
    plot_enhanced_operation_feature_heatmap,
    set_publication_style
)

# Set publication-ready style with larger fonts
set_publication_style(font_size=14)

print("✓ Improved heatmap functions imported successfully")
print("✓ Publication style set with 14pt base font size")

### 9.1. Enhanced E23 Base Feature Visualizations

In [None]:
# Enhanced E23 feature-feature plots. Runs only when both the E23 expression
# frame and its co-occurrence matrix are non-empty; each step prints a label,
# sets `save_path`, and calls one of the imported plotting helpers.
if not e23_df.empty and not e23_cooccur.empty:
    print("=" * 80)
    print("ENHANCED E23 BASE FEATURE VISUALIZATIONS")
    print("=" * 80)
    
    # 1-2. Enhanced individual heatmaps with cell text (show_text=True)
    print("\n1. Enhanced Co-occurrence Heatmap (with text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_Cooccurrence_Matrix.png'
    plot_enhanced_heatmap(e23_cooccur, 'Cooccurrence', 'E23', 'GTZAN', 
                          save_path=save_path, show_text=True, font_size=16)
    
    print("\n2. Enhanced Performance Heatmap (with text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_Performance_Matrix.png'  
    plot_enhanced_heatmap(e23_performance, 'Performance', 'E23', 'GTZAN',
                          save_path=save_path, show_text=True, font_size=16)
    
    # 3-4. The same two heatmaps without text annotations (show_text=False)
    print("\n3. Enhanced Co-occurrence Heatmap (no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_Cooccurrence_NoText_Matrix.png'
    plot_enhanced_heatmap(e23_cooccur, 'Cooccurrence', 'E23', 'GTZAN',
                          save_path=save_path, show_text=False, font_size=16)
    
    print("\n4. Enhanced Performance Heatmap (no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_Performance_NoText_Matrix.png'
    plot_enhanced_heatmap(e23_performance, 'Performance', 'E23', 'GTZAN', 
                          save_path=save_path, show_text=False, font_size=16)
    
    # 5. Half-diagonal visualization of co-occurrence and performance together
    #    (layout is defined by the imported helper, not visible here)
    print("\n5. Half-Diagonal Visualization (fixed: upper triangles, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Half_Diagonal_Matrix.png'
    plot_half_diagonal_heatmap(e23_cooccur, e23_performance, E23, 'E23', 'GTZAN',
                               save_path=save_path, font_size=16)
    
    # 6. Combined triangular heatmap of both matrices
    #    (layout is defined by the imported helper, not visible here)
    print("\n6. Combined Triangular Heatmap (fixed: dual colorbars, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Combined_Triangular_Matrix.png'
    plot_combined_triangular_heatmap(e23_cooccur, e23_performance, E23, 'E23', 'GTZAN',
                                     save_path=save_path, font_size=16)

else:
    print("⚠️ Skipping enhanced E23 visualizations due to missing data")

### 9.2. Enhanced ALL62 Base Feature Visualizations

In [None]:
# Enhanced ALL62 feature-feature plots. Runs only when both the ALL62
# expression frame and its co-occurrence matrix are non-empty; every plot here
# is drawn without cell text.
if not all62_df.empty and not all62_cooccur.empty:
    print("\n" + "=" * 80)
    print("ENHANCED ALL62 BASE FEATURE VISUALIZATIONS")
    print("=" * 80)
    
    # 1-2. Enhanced individual heatmaps - cleaner versions without text (recommended for ALL62)
    print("\n1. Enhanced Co-occurrence Heatmap (no text - recommended for ALL62):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Enhanced_Cooccurrence_Matrix.png'
    plot_enhanced_heatmap(all62_cooccur, 'Cooccurrence', 'ALL62', 'GTZAN',
                          save_path=save_path, show_text=False, font_size=16)
    
    print("\n2. Enhanced Performance Heatmap (no text - recommended for ALL62):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Enhanced_Performance_Matrix.png'
    plot_enhanced_heatmap(all62_performance, 'Performance', 'ALL62', 'GTZAN', 
                          save_path=save_path, show_text=False, font_size=16)
    
    # 3. Half-diagonal visualization of both matrices (smaller font_size=14)
    print("\n3. Half-Diagonal Visualization (fixed: upper triangles, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Half_Diagonal_Matrix.png'
    plot_half_diagonal_heatmap(all62_cooccur, all62_performance, ALL62, 'ALL62', 'GTZAN',
                               save_path=save_path, font_size=14)
    
    # 4. Combined triangular heatmap of both matrices (smaller font_size=14)
    print("\n4. Combined Triangular Heatmap (fixed: dual colorbars, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Combined_Triangular_Matrix.png'
    plot_combined_triangular_heatmap(all62_cooccur, all62_performance, ALL62, 'ALL62', 'GTZAN',
                                     save_path=save_path, font_size=14)

else:
    print("⚠️ Skipping enhanced ALL62 visualizations due to missing data")

### 9.3. Enhanced Operation-Feature Visualizations

In [None]:
# Enhanced Operation-Feature Visualizations
# Re-plots the operation-feature matrices with the imported enhanced helper.
# Each feature set is guarded separately; the `in globals()` check lets this
# cell run even if the cell that defines the matrices was never executed.
print("\n" + "=" * 80)  
print("ENHANCED OPERATION-FEATURE VISUALIZATIONS")
print("=" * 80)

# E23 Operation-Feature Enhanced Plots
if not e23_df.empty and 'e23_op_feat_cooccur' in globals() and not e23_op_feat_cooccur.empty:
    print("\n=== Enhanced E23 Operation-Feature Plots ===")
    
    # Co-occurrence with text
    print("\n1. E23 Operation-Feature Co-occurrence (with text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_OpFeat_Cooccurrence_Matrix.png'
    plot_enhanced_operation_feature_heatmap(e23_op_feat_cooccur, 'Cooccurrence', 'E23', 'GTZAN',
                                           save_path=save_path, show_text=True, font_size=16)
    
    # Performance with text
    print("\n2. E23 Operation-Feature Performance (with text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_OpFeat_Performance_Matrix.png'
    plot_enhanced_operation_feature_heatmap(e23_op_feat_performance, 'Performance', 'E23', 'GTZAN',
                                           save_path=save_path, show_text=True, font_size=16)
    
    # Clean versions without text
    print("\n3. E23 Operation-Feature Co-occurrence (clean, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_OpFeat_Cooccurrence_Clean_Matrix.png'
    plot_enhanced_operation_feature_heatmap(e23_op_feat_cooccur, 'Cooccurrence', 'E23', 'GTZAN',
                                           save_path=save_path, show_text=False, font_size=16)
    
    print("\n4. E23 Operation-Feature Performance (clean, no text):")
    save_path = OUTPUT_PATH / 'GTZAN_E23_Enhanced_OpFeat_Performance_Clean_Matrix.png'
    plot_enhanced_operation_feature_heatmap(e23_op_feat_performance, 'Performance', 'E23', 'GTZAN',
                                           save_path=save_path, show_text=False, font_size=16)

# ALL62 Operation-Feature Enhanced Plots (clean versions only for large matrix)
if not all62_df.empty and 'all62_op_feat_cooccur' in globals() and not all62_op_feat_cooccur.empty:
    print("\n=== Enhanced ALL62 Operation-Feature Plots (Clean) ===")
    
    print("\n5. ALL62 Operation-Feature Co-occurrence (clean):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Enhanced_OpFeat_Cooccurrence_Matrix.png' 
    plot_enhanced_operation_feature_heatmap(all62_op_feat_cooccur, 'Cooccurrence', 'ALL62', 'GTZAN',
                                           save_path=save_path, show_text=False, font_size=14)
    
    print("\n6. ALL62 Operation-Feature Performance (clean):")
    save_path = OUTPUT_PATH / 'GTZAN_ALL62_Enhanced_OpFeat_Performance_Matrix.png'
    plot_enhanced_operation_feature_heatmap(all62_op_feat_performance, 'Performance', 'ALL62', 'GTZAN', 
                                           save_path=save_path, show_text=False, font_size=14)

# Printed unconditionally, even if both guarded sections above were skipped.
print("\n✓ All enhanced operation-feature visualizations completed!")

### 9.4. Enhanced Visualizations Summary

The enhanced heatmap visualizations provide several key improvements over the original plots:

**Professional Styling:**
- Larger fonts (14-16pt) for better readability
- Improved colorbar scaling and proportions  
- Better spacing and layout optimization
- Publication-ready quality at 300 DPI

**New Visualization Types:**
- **Half-Diagonal Plots**: Split view showing co-occurrence and performance side-by-side
- **Combined Triangular**: Single plot with co-occurrence below diagonal and performance above
- **Clean Versions**: Option to hide cell text for cleaner appearance with large matrices

**Customization Options:**
- Toggleable text annotations (`show_text` parameter)
- Adjustable font sizes
- Custom colormaps
- Better handling of zero values and color scaling

**Space Efficiency:**
- Compact layouts reduce redundancy in symmetric matrices
- Better suited for publication with limited space
- Maintains all information while reducing visual clutter