In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import os
from pathlib import Path

# --- Configuration ---
# Root directory holding one sub-directory per layer; the embedding pickles
# and the co-occurrence NPZ files are looked up beneath it.
LAYER_BASE_PATH = "/home/mmezzanzanica/project/scoring_autoint_align/data/Llama3_1-8B-Base-LXR-8x"
# File name of the embedding pickle.
# NOTE(review): not referenced by the visible code; find_embedding_models
# carries its own copy of this literal - keep the two in sync.
PKL_FILE_NAME = "oai_token-act-pair_gpt-4o-mini_embeddings.pkl"
# Per-layer co-occurrence NPZ path; "{layer}" is filled in with str.format.
NPZ_PATH_TEMPLATE = LAYER_BASE_PATH + "/{layer}-llamascope-res-32k/pajama_meta-llama_Llama-3.1-8B_res_Llama3_1-8B-Base-L{layer}R-8x_checkpoints_final.safetensors_docs100k_keq512_cooccurrences.npz"

# Size of both neighbour lists (phi-ranked and cosine-ranked) whose
# intersection is counted per feature.
TOP_N_FEATURES = 1023
# Passed as the total observation count N of the phi coefficient.
N_TOTAL_CHUNKS = 71687
# Passed as expected_rows when loading: ids 0..EXPECTED_TOTAL_ROWS-1 that are
# absent from the pickle are padded with zero embeddings.
EXPECTED_TOTAL_ROWS = 32768
# DataFrame column that holds the embedding vectors.
EMBEDDING_COL_NAME = 'embedding'

def find_embedding_models(layer_path, pkl_name="oai_token-act-pair_gpt-4o-mini_embeddings.pkl"):
    """Find embedding pickles anywhere below a layer directory.

    Only files whose name equals ``pkl_name`` are matched (not every ``.pkl``
    file). The directory containing each match, expressed relative to
    ``layer_path``, serves as the embedding-model identifier.

    Args:
        layer_path: Directory to search recursively.
        pkl_name: Exact file name to look for. Defaults to the name used by
            the embedding export so existing callers are unaffected.

    Returns:
        A list of ``(model_name, full_path)`` string tuples, e.g.
        ``('Alibaba-NLP/gte-Qwen2-7B-instruct', '/.../<pkl_name>')``, in the
        order produced by ``Path.rglob`` (filesystem dependent).
    """
    layer_dir = Path(layer_path)
    return [
        (str(pkl_path.parent.relative_to(layer_dir)), str(pkl_path))
        for pkl_path in layer_dir.rglob(pkl_name)
    ]


def load_and_prepare_dataframe(pkl_path, npz_path, expected_rows=None):
    """Load the embedding table and the co-occurrence matrix for one layer.

    The returned frame is ordered by its 'index' column so that row position
    and feature id coincide; callers index both the embeddings and the
    co-occurrence matrix by position.

    Args:
        pkl_path: Pickled DataFrame with an 'index' column (feature id) and an
            'embedding' column (one vector per row).
        npz_path: NPZ archive; its first array is used as the co-occurrence
            matrix.
        expected_rows: When a positive number, every id in
            ``range(expected_rows)`` missing from the pickle is added with an
            all-zero embedding. ``None`` or a non-positive value disables
            padding.

    Returns:
        ``(df, cooc_matrix)`` on success, where ``df`` gains a
        'cooc_embedding' column holding the matrix rows. ``(None, None)`` if
        anything fails; the error is printed, not raised.
    """
    try:
        # NOTE(review): unpickling executes arbitrary code - only load trusted files.
        df = pd.read_pickle(pkl_path)
        if expected_rows is not None and expected_rows > 0:
            if not pd.api.types.is_numeric_dtype(df['index']):
                raise TypeError("The 'index' column is not numeric.")
            missing_indices = sorted(set(range(expected_rows)) - set(df['index']))
            if missing_indices:
                # Pad with zero vectors shaped and typed like a real embedding.
                first_valid_embedding = df['embedding'].dropna().iloc[0]
                null_embedding = np.zeros(
                    len(first_valid_embedding),
                    dtype=np.array(first_valid_embedding).dtype,
                )
                padding_df = pd.DataFrame(
                    [{'index': idx, 'embedding': null_embedding} for idx in missing_indices]
                )
                df = pd.concat([df, padding_df], ignore_index=True)
        if 'index' in df.columns:
            # Sort unconditionally: previously this happened only when padding
            # rows were added, so a complete but unordered pickle left rows
            # misaligned with the co-occurrence matrix.
            df = df.sort_values(by='index').reset_index(drop=True)
        with np.load(npz_path) as data:
            cooc_embeddings_matrix = data[data.files[0]]
        if len(df) != cooc_embeddings_matrix.shape[0]:
            raise ValueError(f"Shape mismatch: DataFrame has {len(df)} rows, but co-occurrence matrix has {cooc_embeddings_matrix.shape[0]} rows.")
        df['cooc_embedding'] = list(cooc_embeddings_matrix)
        return df, cooc_embeddings_matrix
    except Exception as e:
        # Best-effort loader: callers test for None and skip this model.
        print(f"Error loading data: {e}")
        return None, None

def calculate_phi_coefficient(n_ii, n_jj, n_ij, N):
    """Phi coefficient of two binary events, computed from raw counts.

    Args:
        n_ii: Observations in which the first event occurs.
        n_jj: Observations in which the second event occurs.
        n_ij: Observations in which both occur.
        N: Total number of observations.

    Returns:
        The coefficient clipped to [-1, 1]. Degenerate inputs return a fixed
        value instead: 0.0 when N is 0 or the marginal product is negative;
        when the marginal product is exactly 0, 1.0 if the events always
        co-occur, -1.0 if they never overlap, and 0.0 otherwise.
    """
    count_i, count_j, both, total = (float(v) for v in (n_ii, n_jj, n_ij, N))
    if total == 0:
        return 0.0

    # Cells of the 2x2 contingency table.
    only_i = count_i - both
    only_j = count_j - both
    neither = max(0.0, total - count_i - count_j + both)

    # Product of the four marginals; a negative value means inconsistent
    # counts (a marginal larger than N), for which no square root exists.
    marginal_product = count_i * count_j * (total - count_i) * (total - count_j)
    if marginal_product < 0:
        return 0.0
    scale = math.sqrt(marginal_product)

    if scale == 0:
        always_together = both == count_i and both == count_j and neither == 0
        if always_together:
            return 1.0
        never_together = only_i == count_i and only_j == count_j and both == 0
        if never_together:
            return -1.0
        return 0.0

    raw_phi = ((both * neither) - (only_i * only_j)) / scale
    return np.clip(raw_phi, -1.0, 1.0)

def get_most_phi_correlated_features(ref_idx, df_input, cooc_mat, N_total, top_n=10):
    """Rank every other feature by its phi coefficient with a reference feature.

    Args:
        ref_idx: Row position of the reference feature in ``df_input`` and in
            ``cooc_mat``.
        df_input: Feature table; only its length and index labels are used.
        cooc_mat: Square co-occurrence count matrix whose diagonal holds each
            feature's own occurrence count.
        N_total: Total number of observations, forwarded to the phi formula.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns 'id' (index label from ``df_input``) and 'phi',
        sorted by 'phi' descending and cut to ``top_n`` rows. Empty when the
        reference feature never occurs.

    Raises:
        IndexError: If ``ref_idx`` is outside ``[0, len(df_input))``.
    """
    if ref_idx < 0 or ref_idx >= len(df_input):
        raise IndexError(f"Reference index {ref_idx} is out of bounds.")
    occurrence_counts = np.diag(cooc_mat).astype(np.float32)
    ref_n_ii = occurrence_counts[ref_idx]
    if ref_n_ii == 0:
        return pd.DataFrame({'id': [], 'phi': []})
    # Only the reference row is read, so convert just that row instead of
    # copying the whole n x n matrix on every call.
    ref_row = cooc_mat[ref_idx].astype(np.float32)
    candidates = [j for j in range(len(df_input)) if j != ref_idx]
    phi_scores = [
        calculate_phi_coefficient(ref_n_ii, occurrence_counts[j], ref_row[j], N_total)
        for j in candidates
    ]
    indices = [df_input.index[j] for j in candidates]
    df_scores = pd.DataFrame({'id': indices, 'phi': phi_scores})
    return df_scores.sort_values(by='phi', ascending=False).head(top_n)

def get_most_semantically_similar_features(ref_idx, df_input, embedding_column, top_n=10):
    """Return the features whose embeddings are closest to a reference feature.

    Args:
        ref_idx: Row position of the reference feature in ``df_input``.
        df_input: Feature table holding one embedding vector per row.
        embedding_column: Name of the column that stores the vectors.
        top_n: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns 'id' (index label from ``df_input``) and
        'cosine', ordered from most to least similar. The reference feature's
        own score is set to -inf so it ranks last.

    Raises:
        IndexError: If ``ref_idx`` is outside ``[0, len(df_input))``.
        ValueError: If ``embedding_column`` is not a column of ``df_input``.
    """
    if ref_idx < 0 or ref_idx >= len(df_input):
        raise IndexError(f"Reference index {ref_idx} is out of bounds.")
    if embedding_column not in df_input.columns:
        raise ValueError(f"DataFrame must contain the embedding column: '{embedding_column}'.")

    # One float32 row per feature.
    vectors = np.stack(df_input[embedding_column].values).astype(np.float32)
    query = vectors[ref_idx].reshape(1, -1)
    scores = cosine_similarity(query, vectors)[0]

    # Push the reference feature to the end of the ranking.
    scores[ref_idx] = -np.inf
    best = np.argsort(-scores)[:top_n]
    return pd.DataFrame({
        'id': df_input.index[best],
        'cosine': scores[best],
    })

def calculate_overlap_for_feature(row, df_layer, cooc_matrix):
    """Count the features shared by the phi-ranked and cosine-ranked neighbour lists.

    Args:
        row: Mapping-like record whose 'index' entry is the reference feature.
        df_layer: Feature table for the layer, including the embedding column.
        cooc_matrix: Co-occurrence count matrix for the layer.

    Returns:
        Number of ids present in both top-``TOP_N_FEATURES`` lists. 0 when
        either list is empty or when any error occurs (the error is printed).
    """
    try:
        feature_id = row['index']
        phi_neighbours = get_most_phi_correlated_features(
            feature_id, df_layer, cooc_matrix, N_TOTAL_CHUNKS, top_n=TOP_N_FEATURES
        )
        emb_neighbours = get_most_semantically_similar_features(
            feature_id, df_layer, EMBEDDING_COL_NAME, top_n=TOP_N_FEATURES
        )
        if phi_neighbours.empty or emb_neighbours.empty:
            return 0
        # Inner join on 'id' keeps only the ids found in both lists.
        shared = emb_neighbours.merge(phi_neighbours, on='id')
        return len(shared)
    except Exception as e:
        print(f"Error processing feature {row['index']}: {e}")
        return 0

def sanitize_emb_col(emb_name):
    """Turn an embedding-model name into a column-friendly string.

    Every '/' and every space is replaced by an underscore; all other
    characters are left as they are.
    """
    replacements = str.maketrans({"/": "_", " ": "_"})
    return emb_name.translate(replacements)

def main():
    """Compute neighbour-overlap counts for every layer and embedding model.

    Reads the evaluation CSV, and for each distinct layer and each embedding
    pickle found under that layer's directory fills an ``overlap_<model>``
    column with the per-feature overlap count. Layers or models whose files
    are missing or fail to load are reported and skipped. The enriched table
    is written to a new CSV at the end.
    """
    data_llama = pd.read_csv('/home/mmezzanzanica/project/scoring_autoint_align/3_analysis/eval/llama/no_rerank_llama_debug_eval.csv')
    layers = data_llama['layer'].unique()

    for layer in tqdm(layers, desc="Processing layers"):
        print(f"\nProcessing layer {layer}...")
        layer_dir = os.path.join(LAYER_BASE_PATH, f"{layer}-llamascope-res-32k")
        npz_path = NPZ_PATH_TEMPLATE.format(layer=layer)

        # Both the layer directory and its co-occurrence archive are required.
        if not os.path.exists(layer_dir):
            print(f"Layer dir not found: {layer_dir}")
            continue
        if not os.path.exists(npz_path):
            print(f"NPZ file not found for layer {layer}: {npz_path}")
            continue

        embedding_models = find_embedding_models(layer_dir)
        if not embedding_models:
            print(f"No embedding models found in {layer_dir}")
            continue

        # Rows of the evaluation table that belong to this layer.
        layer_mask = data_llama['layer'] == layer

        for emb_name, emb_pkl_path in embedding_models:
            print(f"Processing embedding model: {emb_name}")
            if not os.path.exists(emb_pkl_path):
                print(f"Embedding PKL not found: {emb_pkl_path}")
                continue

            df_layer, cooc_matrix = load_and_prepare_dataframe(emb_pkl_path, npz_path, EXPECTED_TOTAL_ROWS)
            if df_layer is None or cooc_matrix is None:
                print(f"Skipping embedding model {emb_name} for layer {layer} due to error.")
                continue

            emb_col = f"overlap_{sanitize_emb_col(emb_name)}"
            progress = tqdm(
                data_llama[layer_mask].iterrows(),
                total=layer_mask.sum(),
                desc=f"Layer {layer} - {emb_name}",
                leave=False,
            )
            for row_label, row in progress:
                # Written cell by cell; the column is created on first write.
                data_llama.at[row_label, emb_col] = calculate_overlap_for_feature(row, df_layer, cooc_matrix)
            print(f"Completed embedding {emb_name} for layer {layer}")

    # Save results
    output_path = '/home/mmezzanzanica/project/scoring_autoint_align/3_analysis/no_reranked_ndcg_all_layers_llama_with_all_overlaps.csv'
    data_llama.to_csv(output_path, index=False)
    print(f"\nResults saved to: {output_path}")
    print(f"Final dataset shape: {data_llama.shape}")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Processing layers:   0%|          | 0/5 [00:00<?, ?it/s]


Processing layer 0...
Processing embedding model: Alibaba-NLP/gte-Qwen2-7B-instruct




Completed embedding Alibaba-NLP/gte-Qwen2-7B-instruct for layer 0
Processing embedding model: Qwen/Qwen3-Embedding-8B_new




Completed embedding Qwen/Qwen3-Embedding-8B_new for layer 0
Processing embedding model: Qwen/Qwen3-Embedding-8B




Completed embedding Qwen/Qwen3-Embedding-8B for layer 0
Processing embedding model: Qwen/Qwen3-Embedding-0.6B




Completed embedding Qwen/Qwen3-Embedding-0.6B for layer 0
Processing embedding model: Qwen/Qwen3-Embedding-4B




Completed embedding Qwen/Qwen3-Embedding-4B for layer 0
Processing embedding model: Lajavaness/bilingual-embedding-large


Processing layers:  20%|██        | 1/5 [05:13<20:55, 313.77s/it]

Completed embedding Lajavaness/bilingual-embedding-large for layer 0

Processing layer 8...
Processing embedding model: Alibaba-NLP/gte-Qwen2-7B-instruct




Completed embedding Alibaba-NLP/gte-Qwen2-7B-instruct for layer 8
Processing embedding model: Qwen/Qwen3-Embedding-8B_new




Completed embedding Qwen/Qwen3-Embedding-8B_new for layer 8
Processing embedding model: Qwen/Qwen3-Embedding-8B




Completed embedding Qwen/Qwen3-Embedding-8B for layer 8
Processing embedding model: Qwen/Qwen3-Embedding-0.6B




Completed embedding Qwen/Qwen3-Embedding-0.6B for layer 8
Processing embedding model: Qwen/Qwen3-Embedding-4B




Completed embedding Qwen/Qwen3-Embedding-4B for layer 8
Processing embedding model: Lajavaness/bilingual-embedding-large


Processing layers:  40%|████      | 2/5 [10:40<16:04, 321.62s/it]

Completed embedding Lajavaness/bilingual-embedding-large for layer 8

Processing layer 17...
Processing embedding model: Alibaba-NLP/gte-Qwen2-7B-instruct




Completed embedding Alibaba-NLP/gte-Qwen2-7B-instruct for layer 17
Processing embedding model: Qwen/Qwen3-Embedding-8B_new




Completed embedding Qwen/Qwen3-Embedding-8B_new for layer 17
Processing embedding model: Qwen/Qwen3-Embedding-8B




Completed embedding Qwen/Qwen3-Embedding-8B for layer 17
Processing embedding model: Qwen/Qwen3-Embedding-0.6B




Completed embedding Qwen/Qwen3-Embedding-0.6B for layer 17
Processing embedding model: Qwen/Qwen3-Embedding-4B




Completed embedding Qwen/Qwen3-Embedding-4B for layer 17
Processing embedding model: Lajavaness/bilingual-embedding-large


Processing layers:  60%|██████    | 3/5 [16:15<10:54, 327.37s/it]

Completed embedding Lajavaness/bilingual-embedding-large for layer 17

Processing layer 25...
Processing embedding model: Alibaba-NLP/gte-Qwen2-7B-instruct




Completed embedding Alibaba-NLP/gte-Qwen2-7B-instruct for layer 25
Processing embedding model: Qwen/Qwen3-Embedding-8B_new




Completed embedding Qwen/Qwen3-Embedding-8B_new for layer 25
Processing embedding model: Qwen/Qwen3-Embedding-8B




Completed embedding Qwen/Qwen3-Embedding-8B for layer 25
Processing embedding model: Qwen/Qwen3-Embedding-0.6B




Completed embedding Qwen/Qwen3-Embedding-0.6B for layer 25
Processing embedding model: Qwen/Qwen3-Embedding-4B




Completed embedding Qwen/Qwen3-Embedding-4B for layer 25
Processing embedding model: Lajavaness/bilingual-embedding-large


Processing layers:  80%|████████  | 4/5 [21:44<05:28, 328.21s/it]

Completed embedding Lajavaness/bilingual-embedding-large for layer 25

Processing layer 31...
Processing embedding model: Alibaba-NLP/gte-Qwen2-7B-instruct




Completed embedding Alibaba-NLP/gte-Qwen2-7B-instruct for layer 31
Processing embedding model: Qwen/Qwen3-Embedding-8B_new




Completed embedding Qwen/Qwen3-Embedding-8B_new for layer 31
Processing embedding model: Qwen/Qwen3-Embedding-8B




Completed embedding Qwen/Qwen3-Embedding-8B for layer 31
Processing embedding model: Qwen/Qwen3-Embedding-0.6B




Completed embedding Qwen/Qwen3-Embedding-0.6B for layer 31
Processing embedding model: Qwen/Qwen3-Embedding-4B




Completed embedding Qwen/Qwen3-Embedding-4B for layer 31
Processing embedding model: Lajavaness/bilingual-embedding-large


Processing layers: 100%|██████████| 5/5 [27:13<00:00, 326.75s/it]

Completed embedding Lajavaness/bilingual-embedding-large for layer 31

Results saved to: /home/mmezzanzanica/project/scoring_autoint_align/3_analysis/no_reranked_ndcg_all_layers_llama_with_all_overlaps.csv
Final dataset shape: (100, 21)





In [6]:
# Reload the saved results and show the rows that belong to layer 31.
data = pd.read_csv('/home/mmezzanzanica/project/scoring_autoint_align/3_analysis/no_reranked_ndcg_all_layers_llama_with_all_overlaps.csv')
data.loc[data['layer'] == 31]

Unnamed: 0,index,layer,url,media_antonio,media_daniele,media_andrea,media_filippo,mean_vote,score_gemini-2.0-flash,ndcg_Lajavaness/bilingual-embedding-large,...,ndcg_Qwen/Qwen3-Embedding-8B_new,ndcg_Qwen/Qwen3-Embedding-8B,ndcg_Qwen/Qwen3-Embedding-0.6B,ndcg_Qwen/Qwen3-Embedding-4B,overlap_Alibaba-NLP_gte-Qwen2-7B-instruct,overlap_Qwen_Qwen3-Embedding-8B_new,overlap_Qwen_Qwen3-Embedding-8B,overlap_Qwen_Qwen3-Embedding-0.6B,overlap_Qwen_Qwen3-Embedding-4B,overlap_Lajavaness_bilingual-embedding-large
80,7223,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.5,3.5,2.5,4.0,2.88,0.95,0.2417834,...,0.1995036,0.2063487,0.1107199,0.2730872,281.0,262.0,272.0,274.0,282.0,318.0
81,29280,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,3.5,3.5,4.0,4.0,3.75,0.941176,0.411346,...,0.4384768,0.3861469,0.3335098,0.3999115,160.0,135.0,139.0,125.0,194.0,153.0
82,18918,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,2.0,3.0,3.5,3.5,3.0,0.75,0.1848773,...,0.1269508,0.1124397,0.1308614,0.1124656,91.0,85.0,87.0,105.0,120.0,124.0
83,21175,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,2.5,2.5,4.0,3.5,3.12,1.0,0.04990553,...,0.1296466,0.08616774,0.07558866,0.115322,160.0,141.0,131.0,150.0,153.0,145.0
84,29410,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.0,1.0,1.0,1.0,1.0,0.678571,1.1170940000000001e-22,...,2.384258e-06,2.234147e-08,1.860428e-06,2.978196e-08,5.0,5.0,3.0,5.0,5.0,4.0
85,25748,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.0,2.0,1.0,1.5,1.38,0.675,0.07143992,...,0.1010931,0.07774695,0.08953295,0.06518351,44.0,67.0,62.0,65.0,75.0,40.0
86,29491,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.5,1.5,2.5,2.5,2.0,0.8,0.00476324,...,0.00033965,0.0002683909,0.001017457,0.001224964,18.0,33.0,21.0,27.0,18.0,17.0
87,29394,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.5,2.5,2.5,1.5,2.0,0.958333,0.4081909,...,0.458722,0.3999348,0.5193721,0.3888977,304.0,282.0,287.0,280.0,294.0,319.0
88,29802,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.0,3.0,3.0,3.5,2.62,0.525,0.2475786,...,0.2056189,0.2456502,0.1879127,0.250089,442.0,430.0,469.0,381.0,468.0,476.0
89,29424,31,https://neuronpedia.org/llama3.1-8b/31-llamasc...,1.5,1.5,2.5,3.0,2.12,0.825,0.08064459,...,0.1035642,0.1072172,0.1010159,0.113764,265.0,310.0,293.0,247.0,282.0,343.0
