In [1]:
import sys
import os
import numpy as np

import pandas as pd

from scipy.stats import wasserstein_distance
from tqdm import tqdm

In [2]:
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id

In [3]:
max_threads = os.cpu_count()
print(f"Maximum threads available: {max_threads} - Using {max_threads - 2}")

os.environ["OMP_NUM_THREADS"] = str(max_threads - 2)

Maximum threads available: 12 - Using 10


In [4]:
def compute_emd(vec1, vec2, log_probs=False):
    """
    Computes Earth Mover's Distance (Wasserstein Distance) between two vectors.
    
    Parameters:
    - vec1, vec2: Input vectors (same length or will be truncated to min length)
    - log_probs: If True, inputs are treated as log probabilities and exponentiated
    
    Returns:
    - emd: The Earth Mover’s Distance between the two vectors
    """
    vec1, vec2 = np.array(vec1), np.array(vec2)
    
    # Truncate to the same length if needed
    #min_len = min(len(vec1), len(vec2))
    #vec1, vec2 = vec1[:min_len], vec2[:min_len]
    
    if log_probs:
        vec1 = np.exp(vec1)
        vec2 = np.exp(vec2)
    
    # Normalize to sum to 1 (optional, but often useful for probability comparisons)
    vec1 = vec1 / np.sum(vec1)
    vec2 = vec2 / np.sum(vec2)

    emd = wasserstein_distance(vec1, vec2)
    return emd


In [5]:
def build_metadata_df(filtered_metadata: pd.DataFrame,
                      known_df: pd.DataFrame,
                      unknown_df: pd.DataFrame) -> pd.DataFrame:
    """
    From filtered_metadata (with columns problem, corpus, known_author, unknown_author)
    and known_df (with columns author, doc_id), build a metadata table exploded so that
    each known_doc_id gets its own row, and assign a running sample_id.
    """
    # Step 1: build the initial DataFrame with a list-column
    records = []
    for _, met in filtered_metadata.iterrows():
        problem        = met['problem']
        corpus         = met['corpus']
        known_author   = met['known_author']
        unknown_author = met['unknown_author']

        # collect all doc_ids for this author
        doc_ids = known_df.loc[
            known_df['author'] == known_author,
            'doc_id'
        ].unique().tolist()

        unknown_doc_id = unknown_df.loc[
            unknown_df['author'] == unknown_author,
            'doc_id'
        ].iloc[0]
        
        records.append({
            'problem':        problem,
            'corpus':         corpus,
            'known_author':   known_author,
            'unknown_author': unknown_author,
            'unknown_doc_id': unknown_doc_id,
            'known_doc_ids':  doc_ids
        })

    meta = pd.DataFrame(records)

    # Step 2: explode the list-column into individual rows
    exploded = (
        meta
        .explode('known_doc_ids')
        .rename(columns={'known_doc_ids': 'known_doc_id'})
        .reset_index(drop=True)
    )

    # Step 3: add sample_id starting at 1
    exploded.insert(0, 'sample_id', range(1, len(exploded) + 1))

    return exploded

In [6]:
def sample_impostors(df_known, author_id, n_impostors=100, seed=42):
    pool = df_known[df_known['author'] != author_id]
    return pool.sample(n_impostors,
                       replace=(len(pool) < n_impostors),
                       random_state=seed)

In [7]:
# results = []
# result_column = 'log_probs'

# # Outer loop: Repeat the entire process 5 times to get 5 scores for each sample
# for test_id in tqdm(range(1, 6), desc="Tests"):
#     for _, met in tqdm(corpus_problem_metadata.iterrows(), total=len(corpus_problem_metadata),
#                        desc=f'Test {test_id} Samples', leave=False):
        
#         sample_id = met['sample_id']
#         problem = met['problem']
#         corpus = met['corpus']
#         known_author = met['known_author']
#         unknown_author = met['unknown_author']
#         known_doc_id = met['known_doc_id']
        
#         # Retrieve known and unknown problem entries
#         known_problem = known[known['doc_id'] == known_doc_id]
#         unknown_problem = unknown[unknown['author'] == unknown_author]
        
#         # Filter out the two authors
#         known_filtered = known[~known['author'].isin([known_author, unknown_author])]
        
#         # Precompute distances between known and unknown
#         lp_known = known_problem[result_column].iloc[0]
#         lp_unknown = unknown_problem[result_column].iloc[0]
#         k_vs_u_dist = compute_emd(lp_known, lp_unknown, log_probs=True)
        
#         # Initialize aggregated score
#         aggregated_score = 0.0
        
#         # Inner loop: Repeat the process 100 times for each sample
#         for repetition_id in range(1, 101):
#             n_impostors = max(round(len(known_filtered) * 0.5), 1)  # Ensure at least 1 imposter
            
#             # Sample 50% of known_filtered as impostors
#             if not known_filtered.empty:
#                 impostors = known_filtered.sample(n=n_impostors, replace=False)
#                 imp_scores = impostors[result_column].values
#                 k_vs_imp_dist  = [compute_emd(lp_unknown, imp_lp, log_probs=True) for imp_lp in imp_scores]
                
#                 # Calculate the number of impostors with distance less than known-unknown distance
#                 N_less = sum(1 for dist in k_vs_imp_dist if dist < k_vs_u_dist)
#                 rank = N_less + 1  # Rank starts from 1
#                 contribution = 1.0 / (100.0 * rank)
                
#                 aggregated_score += contribution
        
#         # Store the result
#         results.append({
#             'sample_id': sample_id,
#             'test_id': test_id,
#             'problem': problem,
#             'corpus': corpus,
#             'known_author': known_author,
#             'unknown_author': unknown_author,
#             'd kriston': k_vs_u_dist,
#             'score': aggregated_score,
#             'label': (unknown_author == known_author)
#         })

# # Convert to DataFrame
# df_results = pd.DataFrame(results)

In [8]:
def impostors_method(
    metadata_df: pd.DataFrame,
    known_df: pd.DataFrame,
    unknown_df: pd.DataFrame,
    score_col: str = 'log_probs',
    impostor_fraction: float = 0.5,
    n_repetitions: int = 100,
    n_tests: int = 5,
    log_probs: bool = True
) -> pd.DataFrame:
    """
    Perform the Impostors Method for author verification.

    At each of `n_tests` outer iterations, and for each sample defined in `metadata_df`,
    the function computes Earth Mover's Distance (EMD) between known and unknown profiles.
    Then, across `n_repetitions` random draws of impostor profiles (excluding both true authors),
    it ranks how many impostors are closer than the true known sample, and accumulates a score.

    Parameters:
    - metadata_df: DataFrame with columns:
        ['sample_id', 'problem', 'corpus', 'known_author', 'unknown_author', 'known_doc_id']
    - known_df: DataFrame of known documents; must include ['doc_id', 'author', score_col]
    - unknown_df: DataFrame of unknown documents; must include ['doc_id', 'author', score_col]
    - score_col: name of column containing precomputed profile scores (default 'log_probs')
    - impostor_fraction: fraction of the filtered pool to sample as impostors each repetition
    - n_repetitions: number of random impostor samples per outer test per sample
    - n_tests: number of times to repeat the full process for stability
    - log_probs: whether to interpret the score vectors as log probabilities when computing EMD (default True)

    Returns:
    - DataFrame with columns:
        ['sample_id', 'test_id', 'problem', 'corpus', 'known_doc_id', 'known_author',
         'unknown_doc_id', 'unknown_author', 'emd_known_unknown', 'aggregated_score', 'label']
    """
    rows = []

    # Repeat the entire evaluation n_tests times for robustness
    for test_id in tqdm(range(1, n_tests + 1), desc="Impostors Tests"):
        for _, meta in tqdm(metadata_df.iterrows(), total=len(metadata_df),
                              desc=f'Test {test_id} Samples', leave=False):
            # Extract metadata fields
            sample_id = meta['sample_id']
            problem = meta['problem']
            corpus = meta['corpus']
            known_doc_id = known_df['doc_id']
            known_author = meta['known_author']
            unknown_doc_id = meta['unknown_doc_id']
            unknown_author = meta['unknown_author']
            known_doc_id = meta['known_doc_id']

            # Fetch profile scores
            known_score = known_df.loc[known_df['doc_id'] == known_doc_id, score_col].iloc[0]
            unknown_score = unknown_df.loc[unknown_df['author'] == unknown_author, score_col].iloc[0]

            # Compute true distance between known and unknown
            emd_ku = compute_emd(known_score, unknown_score, log_probs=log_probs)

            # Build impostor pool (exclude both authors)
            pool = known_df[~known_df['author'].isin([known_author, unknown_author])]
            n_impostors = max(int(round(len(pool) * impostor_fraction)), 1)

            aggregated_score = 0.0
            # Draw random impostor sets and accumulate score
            for _ in range(n_repetitions):
                if not pool.empty:
                    impostors = pool.sample(n=n_impostors, replace=False)
                    # Distances from unknown to each impostor profile
                    imp_dists = [compute_emd(unknown_score, imp_score, log_probs=log_probs)
                                 for imp_score in impostors[score_col].values]
                    # Rank among impostors
                    rank = 1 + sum(1 for d in imp_dists if d < emd_ku)
                    aggregated_score += 1.0 / (n_repetitions * rank)

            # Label: 1 if unknown matches known_author, else 0
            label = int(unknown_author == known_author)

            rows.append({
                'sample_id': sample_id,
                'test_id': test_id,
                'problem': problem,
                'corpus': corpus,
                'known_doc_id': known_doc_id,
                'known_author': known_author,
                'unknown_doc_id': unknown_doc_id,
                'unknown_author': unknown_author,
                'emd_known_unknown': emd_ku,
                'aggregated_score': aggregated_score,
                'label': label
            })

    return pd.DataFrame(rows)


In [None]:
base_loc = "//bc_nas_storage/BCross/datasets/author_verification"

model_name = "Qwen2.5-1.5B-Instruct"
model_save_name = model_name.lower().replace("-", "_")

corpuses = ['Wiki', 'Enron']
data_types = ['training', 'test']
score_cols = ['log_probs', 'abs_differences']

for data_type in data_types:
    
    metadata_loc = f"{base_loc}/{data_type}/metadata.rds"
    metadata = read_rds(metadata_loc)
    
    for corpus in corpuses:

        filtered_metadata = metadata[metadata['corpus'] == corpus]

        known_loc = f"{base_loc}/{data_type}/{corpus}/known_logprobs_qwen2.5_1.5b_instruct.jsonl"
        known = read_jsonl(known_loc)

        unknown_loc = f"{base_loc}/{data_type}/{corpus}/unknown_logprobs_qwen2.5_1.5b_instruct.jsonl"
        unknown = read_jsonl(unknown_loc)

        corpus_problem_metadata = build_metadata_df(filtered_metadata, known, unknown)
        print(f"The Number of Problems in {corpus} ({data_type}): {corpus_problem_metadata.shape[0]}")
        
        for score_col in score_cols:

            print(f"    Completing the Impostor Method for {corpus} ({data_type}): {score_col}")

            log_prob_flag = True if score_col == "log_probs" else False
            
            result_save_dir = f"{base_loc}/log_probs_results/{score_col}"
            result_save_loc = f"{result_save_dir}/{data_type}_{corpus.lower()}_{model_save_name}.jsonl"
            
            os.makedirs(result_save_dir, exist_ok=True)
            if os.path.exists(result_save_loc):
                print(f"Skipping {result_save_loc}: already exists")
                continue

            results = impostors_method(
                metadata_df = corpus_problem_metadata,
                known_df = known,
                unknown_df = unknown,
                score_col = score_col,
                impostor_fraction = 0.5,
                n_repetitions = 100,
                n_tests = 5,
                log_probs = log_prob_flag
            )

            results.insert(
                loc=results.columns.get_loc('corpus'),
                column='comparison',
                value=score_col
            )
            
            results.insert(
                loc=results.columns.get_loc('corpus'),
                column='data_type',
                value=data_type
            )

            write_jsonl(results, result_save_loc)
            print(f"    Results Saved for {corpus} ({data_type}): {score_col}")

The Number of Problems in Wiki (training): 450
    Completing the Impostor Method for Wiki (training): log_probs
Skipping //bc_nas_storage/BCross/datasets/author_verification/log_probs_results/log_probs/training_wiki_qwen2.5_1.5b_instruct.jsonl: already exists
    Completing the Impostor Method for Wiki (training): abs_differences
Skipping //bc_nas_storage/BCross/datasets/author_verification/log_probs_results/abs_differences/training_wiki_qwen2.5_1.5b_instruct.jsonl: already exists
The Number of Problems in Enron (training): 224
    Completing the Impostor Method for Enron (training): log_probs
Skipping //bc_nas_storage/BCross/datasets/author_verification/log_probs_results/log_probs/training_enron_qwen2.5_1.5b_instruct.jsonl: already exists
    Completing the Impostor Method for Enron (training): abs_differences
Skipping //bc_nas_storage/BCross/datasets/author_verification/log_probs_results/abs_differences/training_enron_qwen2.5_1.5b_instruct.jsonl: already exists
The Number of Problem

Impostors Tests:   0%|                                                                               | 0/5 [00:00<?, ?it/s]
Test 1 Samples:   0%|                                                                              | 0/672 [00:00<?, ?it/s][A
Test 1 Samples:   0%|                                                                      | 1/672 [00:03<35:56,  3.21s/it][A
Test 1 Samples:   0%|▏                                                                     | 2/672 [00:06<35:12,  3.15s/it][A
Test 1 Samples:   0%|▎                                                                     | 3/672 [00:09<34:51,  3.13s/it][A
Test 1 Samples:   1%|▍                                                                     | 4/672 [00:12<34:20,  3.08s/it][A
Test 1 Samples:   1%|▌                                                                     | 5/672 [00:15<33:58,  3.06s/it][A
Test 1 Samples:   1%|▋                                                                     | 6/672 [00:18<33:43,  

    Results Saved for Wiki (test): log_probs
    Completing the Impostor Method for Wiki (test): abs_differences


Impostors Tests:   0%|                                                                               | 0/5 [00:00<?, ?it/s]
Test 1 Samples:   0%|                                                                              | 0/672 [00:00<?, ?it/s][A
Test 1 Samples:   0%|                                                                      | 1/672 [00:03<39:38,  3.54s/it][A
Test 1 Samples:   0%|▏                                                                     | 2/672 [00:07<40:06,  3.59s/it][A
Test 1 Samples:   0%|▎                                                                     | 3/672 [00:10<39:59,  3.59s/it][A
Test 1 Samples:   1%|▍                                                                     | 4/672 [00:14<39:43,  3.57s/it][A
Test 1 Samples:   1%|▌                                                                     | 5/672 [00:17<39:21,  3.54s/it][A
Test 1 Samples:   1%|▋                                                                     | 6/672 [00:21<39:16,  

In [None]:
# known = read_jsonl(f"{base_loc}/training/Enron/known_logprobs_qwen2.5_1.5b_instruct.jsonl")
# unknown = read_jsonl(f"{base_loc}/training/Enron/known_logprobs_qwen2.5_1.5b_instruct.jsonl")

# metadata = read_rds(f"{base_loc}/training/metadata.rds")
# filtered_metadata = metadata[metadata['corpus'] == 'Enron']

# corpus_problem_metadata = build_metadata_df(filtered_metadata, known, unknown)

# # First, select only the columns you need from test_metadata
# corpus_problem_metadata = corpus_problem_metadata[['problem', 'known_doc_id', 'unknown_doc_id']]

# # Next, drop the 'unknown_doc_id' column from results
# results_dropped = results.drop(columns='unknown_doc_id')

# # Finally, merge on 'problem'
# merged_df = results_dropped.merge(
#     corpus_problem_metadata,
#     on=['problem', 'known_doc_id'],
#     how='inner'   # or 'left', 'right', 'outer' depending on your needs
# )

# desired_order = [
#     'sample_id',
#     'test_id',
#     'problem',
#     'comparison',
#     'data_type',       # moved here
#     'corpus',
#     'known_doc_id',
#     'known_author',
#     'unknown_doc_id',
#     'unknown_author',
#     'emd_known_unknown',
#     'aggregated_score',
#     'label'
# ]

# merged_df = merged_df[desired_order]

In [None]:
# merged_df.head(5)

In [None]:
# write_jsonl(merged_df, "//bc_nas_storage/BCross/datasets/author_verification/log_probs_results/abs_differences/training_enron_qwen2.5_1.5b_instruct.jsonl")

In [None]:
# df_results[df_results['problem'] == '142.196.88.228 vs 142.196.88.228']

In [None]:
# df_agg = (
#     df_results
#     .groupby(['problem', 'label'], as_index=False)
#     ['score']
#     .mean()
#     .rename(columns={'rank_score': 'score'})
# )

# # df_agg now contains one row per problem with its average rank score and label
# print(df_agg.head())