In [1]:
import os
import sys

import numpy as np
import pandas as pd

from tqdm import tqdm
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl, read_rds
from utils import apply_temp_doc_id, build_metadata_df
from lambdaG import extract_ngrams, lambdaG_paraphrase, lambdaG, lambdaG_v2, lambdaG_max_similarity
from performance import performance

In [3]:
base_loc = "/Volumes/BCross/datasets/author_verification"

data_type = "training"
corpus = "Enron"

model = "Qwen2.5-1.5B-Instruct"
model_name = model.lower().replace("-", "_")
token_type = model

known_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/known_sentence_logprobs_{model_name}.jsonl"
# known_loc = "/Users/user/Documents/test_data/known_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl"
known = read_jsonl(known_loc)
known.rename(columns={'sentence': 'text'}, inplace=True)
known = apply_temp_doc_id(known)
known = known[known['num_tokens'] > 0]
# known['perplexity'] = known['log_probs'].apply(compute_perplexity)

unknown_loc = f"{base_loc}/sentence_log_probs_datasets/{data_type}/{corpus}/unknown_sentence_logprobs_{model_name}.jsonl"
# unknown_loc = '/Users/user/Documents/test_data/unknown_sentence_logprobs_qwen2.5_1.5b_instruct.jsonl'
unknown = read_jsonl(unknown_loc)
unknown.rename(columns={'sentence': 'text'}, inplace=True)
unknown = apply_temp_doc_id(unknown)
unknown = unknown[unknown['num_tokens'] > 0]
# unknown['perplexity'] = unknown['log_probs'].apply(compute_perplexity)

metadata_loc = f"{base_loc}/{data_type}/metadata.rds"
metadata_loc = "/Users/user/Documents/test_data/metadata.rds"
metadata = read_rds(metadata_loc)
filtered_metadata = metadata[metadata['corpus'] == corpus]
agg_metadata = build_metadata_df(filtered_metadata, known, unknown)

# Set an impostor location
impostor_loc = "/Volumes/BCross/datasets/author_verification/training/Wiki/Qwen_2.5_1.5B/gen_t_1.5_tp_0.9/top_impostors_tokenized"

In [4]:
known.head(2)

Unnamed: 0,doc_id,orig_doc_id,corpus,chunk_id,author,texttype,text,tokens,log_probs,med_log_prob,differences,abs_differences,num_tokens,sum_log_prob,avg_log_prob,mean_diff,mean_abs_diff
0,andy_zipper_mail_1,known [Andy.zipper - Mail_1].txt,Enron,1,Andy.zipper,known,And I guess we simply weren't prepared for thi...,"[And, I, guess, we, simply, weren, 't, prepare...","[-12.659452438354492, -4.494169235229492, -4.5...","[-18.545480728149414, -18.545480728149414, -18...","[5.886028289794922, 14.051311492919922, 14.428...","[5.886028289794922, 14.051311492919922, 14.428...",16,-72.019607,-4.501225,15.877056,15.877056
1,andy_zipper_mail_1,known [Andy.zipper - Mail_1].txt,Enron,2,Andy.zipper,known,"Before military police restored order, thousan...","[Before, military, police, restored, order, ,,...","[-14.579527854919434, -11.595590591430664, -4....","[-19.1412410736084, -19.1412410736084, -22.459...","[4.561713218688965, 7.545650482177734, 18.0961...","[4.561713218688965, 7.545650482177734, 18.0961...",28,-125.616024,-4.486287,17.655718,17.655718


In [21]:
known_filtered = known[known['author'].isin(['Akuri', '142.196.88.228'])]

In [5]:
def embed_text_column(
    df: pd.DataFrame,
    text_column: str = 'text',
    embedding_column: str = 'embedding',
    model_name: str = 'all-MiniLM-L6-v2',
    batch_size: int = 32,
    show_progress_bar: bool = True
) -> pd.DataFrame:
    """
    Embed a text column of a DataFrame using a Hugging Face embedding model.
    
    Args:
        df (pd.DataFrame): Input DataFrame containing the text column.
        text_column (str): Name of the text column to embed.
        embedding_column (str): Name of the new column to store embeddings.
        model_name (str): Hugging Face model name (e.g., 'all-MiniLM-L6-v2').
        batch_size (int): Batch size for embedding.
        show_progress_bar (bool): Whether to show a progress bar during embedding.
        
    Returns:
        pd.DataFrame: Original DataFrame with an added column of embeddings.
    """
    # Validate input
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame.")

    # Work on a copy to avoid SettingWithCopyWarning
    df_copy = df.copy()

    # Initialize the embedding model
    model = SentenceTransformer(model_name)

    # Convert text values to string and collect for batching
    texts = df_copy[text_column].astype(str).tolist()

    # Compute embeddings
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar
    )

    # Assign embeddings as list of floats to avoid numpy 2D assignment issues
    df_copy[embedding_column] = embeddings.tolist()

    return df_copy

In [7]:
known_embedded = embed_text_column(known)
# known_filtered_embedded = embed_text_column(known_filtered)
unknown_embedded = embed_text_column(unknown)

Batches: 100%|████████████████████████████████| 142/142 [00:04<00:00, 33.62it/s]
Batches: 100%|██████████████████████████████████| 46/46 [00:02<00:00, 22.57it/s]


In [10]:
# lambdaG(unknown, known_filtered, known, metadata=agg_metadata)

    There are 2 known author(s) and 4 problem(s) in the dataset.
        Working on problem 1 of 4: 142.196.88.228 vs 142.196.88.228
        Working on problem 2 of 4: 142.196.88.228 vs Aban1313
        Working on problem 3 of 4: Akuri vs Akuri
        Working on problem 4 of 4: Akuri vs AlanBarnet


Unnamed: 0,problem,known_author,unknown_author,target,score
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,23662.776835
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,-8618.878281
2,Akuri vs Akuri,Akuri,Akuri,True,17237.168861
3,Akuri vs AlanBarnet,Akuri,AlanBarnet,False,-3905.769088


In [8]:
test_res = lambdaG_max_similarity(unknown_embedded, known_filtered_embedded, known_embedded, metadata=agg_metadata)

NameError: name 'known_filtered_embedded' is not defined

In [None]:
# lambdaG_v2(unknown, known_filtered, known, metadata=agg_metadata)

In [None]:
# test_res = lambdaG_paraphrase(unknown, known_filtered, metadata=agg_metadata, impostor_loc=impostor_loc)

In [10]:
test_res

Unnamed: 0,problem,known_author,unknown_author,target,score
0,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,22672.086886
1,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,-6832.770557
2,Akuri vs Akuri,Akuri,Akuri,True,14516.544497
3,Akuri vs AlanBarnet,Akuri,AlanBarnet,False,-4207.973023


In [10]:
all_results = []

for rep in range(1, 6): 
    print(f"Repetition {rep}")
    df = lambdaG_max_similarity(
        unknown_embedded,
        known_embedded,
        known_embedded,
        metadata=agg_metadata
    )

    # Add the repetition column at the start:
    df.insert(0, 'repetition', rep)
    df.insert(1, 'corpus', corpus)      # move corpus next
    df.insert(2, 'data_type', data_type)
    df.insert(2, 'token_type', token_type) 
    all_results.append(df)

# Combine all repetitions into one DataFrame
test_res_long = pd.concat(all_results, ignore_index=True)

Repetition 1
    There are 32 known author(s) and 64 problem(s) in the dataset.
        Working on problem 1 of 64: Andy.zipper vs Andy.zipper
        Working on problem 2 of 64: Andy.zipper vs Barry.tycholiz
        Working on problem 3 of 64: Barry.tycholiz vs Barry.tycholiz
        Working on problem 4 of 64: Barry.tycholiz vs Benjamin.rogers
        Working on problem 5 of 64: Benjamin.rogers vs Benjamin.rogers
        Working on problem 6 of 64: Benjamin.rogers vs Bill.williams
        Working on problem 7 of 64: Bill.williams vs Bill.williams
        Working on problem 8 of 64: Bill.williams vs Cara.semperger
        Working on problem 9 of 64: Cara.semperger vs Cara.semperger
        Working on problem 10 of 64: Cara.semperger vs Carol.clair
        Working on problem 11 of 64: Carol.clair vs Carol.clair
        Working on problem 12 of 64: Carol.clair vs Chris.dorland
        Working on problem 13 of 64: Chris.dorland vs Chris.dorland
        Working on problem 14 of 64: Chris.

In [11]:
test_res_long

Unnamed: 0,repetition,corpus,token_type,data_type,problem,known_author,unknown_author,target,score
0,1,Enron,Qwen2.5-1.5B-Instruct,training,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,True,5664.602018
1,1,Enron,Qwen2.5-1.5B-Instruct,training,Andy.zipper vs Barry.tycholiz,Andy.zipper,Barry.tycholiz,False,-5027.842891
2,1,Enron,Qwen2.5-1.5B-Instruct,training,Barry.tycholiz vs Barry.tycholiz,Barry.tycholiz,Barry.tycholiz,True,13795.558616
3,1,Enron,Qwen2.5-1.5B-Instruct,training,Barry.tycholiz vs Benjamin.rogers,Barry.tycholiz,Benjamin.rogers,False,6107.237502
4,1,Enron,Qwen2.5-1.5B-Instruct,training,Benjamin.rogers vs Benjamin.rogers,Benjamin.rogers,Benjamin.rogers,True,12278.282148
...,...,...,...,...,...,...,...,...,...
315,5,Enron,Qwen2.5-1.5B-Instruct,training,K.allen vs Kam.keiser,K.allen,Kam.keiser,False,-13073.234427
316,5,Enron,Qwen2.5-1.5B-Instruct,training,Kam.keiser vs Kam.keiser,Kam.keiser,Kam.keiser,True,10851.247357
317,5,Enron,Qwen2.5-1.5B-Instruct,training,Kam.keiser vs Kate.symes,Kam.keiser,Kate.symes,False,-10868.279335
318,5,Enron,Qwen2.5-1.5B-Instruct,training,Kate.symes vs Andy.zipper,Kate.symes,Andy.zipper,False,-7869.612466


In [12]:
write_jsonl(test_res_long, '/Volumes/BCross/datasets/author_verification/lambda_g_results/Enron_training_qwen2.5_1.5b_instruct_max_sim.jsonl')

In [9]:
read_jsonl('/Volumes/BCross/datasets/author_verification/lambda_g_results/Wiki_training_qwen2.5_1.5b_instruct_max_sim.jsonl')

Unnamed: 0,repetition,corpus,token_type,data_type,problem,known_author,unknown_author,target,score
0,1,Wiki,Qwen2.5-1.5B-Instruct,training,142.196.88.228 vs 142.196.88.228,142.196.88.228,142.196.88.228,True,21670.058283
1,1,Wiki,Qwen2.5-1.5B-Instruct,training,142.196.88.228 vs Aban1313,142.196.88.228,Aban1313,False,-6293.803291
2,1,Wiki,Qwen2.5-1.5B-Instruct,training,A_Man_In_Black vs A_Man_In_Black,A_Man_In_Black,A_Man_In_Black,True,5056.981924
3,1,Wiki,Qwen2.5-1.5B-Instruct,training,A_Man_In_Black vs Bankhallbretherton,A_Man_In_Black,Bankhallbretherton,False,-8213.153313
4,1,Wiki,Qwen2.5-1.5B-Instruct,training,Aban1313 vs Aban1313,Aban1313,Aban1313,True,3772.600359
...,...,...,...,...,...,...,...,...,...
745,5,Wiki,Qwen2.5-1.5B-Instruct,training,Haymaker vs HeadleyDown,Haymaker,HeadleyDown,False,-3269.552887
746,5,Wiki,Qwen2.5-1.5B-Instruct,training,HeadleyDown vs HeadleyDown,HeadleyDown,HeadleyDown,True,3895.946070
747,5,Wiki,Qwen2.5-1.5B-Instruct,training,HeadleyDown vs Hipocrite,HeadleyDown,Hipocrite,False,-9169.141508
748,5,Wiki,Qwen2.5-1.5B-Instruct,training,Hipocrite vs Hipocrite,Hipocrite,Hipocrite,True,-143.055478


In [13]:
test_df = read_jsonl('/Volumes/BCross/datasets/author_verification/lambda_g_results/Enron_training_qwen2.5_1.5b_instruct_max_sim.jsonl')

In [14]:
test_df

Unnamed: 0,repetition,corpus,token_type,data_type,problem,known_author,unknown_author,target,score
0,1,Enron,Qwen2.5-1.5B-Instruct,training,Andy.zipper vs Andy.zipper,Andy.zipper,Andy.zipper,True,5664.602018
1,1,Enron,Qwen2.5-1.5B-Instruct,training,Andy.zipper vs Barry.tycholiz,Andy.zipper,Barry.tycholiz,False,-5027.842891
2,1,Enron,Qwen2.5-1.5B-Instruct,training,Barry.tycholiz vs Barry.tycholiz,Barry.tycholiz,Barry.tycholiz,True,13795.558616
3,1,Enron,Qwen2.5-1.5B-Instruct,training,Barry.tycholiz vs Benjamin.rogers,Barry.tycholiz,Benjamin.rogers,False,6107.237502
4,1,Enron,Qwen2.5-1.5B-Instruct,training,Benjamin.rogers vs Benjamin.rogers,Benjamin.rogers,Benjamin.rogers,True,12278.282148
...,...,...,...,...,...,...,...,...,...
315,5,Enron,Qwen2.5-1.5B-Instruct,training,K.allen vs Kam.keiser,K.allen,Kam.keiser,False,-13073.234427
316,5,Enron,Qwen2.5-1.5B-Instruct,training,Kam.keiser vs Kam.keiser,Kam.keiser,Kam.keiser,True,10851.247357
317,5,Enron,Qwen2.5-1.5B-Instruct,training,Kam.keiser vs Kate.symes,Kam.keiser,Kate.symes,False,-10868.279335
318,5,Enron,Qwen2.5-1.5B-Instruct,training,Kate.symes vs Andy.zipper,Kate.symes,Andy.zipper,False,-7869.612466


In [15]:
results_agg = (
    test_df
    .groupby(['problem', 'target'], as_index=False)
    ['score']
    .mean()
)
score_col = 'score'
target_col = 'target'
performance(results_agg,
            score_col,
            target_col,
            additional_metadata={
                'corpus': corpus
            })

Unnamed: 0,corpus,Cllr,Cllr_min,EER,AUC,Balanced_Accuracy,Precision,Recall,F1,TP,FP,FN,TN,Mean_TRUE_LLR,Mean_FALSE_LLR,TRUE_trials,FALSE_trials
0,Enron,0.527494,0.527494,0.09375,0.914062,0.875,0.928571,0.8125,0.866667,26,2,6,30,1.404785,-0.789995,32,32


## Test Embedding

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

def embed_text_column(
    df: pd.DataFrame,
    text_column: str = 'text',
    embedding_column: str = 'embedding',
    model_name: str = 'all-MiniLM-L6-v2',
    batch_size: int = 32,
    show_progress_bar: bool = True
) -> pd.DataFrame:
    """
    Embed a text column of a DataFrame using a Hugging Face embedding model.
    
    Args:
        df (pd.DataFrame): Input DataFrame containing the text column.
        text_column (str): Name of the text column to embed.
        embedding_column (str): Name of the new column to store embeddings.
        model_name (str): Hugging Face model name (e.g., 'all-MiniLM-L6-v2').
        batch_size (int): Batch size for embedding.
        show_progress_bar (bool): Whether to show a progress bar during embedding.
        
    Returns:
        pd.DataFrame: Original DataFrame with an added column of embeddings.
    """
    # Validate input
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in DataFrame.")

    # Work on a copy to avoid SettingWithCopyWarning
    df_copy = df.copy()

    # Initialize the embedding model
    model = SentenceTransformer(model_name)

    # Convert text values to string and collect for batching
    texts = df_copy[text_column].astype(str).tolist()

    # Compute embeddings
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar
    )

    # Assign embeddings as list of floats to avoid numpy 2D assignment issues
    df_copy[embedding_column] = embeddings.tolist()

    return df_copy

In [None]:
known_embedded = embed_text_column(known)
known_filtered_embedded = embed_text_column(known_filtered)
unknown_embedded = embed_text_column(unknown)

In [None]:
known_filtered_embedded.head(2)

In [None]:
def cosine_similarity(
    vec1: np.ndarray,
    vec2: np.ndarray
) -> float:
    """
    Compute the cosine similarity between two vectors.

    Args:
        vec1 (np.ndarray): First vector.
        vec2 (np.ndarray): Second vector.

    Returns:
        float: Cosine similarity score between -1 and 1.
    """
    # Convert inputs to numpy arrays
    v1 = np.array(vec1, dtype=float)
    v2 = np.array(vec2, dtype=float)

    # Compute dot product and norms
    dot_prod = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)

    if norm1 == 0 or norm2 == 0:
        raise ValueError("One or both vectors have zero magnitude, cosine similarity is undefined.")

    return dot_prod / (norm1 * norm2)

In [None]:
cosine_similarity(known_filtered_embedded.iloc[0, 17], known_filtered_embedded.iloc[1, 17])

In [None]:
def get_top_n_closest(
    df: pd.DataFrame,
    query_vec: np.ndarray,
    embedding_column: str = 'embedding',
    top_n: int = 5
) -> pd.DataFrame:
    """
    Retrieve the top N rows from a DataFrame whose embeddings are closest to a query vector.

    Args:
        df (pd.DataFrame): DataFrame containing an embedding column.
        query_vec (np.ndarray or list): The query embedding vector.
        embedding_column (str): Name of the column with embeddings.
        top_n (int): Number of top similar rows to return.

    Returns:
        pd.DataFrame: Subset of the original DataFrame sorted by descending similarity,
                      with an additional 'similarity' column.
    """
    # Validate input
    if embedding_column not in df.columns:
        raise ValueError(f"Embedding column '{embedding_column}' not found in DataFrame.")

    # Convert query vector to numpy array
    qv = np.array(query_vec, dtype=float)
    if np.linalg.norm(qv) == 0:
        raise ValueError("Query vector has zero magnitude, cosine similarity is undefined.")

    # Compute similarities
    sims = []
    for emb in df[embedding_column]:
        sims.append(cosine_similarity(qv, np.array(emb, dtype=float)))

    # Create a copy with similarity scores
    df_with_sim = df.copy()
    df_with_sim['similarity'] = sims

    # Sort and select top N
    top_df = df_with_sim.sort_values(by='similarity', ascending=False).head(top_n)

    return top_df.reset_index(drop=True)

In [None]:
known_embedded_filtered = known_embedded[known_embedded['doc_id'] != '142_196_88_228_text_1']

In [None]:
get_top_n_closest(known_embedded_filtered, known_filtered_embedded.iloc[0,17], top_n=30)

In [None]:
    import random

    known_tokens = ["Sentence A.", "Sentence B.", "Sentence C."]
    known_embeddings = [
        np.random.rand(5) for _ in known_tokens  # replace with real embeddings
    ]

    # Reference DataFrame with tokens and precomputed embeddings
    ref_tokens = [f"Ref sentence {i}" for i in range(1, 11)]
    ref_embeddings = [
        np.random.rand(5) for _ in ref_tokens  # replace with real embeddings
    ]
    refs_df = pd.DataFrame({
        'tokens': ref_tokens,
        'embedding': ref_embeddings
    })

    # Parameters
    r = 4  # number of nearest neighbors and samples

    # Step 1: Build neighbor lists for each known sentence
    neighbor_lists = []
    for emb in known_embeddings:
        top_df = get_top_n_closest(refs_df, emb, embedding_column='embedding', top_n=r)
        neighbor_lists.append(top_df['tokens'].tolist())
    print("Neighbor lists per known sentence:")
    for i, nbrs in enumerate(neighbor_lists, 1):
        print(f" Known sentence {i}: {nbrs}")

    # Step 2: Generate r samples ensuring no duplicates within each sample
    max_attempts = 100
    sample_sets = None
    for attempt in range(max_attempts):
        # Shuffle each neighbor list independently
        shuffled = [random.sample(nbrs, len(nbrs)) for nbrs in neighbor_lists]
        # Transpose to get samples[j][i]
        samples = list(zip(*shuffled))[:r]
        # Check for duplicates within each sample
        if all(len(set(sample)) == len(sample) for sample in samples):
            sample_sets = [list(sample) for sample in samples]
            break

    if sample_sets is None:
        print("Error: Could not create non-overlapping samples")
    else:
        print("Generated sample sets:")
        for idx, s in enumerate(sample_sets, 1):
            print(f" Sample {idx}: {s}")