In [None]:
import os
import sys
import torch
import time

import pandas as pd
import numpy as np
import torch.nn.functional as F

from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Make the project's helper modules importable. The relative path is resolved
# against the kernel's current working directory, so this presumably expects the
# notebook to be launched from its own folder (a sibling of `src`) — TODO confirm.
sys.path.append(os.path.abspath('../src'))

from read_and_write_docs import read_jsonl, write_jsonl
from utils import apply_temp_doc_id

In [None]:
# os.cpu_count() returns None when the core count cannot be determined.
max_threads = os.cpu_count() or 1
# Leave two cores free for the rest of the machine, but never go below one thread
# (a 1- or 2-core machine would otherwise yield 0 or a negative number).
num_threads = max(1, max_threads - 2)
print(f"Maximum threads available: {max_threads} - Using {num_threads}")

os.environ["OMP_NUM_THREADS"] = str(num_threads)
# torch has already been imported at this point, so the environment variable may
# be read too late to take effect; set PyTorch's intra-op thread count directly too.
torch.set_num_threads(num_threads)

In [None]:
def list_subdir_names(dir_path):
    """
    Return the names of the immediate sub-directories of `dir_path`.

    Regular files are ignored. Names are returned in the order the file system
    yields them, and only the final path component is returned (not the full path).
    """
    subdir_names = []
    for entry in Path(dir_path).iterdir():
        if entry.is_dir():
            subdir_names.append(entry.name)
    return subdir_names

In [None]:
# Root folder of the author-verification datasets.
# NOTE(review): looks like a network-share path — the notebook presumably only
# runs where this share is reachable; confirm before running elsewhere.
base_loc = "//bc_nas_storage/BCross/datasets/author_verification"

# Dataset splits; each name is used as a sub-folder of base_loc.
data_type = ["training", "test"]

# Discover the corpus folders that exist under the first split ...
directories = list_subdir_names(f"{base_loc}/{data_type[0]}")
# NOTE(review): ... but this assignment immediately replaces the discovered list,
# so only these two corpora are used and the listing above has no effect on the result.
directories = ['Enron', 'Wiki']

In [None]:
# Load model and tokenizer once
# `tokenizer` and `model` are module-level objects read by the scoring functions
# defined below; `model_name` is also used to build the output file names.
model_name = "Qwen2.5-1.5B-Instruct"
# NOTE(review): machine-specific absolute path — the notebook only runs where
# this folder exists; consider making it configurable.
model_loc = f"C:/Users/benjc/Documents/local models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_loc)
model = AutoModelForCausalLM.from_pretrained(model_loc)
# Put the model in evaluation mode; it is only used for scoring here, not training.
model.eval()

In [None]:
def compute_log_probs(text: str):
    """
    Compute per-token log-probabilities of a text under the causal language model.

    Uses the module-level `tokenizer` and `model`.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    list of float
        One log-probability per token, excluding the first token (it has no
        preceding context to condition on). Entry k is the log-probability of
        token k + 1 given tokens 0..k. The list is empty when the text
        tokenizes to fewer than two tokens. Sum the list to obtain the total
        log-probability of the text.
    """
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    # Nothing can be scored without at least one context token and one target token.
    if input_ids.size(1) < 2:
        return []

    with torch.no_grad():
        logits = model(input_ids).logits  # [batch_size=1, seq_len, vocab_size]

    # logits[0, k] is the next-token distribution after token k, so it scores
    # token k + 1; the last position predicts a token that is not in the text.
    # .float() is a no-op for float32 weights and avoids reduced-precision softmax otherwise.
    log_dist = F.log_softmax(logits[0, :-1].float(), dim=-1)
    targets = input_ids[0, 1:].unsqueeze(-1)

    # Pick, for every position, the log-probability of the token that actually followed.
    return log_dist.gather(-1, targets).squeeze(-1).tolist()

In [None]:
def compute_log_probs_with_median(text: str):
    """
    Score every token of a text except the first under the causal language model.

    Uses the module-level `tokenizer` and `model`. The first token is skipped
    because there is no preceding context to condition its probability on.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    tokens : list of str
        Tokenizer tokens for positions 1..n-1, one per scored position.
    log_probs : list of float
        Log-probability of each of those tokens given all tokens before it.
    median_logprobs : list of float
        For each scored position, the median over the vocabulary of the
        predicted log-probabilities (a per-position baseline).

    All three lists have the same length; they are empty when the text
    tokenizes to fewer than two tokens.
    """
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
    token_ids = input_ids[0].tolist()

    # Nothing can be scored without at least one context token and one target token.
    if len(token_ids) < 2:
        return [], [], []

    # Map ids to tokens one-to-one. Decoding the whole sequence and splitting on
    # whitespace yields words, whose count does not match the number of tokens.
    tokens = tokenizer.convert_ids_to_tokens(token_ids)[1:]

    with torch.no_grad():
        logits = model(input_ids).logits  # [batch_size=1, seq_len, vocab_size]

    # logits[0, k] is the next-token distribution after token k, so it scores
    # token k + 1; the last position predicts a token that is not in the text.
    # .float() is a no-op for float32 weights and avoids reduced-precision softmax otherwise.
    log_dist = torch.log_softmax(logits[0, :-1].float(), dim=-1)
    targets = input_ids[0, 1:].unsqueeze(-1)

    log_probs = log_dist.gather(-1, targets).squeeze(-1).tolist()
    median_logprobs = log_dist.median(dim=-1).values.tolist()

    return tokens, log_probs, median_logprobs

In [None]:
def score_dataframe(df: pd.DataFrame, text_column: str = "text") -> pd.DataFrame:
    """
    Score every text in a dataframe with `compute_log_probs_with_median`.

    The input frame is not modified; a copy with these extra columns is returned:

    - tokens: list of scored tokens
    - log_probs: list of log-probs per token
    - med_log_prob: list of median log-probs per token
    - differences: per-token log_prob minus median log-prob
    - abs_differences: absolute value of each difference
    - num_tokens: number of scored tokens
    - sum_log_prob: sum of log-probs
    - avg_log_prob: sum_log_prob / num_tokens
    - mean_diff: mean of differences
    - mean_abs_diff: mean of absolute differences

    Rows with no scored tokens get NaN for the three averages. An empty input
    frame returns an empty frame that still has all of the columns above.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the texts to score.
    text_column : str
        Name of the column containing the texts.
    """
    df = df.copy()

    # Step 1: score each text once. Iterating directly (rather than apply
    # returning a Series per row) is cheaper and also works for an empty frame.
    scored = [
        compute_log_probs_with_median(text)
        for text in tqdm(df[text_column].tolist(), desc="Scoring texts")
    ]
    token_lists = [tokens for tokens, _, _ in scored]
    log_prob_lists = [lps for _, lps, _ in scored]
    median_lists = [meds for _, _, meds in scored]

    # Step 2: per-token differences between chosen-token and median log-probs.
    diff_lists = [
        [lp - mlp for lp, mlp in zip(lps, meds)]
        for lps, meds in zip(log_prob_lists, median_lists)
    ]
    abs_diff_lists = [[abs(d) for d in diffs] for diffs in diff_lists]

    def _list_column(values):
        # dtype=object keeps each Python list as a single cell; reusing df.index
        # keeps the new column aligned with the rows it was computed from.
        return pd.Series(values, index=df.index, dtype=object)

    def _float_column(values):
        return pd.Series(values, index=df.index, dtype="float64")

    df["tokens"] = _list_column(token_lists)
    df["log_probs"] = _list_column(log_prob_lists)
    df["med_log_prob"] = _list_column(median_lists)
    df["differences"] = _list_column(diff_lists)
    df["abs_differences"] = _list_column(abs_diff_lists)

    # Summary stats
    df["num_tokens"] = pd.Series(
        [len(lps) for lps in log_prob_lists], index=df.index, dtype="int64"
    )
    df["sum_log_prob"] = _float_column([float(sum(lps)) for lps in log_prob_lists])
    # 0 / 0 yields NaN here, which marks rows that had nothing to score.
    df["avg_log_prob"] = df["sum_log_prob"] / df["num_tokens"]

    # np.mean([]) warns and returns NaN; make the NaN explicit instead.
    df["mean_diff"] = _float_column(
        [np.mean(d) if d else np.nan for d in diff_lists]
    )
    df["mean_abs_diff"] = _float_column(
        [np.mean(d) if d else np.nan for d in abs_diff_lists]
    )

    return df

In [None]:
# test_df = read_jsonl(f"{base_loc}/training/Enron/known_raw.jsonl")
# test_df = test_df.head(5)
# result = score_dataframe(test_df)
# result.head()

In [None]:
def compute_llr(known_lp: float, unknown_lp: float, impostor_lps: list) -> tuple:
    """
    Log-likelihood ratios of the known and unknown scores against an impostor background.

    The background is the arithmetic mean of the impostor log-probabilities.

    Parameters
    ----------
    known_lp : float
        Log-probability score of the known-author material.
    unknown_lp : float
        Log-probability score of the questioned material.
    impostor_lps : list of float
        Log-probability scores of the impostor documents (any iterable works).

    Returns
    -------
    tuple of (float, float)
        `(known_lp - impostor_mean, unknown_lp - impostor_mean)`.

    Raises
    ------
    ValueError
        If `impostor_lps` is empty, since no background mean can be formed.
    """
    # Materialise once so generators and other one-shot iterables are supported.
    impostor_lps = list(impostor_lps)
    if not impostor_lps:
        raise ValueError("impostor_lps must contain at least one log-probability")

    impostor_mean = sum(impostor_lps) / len(impostor_lps)
    return known_lp - impostor_mean, unknown_lp - impostor_mean

In [None]:
def run_verification(df_known, df_unknown, df_impostors, col: str = "text",
                     score_col: str = "sum_log_prob") -> dict:
    """
    End-to-end LLR test using three DataFrames.

    Each frame is scored with `score_dataframe`; the known and unknown scores
    are averaged over their documents and compared with the impostor
    background through `compute_llr`.

    Parameters
    ----------
    df_known, df_unknown, df_impostors : pd.DataFrame
        Frames holding the known-author, questioned and impostor texts.
    col : str
        Name of the text column in all three frames.
    score_col : str
        Per-document score column produced by `score_dataframe` to compare.
        Defaults to "sum_log_prob" (total log-probability of the document);
        pass "avg_log_prob" for a length-normalised comparison.

    Returns
    -------
    dict
        Keys: known_logprob, unknown_logprob, impostor_avg_logprob,
        LLR_known, LLR_unknown, LLR_difference.

    Raises
    ------
    ValueError
        If `df_impostors` has no rows, since no background can be formed.
    """
    # Fail before the expensive scoring step rather than after it.
    if len(df_impostors) == 0:
        raise ValueError("df_impostors must contain at least one document")

    scored_known = score_dataframe(df_known, col)
    scored_unknown = score_dataframe(df_unknown, col)
    scored_impostors = score_dataframe(df_impostors, col)

    # score_dataframe has no "log_prob" column; read the configured score column.
    lp_known = scored_known[score_col].mean()
    lp_unknown = scored_unknown[score_col].mean()
    lp_impostors = scored_impostors[score_col].tolist()

    llr_known, llr_unknown = compute_llr(lp_known, lp_unknown, lp_impostors)

    return {
        "known_logprob": lp_known,
        "unknown_logprob": lp_unknown,
        "impostor_avg_logprob": sum(lp_impostors) / len(lp_impostors),
        "LLR_known": llr_known,
        "LLR_unknown": llr_unknown,
        "LLR_difference": llr_known - llr_unknown
    }

In [None]:
# Score the known/unknown documents of every corpus and split, writing one
# JSONL file of per-token log-probabilities next to each raw input file.

# Built once, outside the f-string: re-using double quotes inside an f-string
# expression is a SyntaxError on Python versions before 3.12.
model_tag = model_name.lower().replace("-", "_")

for corpus in directories:
    for dtype in data_type:
        for author in ['known', 'unknown']:

            corpus_dir = f"{base_loc}/{dtype}/{corpus}"
            data_loc = f"{corpus_dir}/{author}_raw.jsonl"
            save_loc = f"{corpus_dir}/{author}_logprobs_{model_tag}.jsonl"

            # An existing output file marks this combination as done, so the
            # loop can be re-run after an interruption without re-scoring.
            if os.path.exists(save_loc):
                print(f"Skipping {dtype} – {corpus} – {author} (already exists)")
                continue

            # A missing input should not abort the remaining corpora.
            if not os.path.exists(data_loc):
                print(f"Skipping {dtype} - {corpus} - {author} (input not found: {data_loc})")
                continue

            print(f"Processing Corpus - {dtype} - {corpus} - {author}")

            t0 = time.time()

            df = read_jsonl(data_loc)
            df = apply_temp_doc_id(df)

            num_docs = df.shape[0]
            print(f"    Number of {author} docs - {num_docs}")

            df_scored = score_dataframe(df)
            write_jsonl(df_scored, save_loc)

            elapsed = time.time() - t0
            # Guard the per-document timing against an empty input file.
            time_per_doc = elapsed / num_docs if num_docs else float("nan")

            print(f"    Completed Corpus - {dtype} - {corpus} - {author} - Time per Doc: {time_per_doc}")